misc: Merge v24.0 release staging branch to stable (#1274)

This merge officially marks the release of gem5 v24.0.
This commit is contained in:
Bobby R. Bruce
2024-06-27 23:22:40 -07:00
committed by GitHub
624 changed files with 78553 additions and 156412 deletions

View File

@@ -0,0 +1,37 @@
{
"name": "gem5 Development Container",
"image": "ghcr.io/gem5/devcontainer:latest",
"hostRequirements": {
"cpus": 8,
"memory": "16gb",
"storage": "32gb"
},
"customizations": {
"vscode": {
"extensions": [
"eamodio.gitlens",
"GitHub.copilot",
"GitHub.copilot-chat",
"GitHub.vscode-pull-request-github",
"ms-python.debugpy",
"ms-python.isort",
"ms-python.python",
"ms-python.vscode-pylance",
"ms-vscode.cpptools",
"ms-vscode.cpptools-extension-pack",
"ms-vscode.cpptools-themes",
"ms-vscode.makefile-tools",
"ms-vscode-remote.remote-containers",
"Tsinghua-Hexin-Joint-Institute.gem5-slicc",
"VisualStudioExptTeam.vscodeintellicode"
]
}
},
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/devcontainers-contrib/features/actionlint:1": {},
"ghcr.io/devcontainers-contrib/features/vscode-cli:1": {}
},
"onCreateCommand": "./.devcontainer/on-create.sh"
}

38
.devcontainer/on-create.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/bin/bash
# Copyright (c) 2024 The Regents of the University of California
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This script is run when the Docker container specified in devcontainer.json
# is created (see the "onCreateCommand" entry in devcontainer.json).

# Abort immediately if any command fails.
set -e

# Refresh the git index. Note: `git update-index` without `--refresh` (and
# without explicit paths) is a no-op; the flag is required to re-stat the
# working tree and clear stale cached stat information after the checkout
# is bind-mounted/copied into the container.
git update-index --refresh

# Install the pre-commit checks.
./util/pre-commit-install.sh

1
.gitignore vendored
View File

@@ -34,3 +34,4 @@ configs/dram/lowp_sweep.cfg
.pyenv
.vscode
typings
.DS_Store

132
.mailmap
View File

@@ -1,8 +1,11 @@
Abdul Mutaal Ahmad <abdul.mutaal@gmail.com>
adarshpatil <adarshpatil123@gmail.com>
Aditya K Kamath <a_kamath@hotmail.com> aditya <a_kamath@hotmail.com>
Adrià Armejach <adria.armejach@bsc.es> Adrià Armejach <adria.armejach@gmail.com>
Adrià Armejach <adria.armejach@bsc.es> Adrià Armejach <66964292+aarmejach@users.noreply.github.com>
Adrian Herrera <adrian.herrera@arm.com>
Adrien Pesle <adrien.pesle@arm.com>
Adwaith R Krishna <adwaithrk19@gmail.com>
Akash Bagdia <akash.bagdia@ARM.com> Akash Bagdia <akash.bagdia@arm.com>
Alec Roelke <alec.roelke@gmail.com> Alec Roelke <ar4jc@virginia.edu>
Alexander Klimov <Alexander.Klimov@arm.com>
@@ -10,21 +13,19 @@ Alexandru Dutu <alexandru.dutu@amd.com> Alexandru <alexandru.dutu@amd.com>
Alex Richardson <alexrichardson@google.com>
Ali Jafri <ali.jafri@arm.com>
Ali Saidi <Ali.Saidi@arm.com> Ali Saidi <ali.saidi@arm.com>
Ali Saidi <Ali.Saidi@arm.com> Ali Saidi <Ali.Saidi@ARM.com>
Ali Saidi <Ali.Saidi@arm.com> Ali Saidi <saidi@eecs.umich.edu>
Alistair Delva <adelva@google.com>
Alvaro Moreno <alvaro.moreno@bsc.es>
Amin Farmahini <aminfar@gmail.com>
Anders Handler <s052838@student.dtu.dk>
Andrea Mondelli <andrea.mondelli@huawei.com> Andrea Mondelli <andrea.mondelli@ucf.edu>
Andrea Mondelli <andrea.mondelli@huawei.com> Andrea Mondelli <Andrea.Mondelli@ucf.edu>
Andrea Pellegrini <andrea.pellegrini@gmail.com>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <andreas.hansson>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <andreas.hansson@arm.com>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <Andreas.Hansson@ARM.com>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <andreas.hansson@armm.com>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <andreas.sandberg@arm.com>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <Andreas.Sandberg@ARM.com>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <andreas@sandberg.pp.se>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <andreas@sandberg.uk>
Andrew Bardsley <Andrew.Bardsley@arm.com> Andrew Bardsley <Andreas.Bardsley@arm.com>
Andrew Lukefahr <lukefahr@umich.edu>
Andrew Schultz <alschult@umich.edu>
@@ -32,11 +33,14 @@ Andriani Mappoura <andriani.mappoura@arm.com>
Angie Lee <peiyinglee@google.com>
Anis Peysieux <anis.peysieux@inria.fr>
Ani Udipi <ani.udipi@arm.com>
anoop <mysanoop@gmail.com>
Anouk Van Laer <anouk.vanlaer@arm.com>
ARM gem5 Developers <none@none>
Arthur Perais <Arthur.Perais@univ-grenoble-alpes.fr> Arthur Perais <arthur.perais@inria.fr>
Arun Rodrigues <afrodri@gmail.com>
Ashkan Tousi <ashkan.tousimojarad@arm.com>
atrah22 <atul.rahman@outlook.com>
Atri Bhattacharyya <atri.bhattacharyya@epfl.ch>
Austin Harris <austinharris@utexas.edu> Austin Harris <mail@austin-harris.com>
Avishai Tvila <avishai.tvila@gmail.com>
Ayaz Akram <yazakram@ucdavis.edu>
@@ -48,6 +52,7 @@ Bjoern A. Zeeb <baz21@cam.ac.uk>
Blake Hechtman <bah13@duke.edu> Blake Hechtman <blake.hechtman@amd.com>
Blake Hechtman <bah13@duke.edu> Blake Hechtman ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <bah13@duke.edu>
Bobby R. Bruce <bbruce@ucdavis.edu> Bobby Bruce <bbruce@amarillo.cs.ucdavis.edu>
Bobby R. Bruce <bbruce@ucdavis.edu> Bobby Bruce <bbruce@ucdavis.edu>
Boris Shingarov <shingarov@gmail.com> Boris Shingarov <shingarov@labware.com>
Brad Beckmann <brad.beckmann@amd.com> Brad Beckmann <Brad.Beckmann@amd.com>
Brad Beckmann <brad.beckmann@amd.com> Brad Beckmann ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <Brad.Beckmann@amd.com>
@@ -60,15 +65,13 @@ Brian Grayson <b.grayson@samsung.com>
Cagdas Dirik <cdirik@micron.com> cdirik <cdirik@micron.com>
Carlos Falquez <c.falquez@fz-juelich.de>
Chander Sudanthi <chander.sudanthi@arm.com> Chander Sudanthi <Chander.Sudanthi@arm.com>
Chander Sudanthi <chander.sudanthi@arm.com> Chander Sudanthi <Chander.Sudanthi@ARM.com>
Charles Jamieson <cjamieson2@wisc.edu>
CHEN Meng <tundriolaxy@gmail.com>
Chen Meng <tundriolaxy@gmail.com>
Chen Zou <chenzou@uchicago.edu>
Chia-You Chen <hortune@google.com>
Chow, Marcus <marcus.chow@amd.com>
Marcus Chow <marcus.chow@amd.com>
Chris Adeniyi-Jones <Chris.Adeniyi-Jones@arm.com>
Chris Emmons <chris.emmons@arm.com> Chris Emmons <Chris.Emmons@arm.com>
Chris Emmons <chris.emmons@arm.com> Chris Emmons <Chris.Emmons@ARM.com>
Chris January <chris.january@arm.com>
Christian Menard <christian.menard@tu-dresden.de> Christian Menard <Christian.Menard@tu-dresden.de>
Christopher Torng <clt67@cornell.edu>
@@ -83,17 +86,19 @@ Daecheol You <daecheol.you@samsung.com>
Dam Sunwoo <dam.sunwoo@arm.com>
Dan Gibson <gibson@cs.wisc.edu>
Daniel Carvalho <odanrc@yahoo.com.br> Daniel <odanrc@yahoo.com.br>
Daniel Carvalho <odanrc@yahoo.com.br> Daniel Carvalho <odanrc@users.noreply.github.com>
Daniel Carvalho <odanrc@yahoo.com.br> Daniel R. Carvalho <odanrc@yahoo.com.br>
Daniel Gerzhoy <daniel.gerzhoy@gmail.com>
Daniel Johnson <daniel.johnson@arm.com>
Daniel Kouchekinia <DanKouch@users.noreply.github.com>
Daniel Sanchez <sanchezd@stanford.edu>
Davide Basilio Bartolini <davide.basilio.bartolini@huawei.com>
David Guillen-Fandos <david.guillen@arm.com> David Guillen <david.guillen@arm.com>
David Guillen-Fandos <david.guillen@arm.com> David Guillen Fandos <david.guillen@arm.com>
David Hashe <david.hashe@amd.com> David Hashe <david.j.hashe@gmail.com>
David Oehmke <doehmke@umich.edu>
David Schall <david.schall2@arm.com>
Derek Christ <dchrist@rhrk.uni-kl.de>
David Schall <david.schall@ed.ac.uk> David Schall <david.schall2@arm.com>
Derek Christ <dchrist@rhrk.uni-kl.de> Derek Christ <44267643+derchr@users.noreply.github.com>
Derek Hower <drh5@cs.wisc.edu>
Deyaun Guo <guodeyuan@tsinghua.org.cn> Deyuan Guo ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <guodeyuan@tsinghua.org.cn>
Deyaun Guo <guodeyuan@tsinghua.org.cn> Deyuan Guo <guodeyuan@tsinghua.org.cn>
@@ -107,11 +112,12 @@ Earl Ou <shunhsingou@google.com>
eavivi <eavivi@ucdavis.edu>
Éder F. Zulian <zulian@eit.uni-kl.de>
Edmund Grimley Evans <Edmund.Grimley-Evans@arm.com>
Eduardo José Gómez Hernández <eduardojose.gomez@um.es>
Eduardo José Gómez Hernández <eduardojose.gomez@um.es> Eduardo José Gómez Hernández <git@edujgh.net>
Eliot Moss <moss@cs.umass.edu>
Emilio Castillo <castilloe@unican.es> Emilio Castillo <ecastill@bsc.es>
Emilio Castillo <castilloe@unican.es> Emilio Castillo ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <castilloe@unican.es>
Emily Brickey <esbrickey@ucdavis.edu>
Emin Gadzhiev <e.gadzhiev.mhk@gmail.com>
Erfan Azarkhish <erfan.azarkhish@unibo.it>
Erhu <fengerhu.ipads@gmail.com>
Eric Van Hensbergen <eric.vanhensbergen@arm.com> Eric Van Hensbergen <Eric.VanHensbergen@ARM.com>
@@ -125,11 +131,12 @@ Gabe Black <gabe.black@gmail.com> Gabe Black <gabeblack@google.com>
Gabe Black <gabe.black@gmail.com> Gabe Black <gblack@eecs.umich.edu>
Gabe Loh <gabriel.loh@amd.com> gloh <none@none>
Gabor Dozsa <gabor.dozsa@arm.com>
Gabriel Busnot <gabriel.busnot@arteris.com>
Gabriel Busnot <gabriel.busnot@arteris.com> Gabriel Busnot <gabriel.busnot@cea.fr>
Gabriel Busnot <gabriel.busnot@arteris.com> Gabriel Busnot <gabibusnot@gmail.com>
gauravjain14 <gjain6@wisc.edu>
Gautham Pathak <gspathak@gitlab.uwaterloo.ca>
Gedare Bloom <gedare@rtems.org> Gedare Bloom <gedare@gwmail.gwu.edu>
Gene Wu <gene.wu@arm.com> Gene WU <gene.wu@arm.com>
Gene WU <gene.wu@arm.com> Gene Wu <Gene.Wu@arm.com>
Geoffrey Blake <geoffrey.blake@arm.com> Geoffrey Blake <blakeg@umich.edu>
Geoffrey Blake <geoffrey.blake@arm.com> Geoffrey Blake <Geoffrey.Blake@arm.com>
Georg Kotheimer <georg.kotheimer@mailbox.tu-dresden.de>
@@ -140,10 +147,14 @@ GWDx <gwdx@mail.ustc.edu.cn>
Hamid Reza Khaleghzadeh <khaleghzadeh@gmail.com> Hamid Reza Khaleghzadeh ext:(%2C%20Lluc%20Alvarez%20%3Clluc.alvarez%40bsc.es%3E%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <khaleghzadeh@gmail.com>
handsomeliu <handsomeliu@google.com>
Hanhwi Jang <jang.hanhwi@gmail.com>
Hoa Nguyen <hoanguyen@ucdavis.edu>
Harshil Patel <hpppatel@ucdavis.edu> Harshil Patel <harshilp2107@gmail.com>
Harshil Patel <hpppatel@ucdavis.edu> Harshil Patel <91860903+Harshil2107@users.noreply.github.com>
Wenjian He <wheac@connect.ust.hk>
HJikram <humzajahangirikram@gmail.com>
Hoa Nguyen <hn@hnpl.org> Hoa Nguyen <hoanguyen@ucdavis.edu>
Hongil Yoon <ongal@cs.wisc.edu>
Hsuan Hsu <hsuan.hsu@mediatek.com>
huangjs <jiasen.hjs@alibaba-inc.com>
hungweihsu <hungweihsu@google.com> hungweihsuG <145444687+hungweihsuG@users.noreply.github.com>
Hussein Elnawawy <hussein.elnawawy@gmail.com>
Ian Jiang <ianjiang.ict@gmail.com>
IanJiangICT <ianjiang.ict@gmail.com>
@@ -152,9 +163,13 @@ Iru Cai <mytbk920423@gmail.com>
Isaac Richter <isaac.richter@rochester.edu>
Isaac Sánchez Barrera <isaac.sanchez@bsc.es>
Ivan Pizarro <ivan.pizarro@metempsy.com>
Jack Whitham <jack-m5ml2@cs.york.ac.uk> Jack Whitman <jack-m5ml2@cs.york.ac.uk>
Ivan Turasov <turasov.ivan@gmail.com>
Ivana Mitrovic <imitrovic@ucdavis.edu> Ivana Mitrovic <ivanamit91@gmail.com>
Ivana Mitrovic <imitrovic@ucdavis.edu> ivanaamit <ivanamit91@gmail.com>
Jack Whitham <jack-m5ml2@cs.york.ac.uk>
Jairo Balart <jairo.balart@metempsy.com>
Jakub Jermar <jakub@jermar.eu>
James Braun <jebraun3@wisc.edu>
James Clarkson <james.clarkson@arm.com>
Jan-Peter Larsson <jan-peter.larsson@arm.com>
Jan Vrany <jan.vrany@labware.com>
@@ -174,8 +189,8 @@ Jayneel Gandhi <jayneel@cs.wisc.edu>
Jennifer Treichler <jtreichl@umich.edu>
Jerin Joy <joy@rivosinc.com>
Jiajie Chen <c@jia.je>
Jiasen Huang <jiasen.hjs@alibaba-inc.com>
Jiasen <jiasen.hjs@alibaba-inc.com>
Jiasen Huang <jiasen.hjs@alibaba-inc.com> Jiasen <jiasen.hjs@alibaba-inc.com>
Jiasen Huang <jiasen.hjs@alibaba-inc.com> huangjs <jiasen.hjs@alibaba-inc.com>
Jiayi Huang <jyhuang91@gmail.com>
jiegec <noc@jiegec.ac.cn>
Jieming Yin <jieming.yin@amd.com> jiemingyin <bjm419@gmail.com>
@@ -188,14 +203,17 @@ Joel Hestness <jthestness@gmail.com> Joel Hestness <hestness@cs.wisc.edu>
Joël Porquet-Lupine <joel@porquet.org>
John Alsop <johnathan.alsop@amd.com>
John Kalamatianos <john.kalamatianos@amd.com> jkalamat <john.kalamatianos@amd.com>
Johnny <johnnyko@google.com>
Jordi Vaquero <jordi.vaquero@metempsy.com>
Jose Marinho <jose.marinho@arm.com>
Juan M. Cebrian <jm.cebriangonzalez@gmail.com>
Jui-min Lee <fcrh@google.com>
kai.ren <kai.ren@streamcomputing.com> Kai Ren <binarystar2006@outlook.com>
Kai Ren <kai.ren@streamcomputing.com> kai.ren <kai.ren@streamcomputing.com>
Kai Ren <kai.ren@streamcomputing.com> Kai Ren <binarystar2006@outlook.com>
KaiBatley <68886332+KaiBatley@users.noreply.github.com>
Kanishk Sugand <kanishk.sugand@arm.com>
Karthik Sangaiah <karthik.sangaiah@arm.com>
Kaustav Goswami <kggoswami@ucdavis.edu>
Kaustav Goswami <kggoswami@ucdavis.edu> Kaustav Goswami <39310478+kaustav-goswami@users.noreply.github.com>
Kelly Nguyen <klynguyen@ucdavis.edu>
Ke Meng <mengke97@hotmail.com>
Kevin Brodsky <kevin.brodsky@arm.com>
@@ -206,11 +224,16 @@ Koan-Sin Tan <koansin.tan@gmail.com>
Korey Sewell <ksewell@umich.edu>
Krishnendra Nathella <Krishnendra.Nathella@arm.com> Krishnendra Nathella <krinat01@arm.com>
ksco <numbksco@gmail.com>
kunpai <kunpai@ucdavis.edu>
Kunal Pai <kunpai@ucdavis.edu> Kunal Pai <62979320+kunpai@users.noreply.github.com>
Kunal Pai <kunpai@ucdavis.edu> kunpai <kunpai@ucdavis.edu>
Kunal Pai <kunpai@ucdavis.edu> paikunal <kunpai@ucdavis.edu>
Kunal Pai <kunpai@ucdavis.edu> KUNAL PAI <kunpai@ucdavis.edu>
Kyle Roarty <kyleroarty1716@gmail.com> Kyle Roarty <Kyle.Roarty@amd.com>
Laura Hinman <llhinman@ucdavis.edu>
Lena Olson <leolson@google.com> Lena Olson <lena@cs.wisc,edu>
Lena Olson <leolson@google.com> Lena Olson <lena@cs.wisc.edu>
Leo Redivo <lredivo@ucdavis.edu> leoredivo <94771718+leoredivo@users.noreply.github.com>
Lingkang <karlzhu12@gmail.com>
Lisa Hsu <Lisa.Hsu@amd.com> Lisa Hsu <hsul@eecs.umich.edu>
Lluc Alvarez <lluc.alvarez@bsc.es>
Lluís Vilanova <vilanova@ac.upc.edu> Lluis Vilanova <vilanova@ac.upc.edu>
@@ -221,9 +244,11 @@ Mahyar Samani <msamani@ucdavis.edu>
Majid Jalili <majid0jalili@gmail.com>
Malek Musleh <malek.musleh@gmail.com> Nilay Vaish ext:(%2C%20Malek%20Musleh%20%3Cmalek.musleh%40gmail.com%3E) <nilay@cs.wisc.edu>
Marc Mari Barcelo <marc.maribarcelo@arm.com>
Marco Balboni <Marco.Balboni@ARM.com>
Marco Elver <Marco.Elver@ARM.com> Marco Elver <marco.elver@ed.ac.uk>
Marc Orr <marc.orr@gmail.com> Marc Orr <morr@cs.wisc.edu>
Marco Balboni <Marco.Balboni@ARM.com>
Marco Chen <mc@soc.pub>
Marco Elver <Marco.Elver@ARM.com> Marco Elver <marco.elver@ed.ac.uk>
Marco Kurzynski <marcokurzynski@icloud.com>
Marjan Fariborz <mfariborz@ucdavis.edu> marjanfariborz <mfariborz@ucdavis.edu>
Mark Hildebrand <mhildebrand@ucdavis.edu>
Marton Erdos <marton.erdos@arm.com>
@@ -233,20 +258,18 @@ Matteo Andreozzi <matteo.andreozzi@arm.com> Matteo Andreozzi <Matteo.Andreozzi@a
Matteo M. Fusi <matteo.fusi@bsc.es>
Matt Evans <matt.evans@arm.com> Matt Evans <Matt.Evans@arm.com>
Matthew Poremba <matthew.poremba@amd.com> Matthew Poremba <Matthew.Poremba@amd.com>
Matthias Boettcher <matthias.boettcher@arm.com>
Matthias Hille <matthiashille8@gmail.com>
Matthias Jung <jungma@eit.uni-kl.de>
Matthias Jung <matthias.jung@iese.fraunhofer.de>
Matt Horsnell <matt.horsnell@arm.com> Matt Horsnell <matt.horsnell@ARM.com>
Matthias Jung <matthias.jung@iese.fraunhofer.de> Matthias Jung <jungma@eit.uni-kl.de>
Matt Horsnell <matt.horsnell@arm.com> Matt Horsnell <Matt.Horsnell@arm.com>
Matt Horsnell <matt.horsnell@arm.com>Matt Horsnell <Matt.Horsnell@ARM.com>
Matt Poremba <matthew.poremba@amd.com> Matt Poremba <Matthew.Poremba@amd.com>
Matt Sinclair <mattdsinclair@gmail.com> Matthew Sinclair <matthew.sinclair@amd.com>
Matt Sinclair <mattdsinclair.wisc@gmail.com> Matt Sinclair <Matthew.Sinclair@amd.com>
Matt Sinclair <mattdsinclair.wisc@gmail.com> Matt Sinclair <mattdsinclair@gmail.com>
Matt Sinclair <mattdsinclair.wisc@gmail.com> Matthew Sinclair <matthew.sinclair@amd.com>
Maurice Becker <madnaurice@googlemail.com>
Maxime Martinasso <maxime.cscs@gmail.com>
Maximilian Stein <maximilian.stein@tu-dresden.de>Maximilian Stein <m@steiny.biz>
Maximilien Breughe <maximilien.breughe@elis.ugent.be> Maximilien Breughe <Maximilien.Breughe@elis.ugent.be>
Melissa Jost <melissakjost@gmail.com>
Melissa Jost <melissakjost@gmail.com> Melissa Jost <50555529+mkjost0@users.noreply.github.com>
Michael Adler <Michael.Adler@intel.com>
Michael Boyer <Michael.Boyer@amd.com>
Michael LeBeane <michael.lebeane@amd.com> Michael LeBeane <Michael.Lebeane@amd.com>
@@ -262,7 +285,6 @@ Min Kyu Jeong <minkyu.jeong@arm.com> Min Kyu Jeong <MinKyu.Jeong@arm.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitchell Hayenga <Mitchell.Hayenga@ARM.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga ext:(%2C%20Amin%20Farmahini%20%3Caminfar%40gmail.com%3E) <mitch.hayenga+gem5@gmail.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga <Mitch.Hayenga@arm.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga <Mitch.Hayenga@ARM.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga <mitch.hayenga+gem5@gmail.com>
Mohammad Alian <m.alian1369@gmail.com>
Monir Mozumder <monir.mozumder@amd.com>
@@ -279,13 +301,17 @@ Nathan Binkert <nate@binkert.org> Nathan Binkert <binkertn@umich.edu>
Nayan Deshmukh <nayan26deshmukh@gmail.com>
Neha Agarwal <neha.agarwal@arm.com>
Neil Natekar <nanatekar@ucdavis.edu>
Nicholas Lindsay <nicholas.lindsay@arm.com>
Nicholas Lindsay <nicholas.lindsay@arm.com> Nicholas Lindsay <Nicholas.Lindsey@arm.com>
Nicholas Mosier <nmosier@stanford.edu> Nicholas Mosier <nh.mosier@gmail.com>
Nicolas Boichat <drinkcat@google.com>
Nicolas Derumigny <nderumigny@gmail.com>
Nicolas Zea <nicolas.zea@gmail.com>
Nikolaos Kyparissas <nikolaos.kyparissas@arm.com>
Nikos Nikoleris <nikos.nikoleris@arm.com> Nikos Nikoleris <nikos.nikoleris@gmail.com>
Nilay Vaish ext:(%2C%20Timothy%20Jones%20%3Ctimothy.jones%40cl.cam.ac.uk%3E) <nilay@cs.wisc.edu>
Nils Asmussen <nils.asmussen@barkhauseninstitut.org> Nils Asmussen <nilsasmussen7@gmail.com>
Nitesh Narayana <nitesh.dps@gmail.com>
Nitish Arya <42148385+aryanitish@users.noreply.github.com>
Noah Katz <nkatz@rivosinc.com>
ntampouratzis <ntampouratzis@isc.tuc.gr>
Nuwan Jayasena <Nuwan.Jayasena@amd.com>
@@ -293,7 +319,6 @@ Ola Jeppsson <ola.jeppsson@gmail.com>
Omar Naji <Omar.Naji@arm.com>
Onur Kayiran <onur.kayiran@amd.com>
Pablo Prieto <pablo.prieto@unican.es>
paikunal <kunpai@ucdavis.edu>
Palle Lyckegaard <palle@lyckegaard.dk>
Pau Cabre <pau.cabre@metempsy.com>
Paul Rosenfeld <prosenfeld@micron.com> Paul Rosenfeld <dramninjas@gmail.com>
@@ -308,29 +333,39 @@ Po-Hao Su <supohaosu@gmail.com>
Polina Dudnik <pdudnik@cs.wisc.edu> Polina Dudnik <pdudnik@gmail.com>
Polydoros Petrakis <ppetrak@ics.forth.gr>
Pouya Fotouhi <pfotouhi@ucdavis.edu> Pouya Fotouhi <Pouya.Fotouhi@amd.com>
Prajwal Hegde <prhegde@wisc.edu>
Prakash Ramrakhyani <prakash.ramrakhyani@arm.com> Prakash Ramrakhani <Prakash.Ramrakhani@arm.com>
Prakash Ramrakhyani <prakash.ramrakhyani@arm.com> Prakash Ramrakhyani <Prakash.Ramrakhyani@arm.com>
Pritha Ghoshal <pritha9987@tamu.edu>
Pu (Luke) Yi <lukeyi@stanford.edu>
Quentin Forcioli <quentin.forcioli@telecom-paris.fr>
Radhika Jagtap <radhika.jagtap@arm.com> Radhika Jagtap <radhika.jagtap@ARM.com>
Rahul Thakur <rjthakur@google.com>
Reiley Jeapaul <Reiley.Jeyapaul@arm.com>
Rajarshi Das <drajarsh@gmail.com>
Ranganath (Bujji) Selagamsetty <bujji.selagamsetty@amd.com> BujSet <ranganath1000@gmail.com>
Razeza <borisov.dn@phystech.edu>
Reiley Jeapaul <reiley.jeyapaul@arm.com> Reiley Jeapaul <Reiley.Jeyapaul@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai Gonzalez Alberquilla <rekai.gonzalezalberquilla@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai Gonzalez Alberquilla <Rekai.GonzalezAlberquilla@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai Gonzalez-Alberquilla <Rekai.GonzalezAlberquilla@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai <Rekai.GonzalezAlberquilla@arm.com>
Rene de Jong <rene.dejong@arm.com>
Ricardo Alves <ricardo.alves@arm.com>
Richard Cooper <richard.cooper@arm.com>
Richard D. Strong <r.d.strong@gmail.com>
Richard Strong <rstrong@hp.com> Richard D. Strong <r.d.strong@gmail.com>
Richard Strong <rstrong@hp.com> Richard Strong <r.d.strong@gmail.com>
Richard Strong <rstrong@hp.com> Richard Strong <rstrong@cs.ucsd.edu>
Richard Strong <rstrong@hp.com> Rick Strong <rstrong@cs.ucsd.edu>
Rico Amslinger <rico.amslinger@informatik.uni-augsburg.de>
Riken Gohil <Riken.Gohil@arm.com>
Rizwana Begum <rb639@drexel.edu>
Robert Hauser <85344819+robhau@users.noreply.github.com>
Robert Kovacsics <rmk35@cl.cam.ac.uk>
Robert Scheffel <robert.scheffel1@tu-dresden.de> Robert <robert.scheffel1@tu-dresden.de>
Rocky Tatiefo <rtatiefo@google.com>
Roger Chang <rogerycchang@google.com> rogerchang23424 <rogerycchang@google.com>
Roger Chang <rogerycchang@google.com> rogerchang23424 <32214817+rogerchang23424@users.noreply.github.com>
Roger Chang <rogerycchang@google.com> rogerchang23424 <aucixw45876@gmail.com>
Roger Chang <rogerycchang@google.com> Yu-Cheng Chang <rogerycchang@google.com>
Rohit Kurup <rohit.kurup@arm.com>
Ron Dreslinski <rdreslin@umich.edu> Ronald Dreslinski <rdreslin@umich.edu>
Ruben Ayrapetyan <ruben.ayrapetyan@arm.com>
@@ -342,23 +377,21 @@ sacak32 <byrakocalan99@gmail.com>
Sampad Mohapatra <sampad.mohapatra@gmail.com>
Samuel Grayson <sam@samgrayson.me>
Samuel Stark <samuel.stark2@arm.com>
Sandipan Das <31861871+sandip4n@users.noreply.github.com>
Sandipan Das <sandipan@linux.ibm.com> Sandipan Das <31861871+sandip4n@users.noreply.github.com>
Santi Galan <santi.galan@metempsy.com>
Sascha Bischoff <sascha.bischoff@arm.com> Sascha Bischoff <sascha.bischoff@ARM.com>
Sascha Bischoff <sascha.bischoff@arm.com> Sascha Bischoff <Sascha.Bischoff@ARM.com>
Saúl Adserias <33020671+saul44203@users.noreply.github.com>
Sean McGoogan <Sean.McGoogan@arm.com>
Sean Wilson <spwilson2@wisc.edu>
Sergei Trofimov <sergei.trofimov@arm.com>
Severin Wischmann <wiseveri@student.ethz.ch> Severin Wischmann ext:(%2C%20Ioannis%20Ilkos%20%3Cioannis.ilkos09%40imperial.ac.uk%3E) <wiseveri@student.ethz.ch>
Shawn Rosti <shawn.rosti@gmail.com>
Sherif Elhabbal <elhabbalsherif@gmail.com>
Shivani Parekh <shparekh@ucdavis.edu>
Shivani <shparekh@ucdavis.edu>
Shivani Parekh <shparekh@ucdavis.edu> Shivani <shparekh@ucdavis.edu>
Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Simon Park <seminpark@google.com>
Somayeh Sardashti <somayeh@cs.wisc.edu>
Sooraj Puthoor <puthoorsooraj@gmail.com>
Sooraj Puthoor <Sooraj.Puthoor@amd.com>
Sooraj Puthoor <puthoorsooraj@gmail.com> Sooraj Puthoor <Sooraj.Puthoor@amd.com>
Sophiane Senni <sophiane.senni@gmail.com>
Soumyaroop Roy <sroy@cse.usf.edu>
Srikant Bharadwaj <srikant.bharadwaj@amd.com>
@@ -370,7 +403,6 @@ Steve Raasch <sraasch@umich.edu>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E%2C%20Ali%20Saidi%20%3CAli.Saidi%40ARM.com%3E) <stever@gmail.com>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt <stever@eecs.umich.edu>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt <steve.reinhardt@amd.com>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt <Steve.Reinhardt@amd.com>
Stian Hvatum <stian@dream-web.no>
Sudhanshu Jha <sudhanshu.jha@arm.com>
Sujay Phadke <electronicsguy123@gmail.com>
@@ -378,16 +410,18 @@ Sungkeun Kim <ksungkeun84@tamu.edu>
Swapnil Haria <swapnilster@gmail.com> Swapnil Haria <swapnilh@cs.wisc.edu>
Taeho Kgil <tkgil@umich.edu>
Tao Zhang <tao.zhang.0924@gmail.com>
Thilo Vörtler <thilo.voertler@coseda-tech.com> root <thilo.voertler@coseda-tech.com>
Thomas Grass <Thomas.Grass@ARM.com>
Tiago Mück <tiago.muck@arm.com> Tiago Muck <tiago.muck@arm.com>
Tiberiu Bucur <36485854+TiberiuBucur@users.noreply.github.com>
Tim Harris <tharris@microsoft.com>
Timothy Hayes <timothy.hayes@arm.com>
Timothy M. Jones <timothy.jones@arm.com> Timothy Jones <timothy.jones@cl.cam.ac.uk>
Timothy M. Jones <timothy.jones@arm.com> Timothy M. Jones <timothy.jones@cl.cam.ac.uk>
Timothy M. Jones <timothy.jones@arm.com> Timothy M. Jones <tjones1@inf.ed.ac.uk>
Tom Jablin <tjablin@gmail.com>
Tommaso Marinelli <tommarin@ucm.es>
Tom Rollet <tom.rollet@huawei.com>
Tommaso Marinelli <tommarin@ucm.es>
Tong Shen <endlessroad@google.com>
Tony Gutierrez <anthony.gutierrez@amd.com> Anthony Gutierrez <atgutier@umich.edu>
Travis Boraten <travis.boraten@amd.com>
@@ -401,6 +435,7 @@ Victor Garcia <victor.garcia@arm.com>
Vilas Sridharan <vilas.sridharan@gmail.com>
Vincentius Robby <acolyte@umich.edu>
Vince Weaver <vince@csl.cornell.edu>
Vishnu Ramadas <vramadas@outlook.com>
vramadas95 <vramadas@wisc.edu>
vsoria <victor.soria@bsc.es>
Wade Walker <wade.walker@arm.com>
@@ -409,14 +444,16 @@ Weiping Liao <weipingliao@google.com>
Wende Tan <twd2@163.com>
Wendy Elsasser <wendy.elsasser@arm.com>
William Wang <william.wang@arm.com> William Wang <William.Wang@arm.com>
William Wang <william.wang@arm.com> William Wang <William.Wang@ARM.com>
Willy Wolff <willy.mh.wolff.ml@gmail.com>
Wing Li <wingers@google.com>
wmin0 <wmin0@hotmail.com>
Xiangyu Dong <rioshering@gmail.com>
Xianwei Zhang <xianwei.zhang.@amd.com> Xianwei Zhang <xianwei.zhang@amd.com>
Xiaoyu Ma <xiaoyuma@google.com>
Xin Ouyang <xin.ouyang@streamcomputing.com>
Xiongfei <xiongfei.liao@gmail.com>
Xuan Hu <huxuan@bosc.ac.cn>
Yan Lee <yanlee@google.com>
Yasuko Eckert <yasuko.eckert@amd.com>
Yen-lin Lai <yenlinlai@google.com>
Yifei Liu <liu.ad2039@gmail.com>
@@ -426,7 +463,10 @@ Yuan Yao <yuanyao@seas.harvard.edu>
Yuetsu Kodama <yuetsu.kodama@riken.jp> yuetsu.kodama <yuetsu.kodama@riken.jp>
Yu-hsin Wang <yuhsingw@google.com>
Zhang Zheng <perise@gmail.com>
Zhantong Qiu <ztqiu@ucdavis.edu>
Zhantong Qiu <ztqiu@ucdavis.edu> studyztp <studyztp@gmail.com>
Zhengrong Wang <seanzw@ucla.edu> seanzw <seanyukigeek@gmail.com>
Zhengrong Wang <seanzw@ucla.edu> Zhengrong Wang <seanyukigeek@gmail.com>
zhongchengyong <zhongcy93@gmail.com>
Zicong Wang <wangzicong@nudt.edu.cn>
Zixian Cai <2891235+caizixian@users.noreply.github.com>
zmckevitt <zack.mckevitt@gmail.com>

View File

@@ -1,3 +1,161 @@
# Version 24.0
gem5 Version 24.0 is the first major release of 2024.
During this time there have been 298 pull requests merged, comprising over 600 commits, from 56 unique contributors.
## API and user-facing changes
* The GCN3 GPU model has been removed in favor of the newer VEGA GPU model.
* gem5 now supports building, running, and simulating Ubuntu 24.04.
### Compiler and OS support
As of this release gem5 supports Clang versions 6 to 16 and GCC versions 10 to 13.
While other compilers and versions may work, they are not regularly tested.
gem5 now supports building, running, and simulating on Ubuntu 24.04.
We continue to support 22.04 with 20.04 being deprecated in the coming year.
The majority of our testing is done on Ubuntu LTS systems though Apple Silicon machines and other Linux distributions have also been used regularly during development.
Improvements have been made to ensure a wider support of operating systems.
## New features
### gem5 MultiSim: Multiprocessing for gem5
The gem5 "MultiSim" module allows for multiple simulations to be run from a single gem5 execution via a single gem5 configuration script.
This allows for multiple simulations to be run in parallel in a structured manner.
To use MultiSim first create multiple simulators and add them to the MultiSim with the `add_simulator` function.
If needed, limit the maximum number of parallel processes with the `set_num_processes` function.
Then run the simulations in parallel with the `gem5` binary using `-m gem5.utils.multisim`.
Here is an example of how to use MultiSim:
```python
import gem5.utils.multisim as multisim
# Set the maximum number of processes to run in parallel
multisim.set_num_processes(4)
# Create multiple simulators.
# In this case, one for each workload in the benchmark suite.
for workload in benchmark_suite:
board = X86Board(
# ...
)
board.set_workload(workload)
    # Useful to set the ID here. This is used to create unique output
    # directories for each gem5 process and can be used to identify and
    # run gem5 processes individually.
simulator = Simulator(board, id=f"{workload.get_id()}")
multisim.add_simulator(simulator)
```
Then to run the simulations in parallel:
```sh
<gem5 binary> -m gem5.utils.multisim <config script>
```
The output directory ("m5out" by default) will contain sub-directories for each simulation run.
The sub-directory will be named after the simulator ID set in the configuration script.
We therefore recommend setting the simulator ID to something meaningful to help identify the output directories (i.e., the workload run or something identifying the meaningful characteristics of the simulated system in comparison to others).
If only one of the simulations specified in the config needs to be run, you can do so with:
```sh
<gem5 binary> <config script> --list # Lists the simulations by ID
<gem5 binary> <config script> <ID> # Run the simulation with the specified ID.
```
Example scripts of using MultiSim can be found in "configs/example/gem5_library/multisim".
### RISC-V Vector Extension Support
There have been significant improvements to the RVV support in gem5 including
* Fixed viota (#1137)
* Fixed vrgather (#1134)
* Added RVV FP16 support (#1123)
* Fixed widening and narrowing instructions (#1079)
* Fixed bug in vfmv.f.s (#863)
* Add unit stride segment loads and stores (#851) (#913)
* Fix vl in masked load/store (#830)
* Add unit-stride loads (#794)
* Fix many RVV instructions (#814) (#805) (#715)
### General RISC-V bugfixes
* Fixed problem in TLB lookup (#1264)
* Fixed sign-extended branch target (#1173)
* Fixed compressed jump instructions (#1163)
* Fixed GDB connection (#1152)
* Fixed CSR behavior (#1099)
* Add Integer conditional operations Zicond (#1078)
* Add RISC-V Semihosting support (#681)
* Added more detailed instruction types (#589)
* Fixed 32-bit m5op arguments (#900)
* Fixed c.fswsp and c.fsw (#998) (#1005)
* Update PLIC implementation (#886)
* Fix fflags behavior in O3 (#868)
* Add support for local interrupts (#813)
* Removed bit 63 of physical address (#756)
## Improvements
* Added a new generator which can generate requests based on [spatter](https://github.com/hpcgarage/spatter) patterns.
* KVM is now supported in the gem5 Standard Library ARM Board.
* Generic Cache template added to the Standard Library: https://github.com/gem5/gem5/pull/745
* Support added for partitioning caches.
* The Standard Library `obtain_resources` function can request multiple resources at once thus reducing delay associated with multiple requests.
* An official gem5 DevContainer has been added to the gem5 repository.
This can be used to build and run gem5 in a consistent environment and enables GitHub Codespaces support.
### gem5 Python Statistics
The gem5 Python statistics API has been improved.
The gem5 Project's general intent with this improvement is to make it easier and more desirable to obtain and interact with gem5 simulation statistics via Python.
For example, the following code snippet demonstrates how to obtain statistics from a gem5 simulation:
```python
from m5.stats.gem5stats import get_simstat
## Setup and run the configuration ...
simstat = get_simstat(board)
# Print the number of cycles the CPU at index 0 has executed.
print(simstat.cpu[0].numCycles)
# Strings can also be used to access statistics.
print(simstat['cpu'][0]['numCycles'])
# Print the total number of cycles executed by all CPUs.
print(sum(simstat.cpu[i].numCycles for i in range(len(simstat.cpu))))
```
We hope the usage of the gem5 Python statistics API will be more intuitive and easier to use while allowing better processing of statistical data.
### GPU Model
* Support for MI300X and MI200 GPU models including their features and most instructions.
* ROCm 6.1 disk image and compile docker files have been added. ROCm 5.4.2 and 4.2 resources are removed.
* The deprecated GCN3 ISA has been removed. Use VEGA instead.
## Bug Fixes
* An integer overflow error known to affect the `AddrRange` class has been fixed.
* Fix fflags behavior of floating point instruction in RISC-V for Out-of-Order CPUs.
### Arm FEAT_MPAM Support
An initial implementation of FEAT_MPAM has been introduced in gem5 with the capability to statically partition
classic caches. Guidance on how to use this is available on an Arm community [blog post](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/gem5-cache-partitioning)
# Version 23.1
gem5 Version 23.1 is our first release where the development has been on GitHub.

View File

@@ -117,6 +117,8 @@ AddOption('--no-compress-debug', action='store_true',
help="Don't compress debug info in build files")
AddOption('--with-lto', action='store_true',
help='Enable Link-Time Optimization')
AddOption('--with-libcxx', action='store_true',
help='Use libc++ as the C++ standard library (requires Clang)')
AddOption('--verbose', action='store_true',
help='Print full tool command lines')
AddOption('--without-python', action='store_true',
@@ -550,11 +552,6 @@ for variant_path in variant_paths:
env.Append(CCFLAGS=['-pipe'])
env.Append(CCFLAGS=['-fno-strict-aliasing'])
# Enable -Wall and -Wextra and then disable the few warnings that
# we consistently violate
env.Append(CCFLAGS=['-Wall', '-Wundef', '-Wextra',
'-Wno-sign-compare', '-Wno-unused-parameter'])
# We always compile using C++17
env.Append(CXXFLAGS=['-std=c++17'])
@@ -567,6 +564,16 @@ for variant_path in variant_paths:
with gem5_scons.Configure(env) as conf:
conf.CheckLinkFlag('-Wl,--as-needed')
want_libcxx = GetOption('with_libcxx')
if want_libcxx:
with gem5_scons.Configure(env) as conf:
# Try using libc++ if it supports the <filesystem> library.
code = '#include <filesystem>\nint main() { return 0; }'
if (not conf.CheckCxxFlag('-stdlib=libc++') or
not conf.CheckLinkFlag('-stdlib=libc++', code=code)
):
error('Requested libc++ but it is not usable')
linker = GetOption('linker')
if linker:
with gem5_scons.Configure(env) as conf:
@@ -597,6 +604,13 @@ for variant_path in variant_paths:
env.Append(LINKFLAGS=['-Wl,--no-keep-memory'])
else:
error("Unable to use --no-keep-memory with the linker")
# Treat warnings as errors but white list some warnings that we
# want to allow (e.g., deprecation warnings).
env.Append(CCFLAGS=['-Werror',
'-Wno-error=deprecated-declarations',
'-Wno-error=deprecated',
])
else:
error('\n'.join((
"Don't know what compiler options to use for your compiler.",
@@ -612,8 +626,8 @@ for variant_path in variant_paths:
"src/SConscript to support that compiler.")))
if env['GCC']:
if compareVersions(env['CXXVERSION'], "7") < 0:
error('gcc version 7 or newer required.\n'
if compareVersions(env['CXXVERSION'], "10") < 0:
error('gcc version 10 or newer required.\n'
'Installed version:', env['CXXVERSION'])
# Add the appropriate Link-Time Optimization (LTO) flags if
@@ -637,17 +651,6 @@ for variant_path in variant_paths:
'-fno-builtin-malloc', '-fno-builtin-calloc',
'-fno-builtin-realloc', '-fno-builtin-free'])
if compareVersions(env['CXXVERSION'], "9") < 0:
# `libstdc++fs`` must be explicitly linked for `std::filesystem``
# in GCC version 8. As of GCC version 9, this is not required.
#
# In GCC 7 the `libstdc++fs`` library explicit linkage is also
# required but the `std::filesystem` is under the `experimental`
# namespace(`std::experimental::filesystem`).
#
# Note: gem5 does not support GCC versions < 7.
env.Append(LIBS=['stdc++fs'])
elif env['CLANG']:
if compareVersions(env['CXXVERSION'], "6") < 0:
error('clang version 6 or newer required.\n'
@@ -665,7 +668,7 @@ for variant_path in variant_paths:
env.Append(TCMALLOC_CCFLAGS=['-fno-builtin'])
if compareVersions(env['CXXVERSION'], "11") < 0:
if not want_libcxx and compareVersions(env['CXXVERSION'], "11") < 0:
# `libstdc++fs`` must be explicitly linked for `std::filesystem``
# in clang versions 6 through 10.
#
@@ -679,7 +682,7 @@ for variant_path in variant_paths:
# On Mac OS X/Darwin we need to also use libc++ (part of XCode) as
# opposed to libstdc++, as the later is dated.
if sys.platform == "darwin":
if not want_libcxx and sys.platform == "darwin":
env.Append(CXXFLAGS=['-stdlib=libc++'])
env.Append(LIBS=['c++'])
@@ -688,20 +691,26 @@ for variant_path in variant_paths:
if GetOption('with_ubsan'):
sanitizers.append('undefined')
if GetOption('with_asan'):
# Available for gcc >= 5 or llvm >= 3.1 both a requirement
# by the build system
sanitizers.append('address')
suppressions_file = Dir('util').File('lsan-suppressions').get_abspath()
suppressions_opt = 'suppressions=%s' % suppressions_file
suppressions_opts = ':'.join([suppressions_opt,
'print_suppressions=0'])
env['ENV']['LSAN_OPTIONS'] = suppressions_opts
print()
warning('To suppress false positive leaks, set the LSAN_OPTIONS '
'environment variable to "%s" when running gem5' %
suppressions_opts)
warning('LSAN_OPTIONS=%s' % suppressions_opts)
print()
if env['GCC']:
# Address sanitizer is not supported with GCC. Please see Github
# Issue https://github.com/gem5/gem5/issues/916 for more details.
warning("Address Sanitizer is not supported with GCC. "
"This option will be ignored.")
else:
# Available for llvm >= 3.1. A requirement by the build system.
sanitizers.append('address')
suppressions_file = Dir('util').File('lsan-suppressions')\
.get_abspath()
suppressions_opt = 'suppressions=%s' % suppressions_file
suppressions_opts = ':'.join([suppressions_opt,
'print_suppressions=0'])
env['ENV']['LSAN_OPTIONS'] = suppressions_opts
print()
warning('To suppress false positive leaks, set the LSAN_OPTIONS '
'environment variable to "%s" when running gem5' %
suppressions_opts)
warning('LSAN_OPTIONS=%s' % suppressions_opts)
print()
if sanitizers:
sanitizers = ','.join(sanitizers)
if env['GCC'] or env['CLANG']:

View File

@@ -7,3 +7,4 @@ USE_POWER_ISA=y
USE_RISCV_ISA=y
USE_SPARC_ISA=y
USE_X86_ISA=y
USE_TEST_OBJECTS=y

View File

@@ -1,6 +0,0 @@
RUBY=y
RUBY_PROTOCOL_GPU_VIPER=y
BUILD_ISA=y
USE_X86_ISA=y
GCN3_GPU_ISA=y
BUILD_GPU=y

View File

@@ -211,8 +211,7 @@ code.indent()
if sim_object == SimObject:
code(
"""
SimObjectParams() {}
virtual ~SimObjectParams() {}
virtual ~SimObjectParams() = default;
std::string name;
"""

View File

@@ -224,7 +224,7 @@ for cpu in system.cpu:
if ObjectList.is_kvm_cpu(CPUClass) or ObjectList.is_kvm_cpu(FutureClass):
if buildEnv["USE_X86_ISA"]:
system.kvm_vm = KvmVM()
system.m5ops_base = 0xFFFF0000
system.m5ops_base = max(0xFFFF0000, Addr(args.mem_size).getValue())
for process in multiprocesses:
process.useArchPT = True
process.kvmInSE = True

View File

@@ -335,6 +335,12 @@ parser.add_argument(
default="dynamic",
help="register allocation policy (simple/dynamic)",
)
parser.add_argument(
"--register-file-cache-size",
type=int,
default=0,
help="number of registers in cache",
)
parser.add_argument(
"--dgpu",
@@ -369,11 +375,33 @@ parser.add_argument(
parser.add_argument(
"--gfx-version",
type=str,
default="gfx801",
default="gfx902",
choices=GfxVersion.vals,
help="Gfx version for gpuNote: gfx902 is not fully supported by ROCm",
)
parser.add_argument(
"--tcp-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcp",
)
parser.add_argument(
"--tcc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcc",
)
# sqc rp both changes sqc rp and scalar cache rp
parser.add_argument(
"--sqc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for sqc",
)
Ruby.define_options(parser)
# add TLB options to the parser
@@ -428,6 +456,7 @@ print(
# shader is the GPU
shader = Shader(
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
clk_domain=SrcClockDomain(
clock=args.gpu_clock,
voltage_domain=VoltageDomain(voltage=args.gpu_voltage),
@@ -493,6 +522,7 @@ for i in range(n_cu):
vrfs = []
vrf_pool_mgrs = []
srfs = []
rfcs = []
srf_pool_mgrs = []
for j in range(args.simds_per_cu):
for k in range(shader.n_wf):
@@ -537,10 +567,16 @@ for i in range(n_cu):
simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size
)
)
rfcs.append(
RegisterFileCache(
simd_id=j, cache_size=args.register_file_cache_size
)
)
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
compute_units[-1].scalar_register_file = srfs
compute_units[-1].register_file_cache = rfcs
compute_units[-1].register_manager = RegisterManager(
policy=args.registerManagerPolicy,
vrf_pool_managers=vrf_pool_mgrs,
@@ -671,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}")
gpu_hsapp = HSAPacketProcessor(
pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues
)
dispatcher = GPUDispatcher()
dispatcher = GPUDispatcher(kernel_exit_events=True)
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
@@ -798,6 +834,8 @@ if fast_forward:
# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx)
system.exit_on_work_items = True
# create Ruby system
system.piobus = IOXBar(
width=32, response_latency=0, frontend_latency=0, forward_latency=0
@@ -938,19 +976,15 @@ root = Root(system=system, full_system=False)
# knows what type of GPU hardware we are simulating
if args.dgpu:
assert args.gfx_version in [
"gfx803",
"gfx900",
], "Incorrect gfx version for dGPU"
if args.gfx_version == "gfx803":
hsaTopology.createFijiTopology(args)
elif args.gfx_version == "gfx900":
if args.gfx_version == "gfx900":
hsaTopology.createVegaTopology(args)
else:
assert args.gfx_version in [
"gfx801",
"gfx902",
], "Incorrect gfx version for APU"
hsaTopology.createCarrizoTopology(args)
hsaTopology.createRavenTopology(args)
m5.ticks.setGlobalFrequency("1THz")
if args.abs_max_tick:
@@ -976,6 +1010,41 @@ if args.fast_forward:
exit_event = m5.simulate(maxtick)
while True:
if (
exit_event.getCause() == "m5_exit instruction encountered"
or exit_event.getCause() == "user interrupt received"
or exit_event.getCause() == "simulate() limit reached"
or "exiting with last active thread context" in exit_event.getCause()
):
print(f"breaking loop due to: {exit_event.getCause()}.")
break
elif "checkpoint" in exit_event.getCause():
assert args.checkpoint_dir is not None
m5.checkpoint(args.checkpoint_dir)
print("breaking loop with checkpoint")
break
elif "GPU Kernel Completed" in exit_event.getCause():
print("GPU Kernel Completed dump and reset")
m5.stats.dump()
m5.stats.reset()
elif "GPU Blit Kernel Completed" in exit_event.getCause():
print("GPU Blit Kernel Completed dump and reset")
m5.stats.dump()
m5.stats.reset()
elif "workbegin" in exit_event.getCause():
print("m5 work begin dump and reset")
m5.stats.dump()
m5.stats.reset()
elif "workend" in exit_event.getCause():
print("m5 work end dump and reset")
m5.stats.dump()
m5.stats.reset()
else:
print(f"Unknown exit event: {exit_event.getCause()}. Continuing...")
exit_event = m5.simulate(maxtick - m5.curTick())
if args.fast_forward:
if exit_event.getCause() == "a thread reached the max instruction count":
m5.switchCpus(system, switch_cpu_list)

View File

@@ -53,15 +53,24 @@ from common import (
MemConfig,
ObjectList,
)
from common.cores.arm import HPI
from common.cores.arm import (
HPI,
O3_ARM_v7a,
)
# Pre-defined CPU configurations. Each tuple must be ordered as : (cpu_class,
# l1_icache_class, l1_dcache_class, walk_cache_class, l2_Cache_class). Any of
# l1_icache_class, l1_dcache_class, l2_Cache_class). Any of
# the cache class may be 'None' if the particular cache is not present.
cpu_types = {
"atomic": (AtomicSimpleCPU, None, None, None),
"minor": (MinorCPU, devices.L1I, devices.L1D, devices.L2),
"hpi": (HPI.HPI, HPI.HPI_ICache, HPI.HPI_DCache, HPI.HPI_L2),
"o3": (
O3_ARM_v7a.O3_ARM_v7a_3,
O3_ARM_v7a.O3_ARM_v7a_ICache,
O3_ARM_v7a.O3_ARM_v7a_DCache,
O3_ARM_v7a.O3_ARM_v7aL2,
),
}

View File

@@ -0,0 +1,201 @@
# Copyright (c) 2024 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# This script showcases the functionality of cache partitioning policies,
# containing a simple system composed of a memory requestor (TrafficGen),
# a cache enforcing policies for requests and a SimpleMemory backing store.
#
# Using the Way policy, the cache should show the following statistics in the
# provided configuration:
#
# | Allocated Ways | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
# |----------------|---|-----|-----|-----|-----|-----|-----|------|
# | Cache Hits | 0 | 256 | 384 | 512 | 640 | 768 | 896 | 1024 |
#
# Using the MaxCapacity policy, expected results are the following:
#
# | Allocation % | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
# |--------------|----|-----|-----|-----|-----|-----|-----|-----|-----|------|
# | Cache Hits | 0 | 152 | 307 | 409 | 512 | 614 | 716 | 819 | 921 | 1024 |
import argparse
import m5
from m5.objects import *
def capacityAllocation(capacity_str):
    """Parse and validate a Max Capacity policy allocation from the CLI.

    Converts the raw argument string to a float and checks that it lies
    within the [0, 1] range required by the Max Capacity partitioning
    policy.

    :param capacity_str: raw command-line argument string
    :returns: the parsed capacity as a float
    :raises argparse.ArgumentTypeError: if the value is outside [0, 1]
    """
    fraction = float(capacity_str)
    out_of_range = fraction > 1 or fraction < 0
    if out_of_range:
        raise argparse.ArgumentTypeError(
            "Max Capacity Policy needs allocation in range [0, 1]"
        )
    return fraction
def wayAllocation(way_str):
    """Parse and validate a Way policy allocation from the CLI.

    Converts the raw argument string to an int and checks that it is a
    non-negative number of ways for the Way partitioning policy.

    :param way_str: raw command-line argument string
    :returns: the parsed number of ways as an int
    :raises argparse.ArgumentTypeError: if the value is negative
    """
    ways = int(way_str)
    if ways >= 0:
        return ways
    raise argparse.ArgumentTypeError(
        "Way Policy needs positive number of ways"
    )
def generatePartPolicy(args):
    """Build the cache partitioning policy object selected on the CLI.

    Only the "way" and "max_capacity" policies are supported; any other
    value trips the assertion. All allocations are made for the default
    partition id 0.

    :param args: parsed argparse namespace with ``policy``,
        ``way_allocation`` and ``capacity_allocation`` attributes
    :returns: a WayPartitioningPolicy or MaxCapacityPartitioningPolicy
    """
    assert args.policy in [
        "way",
        "max_capacity",
    ], "Only support generating way and max_capacity policies"
    if args.policy == "max_capacity":
        return MaxCapacityPartitioningPolicy(
            partition_ids=[0], capacities=[args.capacity_allocation]
        )
    # args.policy == "way": allocate the first N ways to partition 0.
    ways = list(range(args.way_allocation))
    return WayPartitioningPolicy(
        allocations=[WayPolicyAllocation(partition_id=0, ways=ways)]
    )
def configSystem():
    """
    Configure base system and memory.

    Builds a System whose memory bus is a 128-byte-wide IOXBar, clocked by
    a 10THz clock domain, with a 64KiB SimpleMemory (1GiB/s bandwidth,
    10ns latency) attached to the crossbar's memory-side ports.

    :returns: the configured System object (cache and traffic generator
        are attached by the caller)
    """
    system = System(membus=IOXBar(width=128))
    system.clk_domain = SrcClockDomain(
        clock="10THz",
        voltage_domain=VoltageDomain(),
    )
    # Memory configuration: simple fixed-latency, fixed-bandwidth store.
    system.mem_ctrl = SimpleMemory(bandwidth="1GiB/s", latency="10ns")
    # add memory: a 64KiB address range, hooked to the crossbar.
    system.mem_ctrl.range = AddrRange("64KiB")
    system.mem_ctrl.port = system.membus.mem_side_ports
    return system
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--policy",
default="way",
choices=["way", "max_capacity"],
help="This option defines which Cache Partitioning Policy to use for "
"the system cache",
)
parser.add_argument(
"--capacity-allocation",
type=capacityAllocation,
default=0.5,
help="The amount of the cache to partition to the default PartitionID "
"when using Max Capacity Cache Partitioning Policy in [0,1] range",
)
parser.add_argument(
"--way-allocation",
type=wayAllocation,
default=4,
help="The number of ways in the cache to partition to the default "
"PartitionID when using Way Cache Partitioning Policy",
)
args = parser.parse_args()
m5.ticks.setGlobalFrequency("10THz")
system = configSystem()
# create a cache to sit between the memory and traffic gen to enforce
# partitioning policies
part_manager = PartitionManager(
partitioning_policies=[generatePartPolicy(args)]
)
system.cache = NoncoherentCache(
size="64KiB",
assoc=8,
partitioning_manager=part_manager,
tag_latency=0,
data_latency=0,
response_latency=0,
mshrs=1,
tgts_per_mshr=8,
write_buffers=1,
replacement_policy=MRURP(),
)
system.cache.mem_side = system.membus.cpu_side_ports
# instantiate traffic gen and connect to crossbar
system.tgen = PyTrafficGen()
system.tgen.port = system.cache.cpu_side
# finalise config and run simulation
root = Root(full_system=False, system=system)
root.system.mem_mode = "timing"
m5.instantiate()
# configure traffic generator to do 2x 64KiB sequential reads from address 0
# to 65536; one to warm up the cache one to test cache partitioning
linear_tgen = system.tgen.createLinear(
1000000000, 0, 65536, 64, 1, 1, 100, 65536
)
exit_tgen = system.tgen.createExit(1)
system.tgen.start([linear_tgen, linear_tgen, exit_tgen])
# handle exit reporting
exit_event = m5.simulate(2000000000)
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")

View File

@@ -84,7 +84,7 @@ board.set_se_binary_workload(
# Any resource specified in this file will be automatically retrieved.
# At the time of writing, this file is a WIP and does not contain all
# resources. Jira ticket: https://gem5.atlassian.net/browse/GEM5-1096
obtain_resource("arm-hello64-static")
obtain_resource("arm-hello64-static", resource_version="1.0.0")
)
# Lastly we run the simulation.

View File

@@ -0,0 +1,143 @@
# Copyright (c) 2022-23 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
This script further shows an example of booting an ARM based full system Ubuntu
disk image. This simulation boots the disk image using 2 TIMING CPU cores. The
simulation ends when the startup is completed successfully (i.e. when an
`m5_exit` instruction is reached on successful boot).
Usage
-----
```
scons build/ARM/gem5.opt -j<NUM_CPUS>
./build/ARM/gem5.opt configs/example/gem5_library/arm-ubuntu-run-with-kvm.py
```
"""
from m5.objects import (
ArmDefaultRelease,
VExpress_GEM5_V1,
)
from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.boards.arm_board import ArmBoard
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_switchable_processor import (
SimpleSwitchableProcessor,
)
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.exit_event import ExitEvent
from gem5.simulate.simulator import Simulator
from gem5.utils.requires import requires
# This runs a check to ensure the gem5 binary is compiled for ARM.
requires(isa_required=ISA.ARM)
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
)
# Here we setup the parameters of the l1 and l2 caches.
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
l1d_size="16kB", l1i_size="16kB", l2_size="256kB"
)
# Memory: Dual Channel DDR4 2400 DRAM device.
memory = DualChannelDDR4_2400(size="2GB")
# Here we setup the processor. This is a special switchable processor in which
# a starting core type and a switch core type must be specified. Once a
# configuration is instantiated a user may call `processor.switch()` to switch
# from the starting core types to the switch core types. In this simulation
# we start with KVM cores to simulate the OS boot, then switch to the Timing
# cores for the command we wish to run after boot.
processor = SimpleSwitchableProcessor(
starting_core_type=CPUTypes.KVM,
switch_core_type=CPUTypes.TIMING,
isa=ISA.ARM,
num_cores=2,
)
# The ArmBoard requires a `release` to be specified. This adds all the
# extensions or features to the system. We are setting this to for_kvm()
# to enable KVM simulation.
release = ArmDefaultRelease.for_kvm()
# The platform sets up the memory ranges of all the on-chip and off-chip
# devices present on the ARM system. ARM KVM only works with VExpress_GEM5_V1
# on the ArmBoard at the moment.
platform = VExpress_GEM5_V1()
# Here we setup the board. The ArmBoard allows for Full-System ARM simulations.
board = ArmBoard(
clk_freq="3GHz",
processor=processor,
memory=memory,
cache_hierarchy=cache_hierarchy,
release=release,
platform=platform,
)
# This is the command to run after the system has booted. The first `m5 exit`
# will stop the simulation so we can switch the CPU cores from KVM to timing
# and continue the simulation to run the echo command, sleep for a second,
# then, again, call `m5 exit` to terminate the simulation. After simulation
# has ended you may inspect `m5out/system.pc.com_1.device` to see the echo
# output.
command = (
"m5 --addr=0x10010000 exit;"
+ "echo 'This is running on Timing CPU cores.';"
+ "m5 exit;"
)
# Here we set a full system workload. The "arm64-ubuntu-20.04-boot" boots
# Ubuntu 20.04. We use arm64-bootloader (boot.arm64) as the bootloader to use
# ARM KVM.
board.set_kernel_disk_workload(
kernel=obtain_resource(
"arm64-linux-kernel-5.4.49", resource_version="1.0.0"
),
disk_image=obtain_resource(
"arm64-ubuntu-20.04-img", resource_version="1.0.0"
),
bootloader=obtain_resource("arm64-bootloader", resource_version="1.0.0"),
readfile_contents=command,
)
# We define the system with the aforementioned system defined.
simulator = Simulator(
board=board,
on_exit_event={ExitEvent.EXIT: (func() for func in [processor.switch])},
)
# Once the system successfully boots, it encounters an
# `m5_exit instruction encountered`. We stop the simulation then. When the
# simulation has ended you may inspect `m5out/board.terminal` to see
# the stdout.
simulator.run()

View File

@@ -102,7 +102,9 @@ board = ArmBoard(
# Here we set a full system workload. The "arm64-ubuntu-20.04-boot" boots
# Ubuntu 20.04.
board.set_workload(obtain_resource("arm64-ubuntu-20.04-boot"))
board.set_workload(
obtain_resource("arm64-ubuntu-20.04-boot", resource_version="2.0.0")
)
# We define the system with the aforementioned system defined.

View File

@@ -97,7 +97,9 @@ board = ArmBoard(
platform=platform,
)
board.set_workload(obtain_resource("arm64-ubuntu-20.04-boot"))
board.set_workload(
obtain_resource("arm64-ubuntu-20.04-boot", resource_version="2.0.0")
)
simulator = Simulator(board=board)
simulator.run()

View File

@@ -90,7 +90,9 @@ board = SimpleBoard(
board.set_se_binary_workload(
# the workload should be the same as the save-checkpoint script
obtain_resource("riscv-hello"),
checkpoint=obtain_resource("riscv-hello-example-checkpoint"),
checkpoint=obtain_resource(
"riscv-hello-example-checkpoint", resource_version="3.0.0"
),
)
simulator = Simulator(

View File

@@ -107,8 +107,8 @@ board.set_se_binary_workload(
# Lastly we run the simulation.
max_ticks = 10**6
simulator = Simulator(board=board, full_system=False)
simulator.run(max_ticks=max_ticks)
simulator = Simulator(board=board, full_system=False, max_ticks=max_ticks)
simulator.run()
print(
"Exiting @ tick {} because {}.".format(

View File

@@ -60,8 +60,8 @@ from m5.stats import (
)
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
@@ -80,7 +80,7 @@ requires(isa_required=ISA.X86)
# The cache hierarchy can be different from the cache hierarchy used in taking
# the checkpoints
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32kB",
l1i_size="32kB",
l2_size="256kB",
@@ -125,7 +125,9 @@ board.set_se_simpoint_workload(
weight_list=[0.1, 0.2, 0.4, 0.3],
warmup_interval=1000000,
),
checkpoint=obtain_resource("simpoints-se-checkpoints-v23-0-v1"),
checkpoint=obtain_resource(
"simpoints-se-checkpoints", resource_version="3.0.0"
),
)

View File

@@ -78,7 +78,7 @@ board.set_se_binary_workload(
# Any resource specified in this file will be automatically retrieved.
# At the time of writing, this file is a WIP and does not contain all
# resources. Jira ticket: https://gem5.atlassian.net/browse/GEM5-1096
obtain_resource("arm-hello64-static")
obtain_resource("arm-hello64-static", resource_version="1.0.0")
)
# Lastly we run the simulation.

View File

@@ -48,8 +48,8 @@ from m5.stats import (
)
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
@@ -90,7 +90,7 @@ args = parser.parse_args()
# The cache hierarchy can be different from the cache hierarchy used in taking
# the checkpoints
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32kB",
l1i_size="32kB",
l2_size="256kB",

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2024 The Regents of the University of California.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""An example of a single configuration script for defining multiple
simulations through the gem5 `multisim` module.
This script creates 6 full system simulations by iterating through a suite
of benchmarks and different core counts.
Usage
-----
1. To run all the simulations defined in this script::
```shell
<gem5-binary> -m gem5.utils.multisim \
configs/example/gem5_library/multisim/multisim-fs-x86-npb.py
```
2. To run a specific simulation defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-fs-x86-npb.py \
<process_id> # e.g. npb-bt-a_cores-1
```
3. To list all the IDs of the simulations defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-fs-x86-npb.py -l
```
"""
import m5
import gem5.utils.multisim as multisim
from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.boards.x86_board import X86Board
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_switchable_processor import (
SimpleSwitchableProcessor,
)
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.simulator import (
ExitEvent,
Simulator,
)
from gem5.utils.requires import requires
requires(
isa_required=ISA.X86,
coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL,
)
from gem5.components.cachehierarchies.ruby.mesi_two_level_cache_hierarchy import (
MESITwoLevelCacheHierarchy,
)
def handle_workbegin():
    """WORKBEGIN exit-event handler: reset stats and switch CPU cores.

    Resets the statistics at the start of the region of interest and
    switches the SimpleSwitchableProcessor from its starting (ATOMIC)
    cores to the TIMING cores. Yielding False tells the Simulator to
    continue the run after handling the event.
    """
    m5.stats.reset()
    processor.switch()
    yield False
def handle_workend():
    # Exit-event handler (generator) for ExitEvent.WORKEND: dump the
    # statistics gathered over the region of interest.
    m5.stats.dump()
    # Yielding True tells the simulator to stop the simulation.
    yield True
# Set the maximum number of concurrent processes to be 3.
multisim.set_num_processes(3)
# Here we imagine an experiment wanting to run each NPB benchmark on the same
# system twice: once with 1 core and once with 2 cores.
for benchmark in obtain_resource("npb-benchmark-suite"):
    for num_cores in [1, 2]:
        # A two-level MESI cache hierarchy (private 32kB L1s, banked 256kB
        # L2), matching the MESI_Two_Level protocol required above.
        cache_hierarchy = MESITwoLevelCacheHierarchy(
            l1d_size="32kB",
            l1i_size="32kB",
            l2_size="256kB",
            l1d_assoc=8,
            l1i_assoc=8,
            l2_assoc=16,
            num_l2_banks=2,
        )
        memory = DualChannelDDR4_2400(size="3GB")
        # Switchable processor: boot with fast ATOMIC cores, then switch to
        # detailed TIMING cores when the WORKBEGIN exit event fires.
        processor = SimpleSwitchableProcessor(
            starting_core_type=CPUTypes.ATOMIC,
            switch_core_type=CPUTypes.TIMING,
            isa=ISA.X86,
            num_cores=num_cores,
        )
        board = X86Board(
            clk_freq="3GHz",
            processor=processor,
            memory=memory,
            cache_hierarchy=cache_hierarchy,
        )
        board.set_workload(benchmark)
        # Wire the generator handlers defined above: reset stats and switch
        # cores at WORKBEGIN; dump stats and stop at WORKEND.
        simulator = Simulator(
            board=board,
            on_exit_event={
                ExitEvent.WORKBEGIN: handle_workbegin(),
                ExitEvent.WORKEND: handle_workend(),
            },
        )
        # Give each simulation a unique id (e.g. "npb-bt-a_cores-1") so it
        # can be run or listed individually via the multisim CLI.
        simulator.set_id(f"{benchmark.get_id()}_cores-{num_cores}")
        multisim.add_simulator(simulator)

View File

@@ -0,0 +1,87 @@
# Copyright (c) 2024 The Regents of the University of California
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""An example of a single configuration script for defining multiple
simulations through the gem5 `multisim` module.
This script is intentionally minimal: each simulation simply prints a
message containing its process id.
Usage
-----
1. To run all the simulations defined in this script::
```shell
<gem5-binary> -m gem5.utils.multisim \
configs/example/gem5_library/multisim/multisim-print-this.py
```
2. To run a specific simulation defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-print-this.py \
process_id_1
```
3. To list all the IDs of the simulations defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-print-this.py -l
```
"""
import gem5.utils.multisim as multisim
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory import SingleChannelDDR3_1600
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.simulator import Simulator
# Set the maximum number of concurrent processes to be 2.
multisim.set_num_processes(2)
# Define five independent simulations, one per process id. Each iteration
# builds a fresh board/workload pair and registers it with multisim.
for process_id in range(5):
    # Minimal SE-mode system: no caches, a single TIMING x86 core, and a
    # small single-channel DDR3 memory.
    cache_hierarchy = NoCache()
    memory = SingleChannelDDR3_1600(size="32MB")
    processor = SimpleProcessor(
        cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=1
    )
    board = SimpleBoard(
        clk_freq="1GHz",
        processor=processor,
        memory=memory,
        cache_hierarchy=cache_hierarchy,
    )
    # Run the "x86-print-this" binary with a per-process message.
    # NOTE(review): the trailing `1` is presumably a repeat count accepted
    # by the binary — confirm against the resource's documentation.
    board.set_se_binary_workload(
        binary=obtain_resource("x86-print-this"),
        arguments=[f"Hello from process {process_id}", 1],
    )
    # The unique id identifies this simulation when running or listing
    # simulations via the multisim CLI.
    multisim.add_simulator(Simulator(board=board, id=f"process_{process_id}"))

View File

@@ -75,7 +75,9 @@ board = SimpleBoard(
cache_hierarchy=cache_hierarchy,
)
board.set_se_binary_workload(obtain_resource("power-hello"))
board.set_se_binary_workload(
obtain_resource("power-hello", resource_version="1.0.0")
)
# Lastly we run the simulation.
simulator = Simulator(board=board)

View File

@@ -40,8 +40,8 @@ Characteristics
"""
from gem5.components.boards.riscv_board import RiscvBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
from gem5.components.memory import SingleChannelDDR3_1600
from gem5.components.processors.cpu_types import CPUTypes
@@ -57,7 +57,7 @@ requires(isa_required=ISA.RISCV)
# Setup the cache hierarchy.
# For classic, PrivateL1PrivateL2 and NoCache have been tested.
# For Ruby, MESI_Two_Level and MI_example have been tested.
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
)
@@ -79,8 +79,10 @@ board = RiscvBoard(
# Set the Full System workload.
board.set_kernel_disk_workload(
kernel=obtain_resource("riscv-bootloader-vmlinux-5.10"),
disk_image=obtain_resource("riscv-disk-img"),
kernel=obtain_resource(
"riscv-bootloader-vmlinux-5.10", resource_version="1.0.0"
),
disk_image=obtain_resource("riscv-disk-img", resource_version="1.0.0"),
)
simulator = Simulator(board=board)

View File

@@ -57,12 +57,12 @@ from gem5.utils.requires import requires
requires(isa_required=ISA.RISCV)
# With RISCV, we use simple caches.
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
# Here we setup the parameters of the l1 and l2 caches.
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="16kB", l1i_size="16kB", l2_size="256kB"
)
@@ -88,7 +88,9 @@ board = RiscvBoard(
# Ubuntu 20.04. Once the system successfully boots it encounters an `m5_exit`
# instruction which stops the simulation. When the simulation has ended you may
# inspect `m5out/system.pc.com_1.device` to see the stdout.
board.set_workload(obtain_resource("riscv-ubuntu-20.04-boot"))
board.set_workload(
obtain_resource("riscv-ubuntu-20.04-boot", resource_version="3.0.0")
)
simulator = Simulator(board=board)
simulator.run()

View File

@@ -76,7 +76,7 @@ board = RISCVMatchedBoard(
# In the case where the `-i` flag is passed, we add the kernel argument
# `init=/root/exit.sh`. This means the simulation will exit after the Linux
# Kernel has booted.
workload = obtain_resource("riscv-ubuntu-20.04-boot")
workload = obtain_resource("riscv-ubuntu-20.04-boot", resource_version="3.0.0")
kernel_args = board.get_default_kernel_args()
if args.to_init:
kernel_args.append("init=/root/exit.sh")

View File

@@ -49,7 +49,9 @@ requires(isa_required=ISA.RISCV)
board = RISCVMatchedBoard()
# set the hello world riscv binary as the board workload
board.set_se_binary_workload(obtain_resource("riscv-hello"))
board.set_se_binary_workload(
obtain_resource("riscv-hello", resource_version="1.0.0")
)
# run the simulation with the RISCV Matched board
simulator = Simulator(board=board, full_system=False)

View File

@@ -45,7 +45,9 @@ requires(isa_required=ISA.RISCV)
board = RISCVMatchedBoard()
# obtain the RISC-V Vertical Microbenchmarks
microbenchmarks = obtain_resource("riscv-vertical-microbenchmarks")
microbenchmarks = obtain_resource(
"riscv-vertical-microbenchmarks", resource_version="1.0.0"
)
# list all the microbenchmarks present in the suite
print("Microbenchmarks present in the suite:")

View File

@@ -0,0 +1,97 @@
# Copyright (c) 2024 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Script that runs a SpatterGen test with a specific trace file.
This script can be used as an example on how to use SpatterGenerator,
SpatterKernel, and its utilities to run a Spatter trace in gem5.
The script uses a spatter trace taken from the hpcgarage github repository.
Link to the original trace file:
https://github.com/hpcgarage/spatter/blob/main/standard-suite/app-traces/amg.json
It will create a system with `num_cores` SpatterGenerators and interleave the
trace by `intlv_size` elements in the `pattern` field from the trace.
Interleaving is done to assign part of the accesses to each core.
Usage:
------
```
scons build/NULL/gem5.opt
./build/NULL/gem5.opt configs/example/gem5_library/spatter_gen/spatter-gen-test.py
```
"""
import argparse
import json
from pathlib import Path
import m5
from m5.objects import Root
from gem5.components.boards.test_board import TestBoard
from gem5.components.cachehierarchies.classic.private_l1_cache_hierarchy import (
PrivateL1CacheHierarchy,
)
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.spatter_gen import (
SpatterGenerator,
prepare_kernels,
)
from gem5.simulate.simulator import Simulator
# Number of SpatterGen cores, and the interleaving granularity (elements of
# the trace's `pattern` field assigned to each core chunk).
num_cores = 8
intlv_size = 128
memory = DualChannelDDR4_2400(size="8GiB")
generator = SpatterGenerator(
    processing_mode="synchronous", num_cores=num_cores
)
# Split the trace's kernels across the cores, interleaving every
# `intlv_size` elements. NOTE(review): the final two arguments appear to be
# an address offset (0) and a size bound (half of memory) — confirm against
# prepare_kernels' signature.
kernels = prepare_kernels(
    Path(__file__).parent / "traces/amg.json",
    num_cores,
    intlv_size,
    0,
    memory.get_size() // 2,
)
for kernel in kernels:
    generator.add_kernel(kernel)
# Drive the generator through private L1 caches into the DDR4 memory. No
# CPU/ISA is involved; this runs on the NULL build (see module docstring).
board = TestBoard(
    clk_freq="4GHz",
    generator=generator,
    cache_hierarchy=PrivateL1CacheHierarchy(
        l1d_size="32KiB", l1i_size="32KiB"
    ),
    memory=memory,
)
simulator = Simulator(board=board, full_system=False)
simulator.run()

View File

@@ -0,0 +1 @@
[{"delta": 1, "kernel": "Gather", "pattern": [1333, 0, 1, 2, 36, 37, 38, 72, 73, 74, 1296, 1297, 1298, 1332, 1334, 1368], "count": 1454647}, {"delta": 1, "kernel": "Gather", "pattern": [1333, 0, 1, 36, 37, 72, 73, 1296, 1297, 1332, 1368, 1369, 2592, 2593, 2628, 2629], "count": 1454647}]

View File

@@ -77,7 +77,9 @@ parser = argparse.ArgumentParser(
description="An example configuration script to run the gapbs benchmarks."
)
gapbs_suite = obtain_resource("gapbs-benchmark-suite")
gapbs_suite = obtain_resource(
"gapbs-benchmark-suite", resource_version="1.0.0"
)
# The only positional argument accepted is the benchmark name in this script.

View File

@@ -88,7 +88,7 @@ parser = argparse.ArgumentParser(
description="An example configuration script to run the npb benchmarks."
)
npb_suite = obtain_resource("npb-benchmark-suite")
npb_suite = obtain_resource("npb-benchmark-suite", resource_version="1.0.0")
# The only positional argument accepted is the benchmark name in this script.
parser.add_argument(

View File

@@ -185,10 +185,12 @@ board.set_kernel_disk_workload(
# The x86 linux kernel will be automatically downloaded to the
# `~/.cache/gem5` directory if not already present.
# PARSEC benchamarks were tested with kernel version 4.19.83
kernel=obtain_resource("x86-linux-kernel-4.19.83"),
kernel=obtain_resource(
"x86-linux-kernel-4.19.83", resource_version="1.0.0"
),
# The x86-parsec image will be automatically downloaded to the
# `~/.cache/gem5` directory if not already present.
disk_image=obtain_resource("x86-parsec"),
disk_image=obtain_resource("x86-parsec", resource_version="1.0.0"),
readfile_contents=command,
)

View File

@@ -121,7 +121,7 @@ command = (
+ "m5 exit;"
)
workload = obtain_resource("x86-ubuntu-18.04-boot")
workload = obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0")
workload.set_parameter("readfile_contents", command)
board.set_workload(workload)

View File

@@ -117,7 +117,7 @@ command = (
+ "m5 exit;"
)
workload = obtain_resource("x86-ubuntu-18.04-boot")
workload = obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0")
workload.set_parameter("readfile_contents", command)
board.set_workload(workload)

View File

@@ -55,7 +55,9 @@ board = X86DemoBoard()
# We then set the workload. Here we use the "x86-ubuntu-18.04-boot" workload.
# This boots Ubuntu 18.04 with Linux 5.4.49. If the required resources are not
# found locally, they will be downloaded.
board.set_workload(obtain_resource("x86-ubuntu-18.04-boot"))
board.set_workload(
obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0")
)
simulator = Simulator(board=board)
simulator.run()

View File

@@ -58,6 +58,8 @@ class Disjoint_VIPER(RubySystem):
self.network_cpu = DisjointSimple(self)
self.network_gpu = DisjointSimple(self)
self.block_size_bytes = options.cacheline_size
# Construct CPU controllers
cpu_dir_nodes = construct_dirs(options, system, self, self.network_cpu)
(cp_sequencers, cp_cntrl_nodes) = construct_corepairs(

View File

@@ -247,3 +247,9 @@ def addAmdGPUOptions(parser):
default="simple",
help="register allocation policy (simple/dynamic)",
)
parser.add_argument(
"--register-file-cache-size",
type=int,
default=0,
help="number of registers in cache",
)

View File

@@ -0,0 +1,159 @@
# Copyright (c) 2023 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import argparse
import base64
import os
import sys
import tempfile
import runfs
from amd import AmdGPUOptions
from common import (
GPUTLBOptions,
Options,
)
from ruby import Ruby
import m5
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx90a
free -m
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx90a
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
def addDemoOptions(parser):
    """Register the GPU demo options (-a/--app, -o/--opts) on ``parser``.

    :param parser: the argparse.ArgumentParser to extend.
    """
    demo_flags = [
        ("-a", "--app", None, "GPU application to run"),
        ("-o", "--opts", "", "GPU application arguments"),
    ]
    for short_flag, long_flag, default_value, help_text in demo_flags:
        parser.add_argument(
            short_flag, long_flag, default=default_value, help=help_text
        )
def runMI200GPUFS(cpu_type):
    """Configure and launch a full-system gem5 simulation with an MI200 dGPU.

    The host GPU application (``--app``) is base64-encoded into a generated
    runscript that recreates and runs the binary inside the simulated
    system. Several parsed options are then overridden with the values
    required for the MI200 (gfx90a) configuration before handing off to
    ``runfs.runGpuFSSystem``.

    :param cpu_type: the CPU model to simulate (e.g. "X86KvmCPU").
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Validate the required inputs up front so we fail fast with a clear
    # message rather than partway through system construction.
    if args.app is None:
        print(f"No application given. Use {sys.argv[0]} -a <app>")
        sys.exit(1)
    elif args.kernel is None:
        print(f"No kernel path given. Use {sys.argv[0]} --kernel <vmlinux>")
        sys.exit(1)
    elif args.disk_image is None:
        print(f"No disk path given. Use {sys.argv[0]} --disk-image <linux>")
        sys.exit(1)
    elif not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # Embed the application binary in the runscript as base64 so it can be
    # materialized inside the simulated system without a shared filesystem.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)

    if args.second_disk is None:
        args.second_disk = args.disk_image

    # Defaults for MI200
    args.ruby = True
    args.cpu_type = cpu_type
    args.mem_size = "8GB"  # CPU host memory
    args.dgpu = True
    args.dgpu_mem_size = "16GB"  # GPU device memory
    args.dgpu_start = "0GB"
    args.checkpoint_restore = 0
    args.disjoint = True
    args.timing_gpu = True
    args.script = tempRunscript
    args.dgpu_xor_low_bit = 0
    args.gpu_device = "MI200"

    # Run gem5
    runfs.runGpuFSSystem(args)


if __name__ == "__m5_main__":
    runMI200GPUFS("X86KvmCPU")

View File

@@ -0,0 +1,172 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
""" This file creates an X86 system with a KVM CPU and GPU device capable of
running the MI300 ISA (gfx942). Most of this file sets up a runscript which
will load in a binary, shell script, or python file from the host and run that
within gem5. Jump to line 146 for list of system parameters to configure.
"""
import argparse
import base64
import os
import sys
import tempfile
from typing import Optional
import runfs
from amd import AmdGPUOptions
from common import (
GPUTLBOptions,
Options,
)
from ruby import Ruby
import m5
from gem5.resources.resource import AbstractResource
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
cat /proc/cpuinfo
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
def addDemoOptions(parser):
    """Register the GPU demo options (-a/--app, -o/--opts) on ``parser``.

    :param parser: the argparse.ArgumentParser to extend.
    """
    demo_flags = [
        ("-a", "--app", None, "GPU application to run"),
        ("-o", "--opts", "", "GPU application arguments"),
    ]
    for short_flag, long_flag, default_value, help_text in demo_flags:
        parser.add_argument(
            short_flag, long_flag, default=default_value, help=help_text
        )
def runMI300GPUFS(
    cpu_type,
    disk: Optional[AbstractResource] = None,
    kernel: Optional[AbstractResource] = None,
    app: Optional[AbstractResource] = None,
):
    """Configure and launch a full-system gem5 simulation with an MI300X dGPU.

    The GPU application is base64-encoded into a generated runscript that
    recreates and runs the binary inside the simulated system. Parsed
    options are then overridden with the values required for the MI300X
    (gfx942) configuration before handing off to ``runfs.runGpuFSSystem``.

    :param cpu_type: the CPU model to simulate (e.g. "X86KvmCPU").
    :param disk: optional disk-image resource, overrides --disk-image.
    :param kernel: optional kernel resource, overrides --kernel.
    :param app: optional GPU-application resource, overrides --app.
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Resources passed in programmatically take precedence over the
    # corresponding command-line options.
    if disk is not None:
        args.disk_image = disk.get_local_path()
    if kernel is not None:
        args.kernel = kernel.get_local_path()
    if app is not None:
        args.app = app.get_local_path()

    # Create temp script to run application. Exit cleanly (instead of
    # raising) if no application was supplied via --app or `app`.
    if args.app is None or not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # Embed the application binary in the runscript as base64 so it can be
    # materialized inside the simulated system without a shared filesystem.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)
    args.script = tempRunscript

    # Defaults for CPU
    args.cpu_type = cpu_type
    args.mem_size = "8GB"

    # Defaults for MI300X
    args.gpu_device = "MI300X"
    args.dgpu_mem_size = "16GB"  # GPU memory size, must be 16GB currently.

    # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html
    # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded
    # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache.
    args.num_compute_units = 40
    args.gpu_topology = "Crossbar"

    # Run gem5
    runfs.runGpuFSSystem(args)


if __name__ == "__m5_main__":
    runMI300GPUFS("X86KvmCPU")

View File

@@ -134,23 +134,41 @@ def addRunFSOptions(parser):
parser.add_argument(
"--gpu-device",
default="Vega10",
choices=["Vega10", "MI100", "MI200"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
"MI200 (gfx90a)",
choices=["Vega10", "MI100", "MI200", "MI300X"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
"(gfx90a), or MI300X (gfx942).",
)
parser.add_argument(
"--debug-at-gpu-kernel",
"--debug-at-gpu-task",
type=int,
default=-1,
help="Turn on debug flags starting with this kernel",
help="Turn on debug flags starting with this task (counting both blit"
" and non-blit kernels)",
)
parser.add_argument(
"--exit-at-gpu-kernel",
"--exit-at-gpu-task",
type=int,
default=-1,
help="Exit simulation after running this many kernels",
help="Exit simulation after running this many tasks (counting both "
"blit and non-blit kernels)",
)
parser.add_argument(
"--exit-after-gpu-kernel",
type=int,
default=-1,
help="Exit simulation after completing this (non-blit) kernel",
)
parser.add_argument(
"--skip-until-gpu-kernel",
type=int,
default=0,
help="Skip (non-blit) kernels until reaching this kernel. Note that "
"this can impact correctness (the skipped kernels are completely "
"skipped, not fast forwarded)",
)
parser.add_argument(
@@ -177,6 +195,28 @@ def addRunFSOptions(parser):
help="Disable KVM perf counters (use this with LSF / ETX)",
)
parser.add_argument(
"--tcp-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcp",
)
parser.add_argument(
"--tcc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcc",
)
# sqc rp both changes sqc rp and scalar cache rp
parser.add_argument(
"--sqc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for sqc",
)
def runGpuFSSystem(args):
"""
@@ -230,8 +270,9 @@ def runGpuFSSystem(args):
print("Running the simulation")
sim_ticks = args.abs_max_tick
kernels_launched = 0
if args.debug_at_gpu_kernel != -1:
kernels_completed = 0
tasks_completed = 0
if args.debug_at_gpu_task != -1:
m5.trace.disable()
exit_event = m5.simulate(sim_ticks)
@@ -249,16 +290,27 @@ def runGpuFSSystem(args):
m5.checkpoint(args.checkpoint_dir)
break
elif "GPU Kernel Completed" in exit_event.getCause():
kernels_launched += 1
if kernels_completed == args.exit_after_gpu_kernel:
print(f"Exiting after GPU kernel {kernels_completed}")
break
kernels_completed += 1
tasks_completed += 1
elif "GPU Blit Kernel Completed" in exit_event.getCause():
tasks_completed += 1
elif "Skipping GPU Kernel" in exit_event.getCause():
print(f"Skipping GPU kernel {kernels_completed}")
kernels_completed += 1
tasks_completed += 1
else:
print(
f"Unknown exit event: {exit_event.getCause()}. Continuing..."
)
if kernels_launched == args.debug_at_gpu_kernel:
if tasks_completed == args.debug_at_gpu_task:
print(f"Enabling debug flags @ GPU task {tasks_completed}")
m5.trace.enable()
if kernels_launched == args.exit_at_gpu_kernel:
print(f"Exiting @ GPU kernel {kernels_launched}")
if tasks_completed == args.exit_at_gpu_task:
print(f"Exiting @ GPU task {tasks_completed}")
break
exit_event = m5.simulate(sim_ticks - m5.curTick())

View File

@@ -33,7 +33,10 @@ from m5.objects import *
def createGPU(system, args):
shader = Shader(
n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
timing=True,
clk_domain=system.clk_domain,
)
# VIPER GPU protocol implements release consistency at GPU side. So,
@@ -84,6 +87,7 @@ def createGPU(system, args):
vrfs = []
vrf_pool_mgrs = []
srfs = []
rfcs = []
srf_pool_mgrs = []
for j in range(args.simds_per_cu):
for k in range(shader.n_wf):
@@ -133,10 +137,16 @@ def createGPU(system, args):
num_regs=args.sreg_file_size,
)
)
rfcs.append(
RegisterFileCache(
simd_id=j, cache_size=args.register_file_cache_size
)
)
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
compute_units[-1].scalar_register_file = srfs
compute_units[-1].register_file_cache = rfcs
compute_units[-1].register_manager = RegisterManager(
policy=args.registerManagerPolicy,
vrf_pool_managers=vrf_pool_mgrs,
@@ -181,10 +191,14 @@ def connectGPU(system, args):
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "MI300X":
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "Vega10":
system.pc.south_bridge.gpu.DeviceID = 0x6863
else:
panic(f"Unknown GPU device: {args.gpu_device}")
m5.util.panic(f"Unknown GPU device: {args.gpu_device}")
# Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is
# a PCI capabilities list to travse.

View File

@@ -108,18 +108,26 @@ def makeGpuFSSystem(args):
system.cpu.append(shader)
# This arbitrary address is something in the X86 I/O hole
hsapp_gpu_map_paddr = 0xE00000000
hsapp_gpu_map_paddr = 0xE0000000
hsapp_pt_walker = VegaPagetableWalker()
gpu_hsapp = HSAPacketProcessor(
pioAddr=hsapp_gpu_map_paddr,
numHWQueues=args.num_hw_queues,
walker=hsapp_pt_walker,
)
dispatcher_exit_events = True if args.exit_at_gpu_kernel > -1 else False
dispatcher_exit_events = False
if args.exit_at_gpu_task > -1:
dispatcher_exit_events = True
if args.exit_after_gpu_kernel > -1:
dispatcher_exit_events = True
dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events)
cp_pt_walker = VegaPagetableWalker()
target_kernel = args.skip_until_gpu_kernel
gpu_cmd_proc = GPUCommandProcessor(
hsapp=gpu_hsapp, dispatcher=dispatcher, walker=cp_pt_walker
hsapp=gpu_hsapp,
dispatcher=dispatcher,
walker=cp_pt_walker,
target_non_blit_kernel_id=target_kernel,
)
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc
@@ -153,7 +161,7 @@ def makeGpuFSSystem(args):
0x7D000,
]
sdma_sizes = [0x1000] * 8
elif args.gpu_device == "MI200":
elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
num_sdmas = 5
sdma_bases = [
0x4980,
@@ -180,9 +188,15 @@ def makeGpuFSSystem(args):
system.pc.south_bridge.gpu.sdmas = sdma_engines
# Setup PM4 packet processor
pm4_pkt_proc = PM4PacketProcessor()
system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc
# Setup PM4 packet processors
pm4_procs = []
pm4_procs.append(
PM4PacketProcessor(
ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
)
)
system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs
# GPU data path
gpu_mem_mgr = AMDGPUMemoryManager()
@@ -199,7 +213,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
system._dma_ports.append(sdma)
system._dma_ports.append(device_ih)
system._dma_ports.append(pm4_pkt_proc)
for pm4_proc in pm4_procs:
system._dma_ports.append(pm4_proc)
system._dma_ports.append(system_hub)
system._dma_ports.append(gpu_mem_mgr)
system._dma_ports.append(hsapp_pt_walker)
@@ -213,7 +228,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
sdma.pio = system.iobus.mem_side_ports
device_ih.pio = system.iobus.mem_side_ports
pm4_pkt_proc.pio = system.iobus.mem_side_ports
for pm4_proc in pm4_procs:
pm4_proc.pio = system.iobus.mem_side_ports
system_hub.pio = system.iobus.mem_side_ports
# Full system needs special TLBs for SQC, Scalar, and vector data ports
@@ -247,7 +263,7 @@ def makeGpuFSSystem(args):
0x00000340,
0x00000000,
0x00000340,
0x0000000F,
0x00000000,
0x00000340,
0x00000000,
0x00000000,
@@ -265,7 +281,7 @@ def makeGpuFSSystem(args):
# See: https://sandpile.org/x86/cpuid.htm#level_0000_0001h
# Enables AVX, OSXSAVE, XSAVE, POPCNT, SSE4.2, SSE4.1, CMPXCHG16B,
# and FMA.
avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C983209]
avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C803209]
for i, cpu in enumerate(system.cpu):
# Break once we reach the shader "CPU"

View File

@@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0
modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp

View File

@@ -243,7 +243,7 @@ def createVegaTopology(options):
file_append((node_dir, "properties"), node_prop)
# Fiji HBM reporting
# Vega HBM reporting
# TODO: Extract size, clk, and width from sim parameters
mem_dir = joinpath(node_dir, "mem_banks/0")
remake_dir(mem_dir)
@@ -260,196 +260,7 @@ def createVegaTopology(options):
file_append((mem_dir, "properties"), mem_prop)
# This fakes out a dGPU setup so the runtime operates correctly. The spoofed
# system has a single dGPU and a single socket CPU. Note that more complex
# topologies (multi-GPU, multi-socket CPUs) need to have a different setup
# here or the runtime won't be able to issue Memcpies from one node to another.
#
# TODO: There is way too much hardcoded here. It doesn't effect anything in
# our current ROCm stack (1.6), but it is highly possible that it will in the
# future. We might need to scrub through this and extract the appropriate
# fields from the simulator in the future.
def createFijiTopology(options):
topology_dir = joinpath(
m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology"
)
remake_dir(topology_dir)
amdgpu_dir = joinpath(m5.options.outdir, "fs/sys/module/amdgpu/parameters")
remake_dir(amdgpu_dir)
# Fiji reported VM size in GB. Used to reserve an allocation from CPU
# to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree)
file_append((amdgpu_dir, "vm_size"), 256)
# Ripped from real Fiji platform to appease KMT version checks
file_append((topology_dir, "generation_id"), 2)
# Set up system properties. Regiter as ast-rocm server
sys_prop = (
"platform_oem 35498446626881\n"
+ "platform_id 71791775140929\n"
+ "platform_rev 2\n"
)
file_append((topology_dir, "system_properties"), sys_prop)
# Populate the topology tree
# Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU
node_dir = joinpath(topology_dir, "nodes/0")
remake_dir(node_dir)
# Register as a CPU
file_append((node_dir, "gpu_id"), 0)
file_append((node_dir, "name"), "")
# CPU links. Only thing that matters is we tell the runtime that GPU is
# connected through PCIe to CPU socket 0.
io_links = 1
io_dir = joinpath(node_dir, "io_links/0")
remake_dir(io_dir)
io_prop = (
"type 2\n"
+ "version_major 0\n"
+ "version_minor 0\n"
+ "node_from 0\n"
+ "node_to 1\n"
+ "weight 20\n"
+ "min_latency 0\n"
+ "max_latency 0\n"
+ "min_bandwidth 0\n"
+ "max_bandwidth 0\n"
+ "recommended_transfer_size 0\n"
+ "flags 13\n"
)
file_append((io_dir, "properties"), io_prop)
# Populate CPU node properties
node_prop = (
f"cpu_cores_count {options.num_cpus}\n"
+ "simd_count 0\n"
+ "mem_banks_count 1\n"
+ "caches_count 0\n"
+ f"io_links_count {io_links}\n"
+ "cpu_core_id_base 0\n"
+ "simd_id_base 0\n"
+ "max_waves_per_simd 0\n"
+ "lds_size_in_kb 0\n"
+ "gds_size_in_kb 0\n"
+ "wave_front_size 64\n"
+ "array_count 0\n"
+ "simd_arrays_per_engine 0\n"
+ "cu_per_simd_array 0\n"
+ "simd_per_cu 0\n"
+ "max_slots_scratch_cu 0\n"
+ "vendor_id 0\n"
+ "device_id 0\n"
+ "location_id 0\n"
+ "drm_render_minor 0\n"
+ "max_engine_clk_ccompute 3400\n"
)
file_append((node_dir, "properties"), node_prop)
# CPU memory reporting
mem_dir = joinpath(node_dir, "mem_banks/0")
remake_dir(mem_dir)
# Heap type value taken from real system, heap type values:
# https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
mem_prop = (
"heap_type 0\n"
+ "size_in_bytes 33704329216\n"
+ "flags 0\n"
+ "width 72\n"
+ "mem_clk_max 2400\n"
)
file_append((mem_dir, "properties"), mem_prop)
# Build the GPU node
node_dir = joinpath(topology_dir, "nodes/1")
remake_dir(node_dir)
# Register as a Fiji
file_append((node_dir, "gpu_id"), 50156)
file_append((node_dir, "name"), "Fiji\n")
# Should be the same as the render driver filename (dri/renderD<drm_num>)
drm_num = 128
# Real Fiji shows 96, but building that topology is complex and doesn't
# appear to be required for anything.
caches = 0
# GPU links. Only thing that matters is we tell the runtime that GPU is
# connected through PCIe to CPU socket 0.
io_links = 1
io_dir = joinpath(node_dir, "io_links/0")
remake_dir(io_dir)
io_prop = (
"type 2\n"
+ "version_major 0\n"
+ "version_minor 0\n"
+ "node_from 1\n"
+ "node_to 0\n"
+ "weight 20\n"
+ "min_latency 0\n"
+ "max_latency 0\n"
+ "min_bandwidth 0\n"
+ "max_bandwidth 0\n"
+ "recommended_transfer_size 0\n"
+ "flags 1\n"
)
file_append((io_dir, "properties"), io_prop)
# Populate GPU node properties
node_prop = (
"cpu_cores_count 0\n"
+ f"simd_count {options.num_compute_units * options.simds_per_cu}\n"
+ "mem_banks_count 1\n"
+ f"caches_count {caches}\n"
+ f"io_links_count {io_links}\n"
+ "cpu_core_id_base 0\n"
+ "simd_id_base 2147487744\n"
+ f"max_waves_per_simd {options.wfs_per_simd}\n"
+ f"lds_size_in_kb {int(options.lds_size / 1024)}\n"
+ "gds_size_in_kb 0\n"
+ f"wave_front_size {options.wf_size}\n"
+ "array_count 4\n"
+ f"simd_arrays_per_engine {options.sa_per_complex}\n"
+ f"cu_per_simd_array {options.cu_per_sa}\n"
+ f"simd_per_cu {options.simds_per_cu}\n"
+ "max_slots_scratch_cu 32\n"
+ "vendor_id 4098\n"
+ "device_id 29440\n"
+ "location_id 512\n"
+ f"drm_render_minor {drm_num}\n"
+ f"max_engine_clk_fcompute {int(toFrequency(options.gpu_clock) / 1000000.0)}\n"
+ "local_mem_size 4294967296\n"
+ "fw_version 730\n"
+ "capability 4736\n"
+ f"max_engine_clk_ccompute {int(toFrequency(options.CPUClock) / 1000000.0)}\n"
)
file_append((node_dir, "properties"), node_prop)
# Fiji HBM reporting
# TODO: Extract size, clk, and width from sim paramters
mem_dir = joinpath(node_dir, "mem_banks/0")
remake_dir(mem_dir)
# Heap type value taken from real system, heap type values:
# https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
mem_prop = (
"heap_type 1\n"
+ "size_in_bytes 4294967296\n"
+ "flags 0\n"
+ "width 4096\n"
+ "mem_clk_max 500\n"
)
file_append((mem_dir, "properties"), mem_prop)
def createCarrizoTopology(options):
def createRavenTopology(options):
topology_dir = joinpath(
m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology"
)
@@ -476,7 +287,6 @@ def createCarrizoTopology(options):
file_append((node_dir, "gpu_id"), 2765)
gfx_dict = {
"gfx801": {"name": "Carrizo\n", "id": 39028},
"gfx902": {"name": "Raven\n", "id": 5597},
}

View File

@@ -49,8 +49,8 @@ from gem5.utils.requires import requires
# Run a check to ensure the right version of gem5 is being used.
requires(isa_required=ISA.RISCV)
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
parser = argparse.ArgumentParser(description="Runs Linux fs test with RISCV.")
@@ -72,7 +72,7 @@ parser.add_argument(
args = parser.parse_args()
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
)
@@ -98,8 +98,12 @@ board = LupvBoard(
# Set the Full System workload.
board.set_kernel_disk_workload(
kernel=obtain_resource("riscv-lupio-linux-kernel"),
disk_image=obtain_resource("riscv-lupio-busybox-img"),
kernel=obtain_resource(
"riscv-lupio-linux-kernel", resource_version="1.0.0"
),
disk_image=obtain_resource(
"riscv-lupio-busybox-img", resource_version="1.0.0"
),
)

View File

@@ -145,7 +145,17 @@ Options.addFSOptions(parser)
parser.add_argument(
"--virtio-rng", action="store_true", help="Enable VirtIORng device"
)
parser.add_argument(
"--semihosting",
action="store_true",
help="Enable the RISC-V semihosting interface",
)
parser.add_argument(
"--semihosting-root",
default="/some/invalid/root/directory",
type=str,
help="The root directory for files exposed to semihosting",
)
# ---------------------------- Parse Options --------------------------- #
args = parser.parse_args()
@@ -168,11 +178,17 @@ mdesc = SysConfig(
system.mem_mode = mem_mode
system.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())]
workload_args = dict()
if args.semihosting:
workload_args["semihosting"] = RiscvSemihosting(
files_root_dir=args.semihosting_root,
cmd_line=args.kernel,
)
if args.bare_metal:
system.workload = RiscvBareMetal()
system.workload = RiscvBareMetal(**workload_args)
system.workload.bootloader = args.kernel
else:
system.workload = RiscvLinux()
system.workload = RiscvLinux(**workload_args)
system.workload.object_file = args.kernel
system.iobus = IOXBar()

View File

@@ -59,7 +59,7 @@ nvm_generators = {"NVM": lambda x: x.createNvm}
# Use a single-channel DDR3-1600 x64 (8x8 topology) by default
parser.add_argument(
"--nvm-type",
"--mem-type",
default="NVM_2400_1x64",
choices=ObjectList.mem_list.get_names(),
help="type of memory to use",
@@ -212,7 +212,7 @@ def trace():
nbr_banks,
bank,
addr_map,
args.dram_ranks,
args.nvm_ranks,
)
yield system.tgen.createExit(0)

View File

@@ -143,7 +143,7 @@ MemConfig.config_mem(args, system)
# the following assumes that we are using the native controller
# with NVM and DRAM interfaces, check to be sure
if not isinstance(system.mem_ctrls[0], m5.objects.HeteroMemCtrl):
if not isinstance(system.mem_ctrls[0], m5.objects.MemCtrl):
fatal("This script assumes the controller is a HeteroMemCtrl subclass")
if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
fatal("This script assumes the first memory is a DRAMInterface subclass")

View File

@@ -149,7 +149,8 @@ class TCPCache(RubyCache):
self.size = MemorySize(options.tcp_size)
self.assoc = options.tcp_assoc
self.resourceStalls = options.no_tcc_resource_stalls
self.replacement_policy = TreePLRURP()
if hasattr(options, "tcp_rp"):
self.replacement_policy = RP_choose(options.tcp_rp)
class TCPCntrl(TCP_Controller, CntrlBase):
@@ -241,7 +242,8 @@ class SQCCache(RubyCache):
def create(self, options):
self.size = MemorySize(options.sqc_size)
self.assoc = options.sqc_assoc
self.replacement_policy = TreePLRURP()
if hasattr(options, "sqc_rp"):
self.replacement_policy = RP_choose(options.sqc_rp)
class SQCCntrl(SQC_Controller, CntrlBase):
@@ -303,7 +305,8 @@ class TCC(RubyCache):
self.start_index_bit = math.log(options.cacheline_size, 2) + math.log(
options.num_tccs, 2
)
self.replacement_policy = TreePLRURP()
if hasattr(options, "tcc_rp"):
self.replacement_policy = RP_choose(options.tcc_rp)
class TCCCntrl(TCC_Controller, CntrlBase):
@@ -497,13 +500,6 @@ def define_options(parser):
parser.add_argument(
"--noL1", action="store_true", default=False, help="bypassL1"
)
parser.add_argument(
"--scalar-buffer-size",
type=int,
default=128,
help="Size of the mandatory queue in the GPU scalar "
"cache controller",
)
parser.add_argument(
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
)
@@ -841,9 +837,7 @@ def construct_scalars(options, system, ruby_system, network):
scalar_cntrl.responseToSQC = MessageBuffer(ordered=True)
scalar_cntrl.responseToSQC.in_port = network.out_port
scalar_cntrl.mandatoryQueue = MessageBuffer(
buffer_size=options.scalar_buffer_size
)
scalar_cntrl.mandatoryQueue = MessageBuffer()
return (scalar_sequencers, scalar_cntrl_nodes)
@@ -1133,3 +1127,28 @@ def create_system(
ruby_system.network.number_of_virtual_networks = 11
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
def RP_choose(test_name):
    """Return a new replacement-policy SimObject for the given policy name.

    :param test_name: Name of the replacement-policy class, e.g. "LRURP".
    :returns: A freshly constructed instance of the requested policy.
    :raises ValueError: If ``test_name`` is not a recognized policy name.
        (The previous if/elif ladder fell through and raised an opaque
        ``UnboundLocalError`` for unknown names.)
    """
    # Map policy names to their SimObject classes. Instantiation happens on
    # lookup so every call hands back a distinct object.
    policies = {
        "TreePLRURP": TreePLRURP,
        "LRURP": LRURP,
        "FIFORP": FIFORP,
        "LFURP": LFURP,
        "LIPRP": LIPRP,
        "MRURP": MRURP,
        "NRURP": NRURP,
        "RRIPRP": RRIPRP,
        "SecondChanceRP": SecondChanceRP,
        "SHiPMemRP": SHiPMemRP,
    }
    try:
        return policies[test_name]()
    except KeyError:
        raise ValueError(f"Unknown replacement policy: {test_name}")

View File

@@ -47,6 +47,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
| the types below may, if desired, be defined as aliases for the native types
| (typically 'float' and 'double', and possibly 'long double').
*----------------------------------------------------------------------------*/
typedef struct { uint8_t v; } float8_t;
typedef struct { uint16_t v; } float16_t;
typedef struct { uint32_t v; } float32_t;
typedef struct { uint64_t v; } float64_t;

View File

@@ -25,13 +25,16 @@
import os
from m5.util.terminal import get_termcap
import gem5_scons
import sys
Import('env')
systemc = env.Clone()
build_root = Dir('.').abspath
src_root = Dir('.').srcdir.abspath
gem5_root = Dir('#../..').srcnode().abspath
sys.path.append(os.path.join(gem5_root, 'site_scons'))
import gem5_scons
systemc.Prepend(CPPPATH=Dir('./src').srcnode())
systemc.Prepend(CPATH=Dir('./src'))

View File

@@ -245,7 +245,6 @@ def define_constants(constants):
constants.isa_tag_type = "isa"
constants.x86_tag = "X86"
constants.gcn3_x86_tag = "GCN3_X86"
constants.vega_x86_tag = "VEGA_X86"
constants.sparc_tag = "SPARC"
constants.riscv_tag = "RISCV"
@@ -274,7 +273,6 @@ def define_constants(constants):
constants.supported_tags = {
constants.isa_tag_type: (
constants.x86_tag,
constants.gcn3_x86_tag,
constants.vega_x86_tag,
constants.sparc_tag,
constants.riscv_tag,
@@ -305,7 +303,6 @@ def define_constants(constants):
constants.target_host = {
constants.arm_tag: (constants.host_arm_tag,),
constants.x86_tag: (constants.host_x86_64_tag,),
constants.gcn3_x86_tag: (constants.host_x86_64_tag,),
constants.vega_x86_tag: (constants.host_x86_64_tag,),
constants.sparc_tag: (constants.host_x86_64_tag,),
constants.riscv_tag: (constants.host_x86_64_tag,),

View File

@@ -1 +1 @@
tqdm==4.64.1
tqdm==4.66.4

View File

@@ -1,2 +1,2 @@
mypy==1.5.1
pre-commit==2.20.0
mypy==1.10.0
pre-commit==3.7.1

View File

@@ -59,13 +59,15 @@ def CheckCxxFlag(context, flag, autoadd=True):
return ret
def CheckLinkFlag(context, flag, autoadd=True, set_for_shared=True):
def CheckLinkFlag(context, flag, autoadd=True, set_for_shared=True, code=None):
context.Message(f"Checking for linker {flag} support... ")
last_linkflags = context.env["LINKFLAGS"]
context.env.Append(LINKFLAGS=[flag])
pre_werror = context.env["LINKFLAGS"]
context.env.Append(LINKFLAGS=["-Werror"])
ret = context.TryLink("int main(int, char *[]) { return 0; }", ".cc")
if not code:
code = "int main(int, char *[]) { return 0; }"
ret = context.TryLink(code, ".cc")
context.env["LINKFLAGS"] = pre_werror
if not (ret and autoadd):
context.env["LINKFLAGS"] = last_linkflags

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = gem5
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = v23.1.0.0
PROJECT_NUMBER = v24.0.0.0
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

View File

@@ -51,3 +51,4 @@ rsource "arch/Kconfig"
rsource "cpu/Kconfig"
rsource "systemc/Kconfig"
rsource "gpu-compute/Kconfig"
rsource "test_objects/Kconfig"

View File

@@ -68,7 +68,7 @@ if env['CONF']['BUILD_ISA']:
error("At least one ISA need to be set")
amdgpu_isa = ['gcn3', 'vega']
amdgpu_isa = ['vega']
if env['CONF']['BUILD_GPU']:
env.SwitchingHeaders(

View File

@@ -29,5 +29,4 @@ prompt "GPU ISA"
endchoice
endif
rsource "gcn3/Kconfig"
rsource "vega/Kconfig"

View File

@@ -34,7 +34,7 @@ Import('*')
if not env['CONF']['BUILD_GPU']:
Return()
if env['CONF']['TARGET_GPU_ISA'] in ('gcn3', 'vega'):
if env['CONF']['TARGET_GPU_ISA'] in ('vega'):
SimObject('X86GPUTLB.py', sim_objects=['X86GPUTLB', 'TLBCoalescer'])
Source('tlb.cc')

View File

@@ -0,0 +1,21 @@
# Microscaling Formats
This directory defines [microscaling formats](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) which are reduced precision floating point formats.
The class makes some assumptions to simplify things and is not completely generic.
For example:
- Types must be smaller than 32-bits.
- Type conversions currently assume that either:
- The destination format exponent and mantissa bits are both greater or equal to the source format.
- OR the destination format exponent and mantissa are both less than or equal to the source format.
    - In other words, one type cannot have a larger exponent and a smaller mantissa, and vice versa.
- Basic MX operations are implementation defined, meaning MX types can be converted to FP32 for arithmetic
- This means that arithmetic operators need not be defined for MX types.
- A value whose exponent and mantissa are both zero is zero. There is no special case for the sign (i.e., -0 is not special).
- The spec does not differentiate between signaling and quiet NaN, therefore quiet NaN is used.
- New types must template specialize the following standard library methods:
- isinf(T)
- isnan(T)
- isnormal(T)
- New types must template specialize the following std::numeric_limits<T> members / methods:
- has_infinity / infinity()
- has_quiet_NaN / quiet_NaN()

View File

@@ -1,6 +1,4 @@
# -*- mode:python -*-
# Copyright (c) 2015, 2017 Advanced Micro Devices, Inc.
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,4 +29,4 @@
Import('*')
main.Append(ALL_GPU_ISAS=['gcn3'])
GTest('mxfp.test', 'mxfp.test.cc')

View File

@@ -0,0 +1,113 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__
namespace gem5
{
namespace AMDGPU
{
// Same as IEEE 754 binary 32 - Microscaling types are converted to/from
// this format by default. For now as there do not seem to be any MI300
// instructions operating directly on the types (i.e., they all cast to FP32
// first and then perform arithmetic operations).
typedef union binary32_u
{
    // Field widths, bias, and canonical bit patterns for IEEE 754 binary32.
    enum bitSizes
    {
        ebits = 8,          // exponent width in bits
        mbits = 23,         // mantissa (fraction) width in bits
        sbits = 1,          // sign width in bits
        bias = 127,         // exponent bias
        inf = 0x7f800000,   // +infinity: exponent all ones, mantissa zero
        // A NaN pattern: exponent all ones, mantissa nonzero.
        // NOTE(review): the mantissa MSB is clear, so on IEEE hardware this
        // encodes a *signaling* NaN, while numeric_limits below hands out
        // float's quiet NaN — confirm the enum value is intentional.
        nan = 0x7f800100,
        max = 0x7f7fffff    // largest finite value (FLT_MAX)
    };
    // All views alias the same 32 bits: raw word, native float, and the
    // sign/exponent/mantissa fields (declared LSB-first, which matches the
    // ABIs gem5 targets).
    uint32_t storage;
    float fp32;
    struct
    {
        unsigned mant : 23;
        unsigned exp : 8;
        unsigned sign : 1;
    };
    // To help with stdlib functions with T = float.
    operator float() const
    {
        return fp32;
    }
} binary32;
static_assert(sizeof(binary32) == 4);
} // namespace AMDGPU
} // namespace gem5
// Specialize numeric_limits so generic MX conversion code can query
// quiet_NaN/infinity/max uniformly for this type; values are forwarded from
// numeric_limits<float> rather than the enum patterns above.
namespace std
{
template<>
class numeric_limits<gem5::AMDGPU::binary32>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Return float's quiet NaN wrapped in a binary32.
    static gem5::AMDGPU::binary32 quiet_NaN()
    {
        gem5::AMDGPU::binary32 tmp;
        tmp.fp32 = std::numeric_limits<float>::quiet_NaN();
        return tmp;
    }
    static constexpr bool has_infinity = true;
    // Return float's +infinity wrapped in a binary32.
    static gem5::AMDGPU::binary32 infinity()
    {
        gem5::AMDGPU::binary32 tmp;
        tmp.fp32 = std::numeric_limits<float>::infinity();
        return tmp;
    }
    // Return FLT_MAX wrapped in a binary32.
    static gem5::AMDGPU::binary32 max()
    {
        gem5::AMDGPU::binary32 tmp;
        tmp.fp32 = std::numeric_limits<float>::max();
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the FP16 (IEEE half precision, E5M10)
// microscaling format. The 16-bit value sits in the upper half of the
// 32-bit storage word; the low 16 'zero' bits are padding so the canonical
// patterns below can be written as 32-bit constants. The bit-fields are
// declared LSB-first, which matches the ABIs gem5 targets.
typedef union
{
    enum bitSizes
    {
        ebits = 5,          // exponent width in bits
        mbits = 10,         // mantissa width in bits
        sbits = 1,          // sign width in bits
        zbits = 16,         // low-order padding bits in 'storage'
        bias = 15,          // exponent bias
        inf = 0x7c000000,   // +infinity: exponent all ones, mantissa zero
        nan = 0x7c100000,   // a NaN: exponent all ones, mantissa nonzero
        max = 0x7bff0000    // largest finite value (65504.0)
    };
    uint32_t storage;
    struct
    {
        unsigned zero : zbits;
        unsigned mant : mbits;
        unsigned exp : ebits;
        unsigned sign : sbits;
    };
} fp16_e5m10_info;
static_assert(sizeof(fp16_e5m10_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// Infinity: exponent all ones (0x1F) with a zero mantissa.
constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a)
{
    return a.exp == 0x1F && a.mant == 0;
}
// NaN: exponent all ones (0x1F) with a nonzero mantissa.
constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a)
{
    return a.exp == 0x1F && a.mant != 0;
}
// Rejects only subnormals (exp == 0, mant != 0). NOTE: unlike
// std::isnormal(float), this returns true for zero, infinity, and NaN.
constexpr bool isnormal(gem5::AMDGPU::fp16_e5m10_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// numeric_limits specialization so generic MX conversion code can query
// the canonical quiet_NaN/infinity/max bit patterns for this type.
template<>
class numeric_limits<gem5::AMDGPU::fp16_e5m10_info>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Canonical quiet NaN bit pattern (the spec does not distinguish
    // quiet from signaling NaN).
    static gem5::AMDGPU::fp16_e5m10_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp16_e5m10_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e5m10_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = true;
    // Canonical +infinity bit pattern.
    static gem5::AMDGPU::fp16_e5m10_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp16_e5m10_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e5m10_info::inf;
        return tmp;
    }
    // Largest finite value (65504.0).
    static gem5::AMDGPU::fp16_e5m10_info max()
    {
        gem5::AMDGPU::fp16_e5m10_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e5m10_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the BF16 (E8M7) microscaling format: the same
// exponent range as binary32 with a truncated 7-bit mantissa. The 16-bit
// value sits in the upper half of the 32-bit storage word; the low 16
// 'zero' bits are padding so the canonical patterns below can be written
// as 32-bit constants. The bit-fields are declared LSB-first, which
// matches the ABIs gem5 targets.
typedef union
{
    enum bitSizes
    {
        ebits = 8,          // exponent width in bits
        mbits = 7,          // mantissa width in bits
        sbits = 1,          // sign width in bits
        zbits = 16,         // low-order padding bits in 'storage'
        bias = 127,         // exponent bias (same as binary32)
        inf = 0x7f800000,   // +infinity: exponent all ones, mantissa zero
        nan = 0x7f810000,   // a NaN: exponent all ones, mantissa nonzero
        max = 0x7f7f0000    // largest finite value (~3.39e38)
    };
    uint32_t storage;
    struct
    {
        unsigned zero : zbits;
        unsigned mant : mbits;
        unsigned exp : ebits;
        unsigned sign : sbits;
    };
} fp16_e8m7_info;
static_assert(sizeof(fp16_e8m7_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// Infinity: exponent all ones (0xFF) with a zero mantissa.
constexpr bool isinf(gem5::AMDGPU::fp16_e8m7_info a)
{
    return a.exp == 0xFF && a.mant == 0;
}
// NaN: exponent all ones (0xFF) with a nonzero mantissa.
constexpr bool isnan(gem5::AMDGPU::fp16_e8m7_info a)
{
    return a.exp == 0xFF && a.mant != 0;
}
// Rejects only subnormals (exp == 0, mant != 0). NOTE: unlike
// std::isnormal(float), this returns true for zero, infinity, and NaN.
constexpr bool isnormal(gem5::AMDGPU::fp16_e8m7_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// numeric_limits specialization so generic MX conversion code can query
// the canonical quiet_NaN/infinity/max bit patterns for this type.
template<>
class numeric_limits<gem5::AMDGPU::fp16_e8m7_info>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Canonical quiet NaN bit pattern (the spec does not distinguish
    // quiet from signaling NaN).
    static gem5::AMDGPU::fp16_e8m7_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp16_e8m7_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e8m7_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = true;
    // Canonical +infinity bit pattern.
    static gem5::AMDGPU::fp16_e8m7_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp16_e8m7_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e8m7_info::inf;
        return tmp;
    }
    // Largest finite BF16 value.
    static gem5::AMDGPU::fp16_e8m7_info max()
    {
        gem5::AMDGPU::fp16_e8m7_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e8m7_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__

View File

@@ -0,0 +1,124 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the FP8 E4M3 microscaling format (OCP MX spec).
// The 8-bit value sits in the top byte of the 32-bit storage word; the low
// 24 'zero' bits are padding so the canonical patterns below can be written
// as 32-bit constants. E4M3 has no infinity; the only NaN encoding is
// exponent all ones with mantissa all ones. The bit-fields are declared
// LSB-first, which matches the ABIs gem5 targets.
typedef union
{
    enum bitSizes
    {
        ebits = 4,              // exponent width in bits
        mbits = 3,              // mantissa width in bits
        sbits = 1,              // sign width in bits
        zbits = 24,             // low-order padding bits in 'storage'
        bias = 7,               // exponent bias
        inf = (0x7f << zbits),  // E4M3 has no infinity; kept only for
                                // interface symmetry with the other dtypes
        nan = (0xff << zbits),  // NaN: exp == 0xF, mant == 0x7 (sign set)
        // Largest finite value (448.0): exp == 0xF, mant == 0x6.
        // BUGFIX: this was (0x7f << zbits), which is the NaN encoding, so
        // numeric_limits::max() satisfied isnan(). Per the OCP MX spec the
        // E4M3 maximum is S.1111.110 = 0x7e.
        max = (0x7e << zbits)
    };
    uint32_t storage;
    struct
    {
        unsigned zero : zbits;
        unsigned mant : mbits;
        unsigned exp : ebits;
        unsigned sign : sbits;
    };
} fp8_e4m3_info;
static_assert(sizeof(fp8_e4m3_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// Inf not defined for E4M3 -- always false.
constexpr bool isinf(gem5::AMDGPU::fp8_e4m3_info a) { return false; }
// NaN: exponent and mantissa all ones (the sole NaN encoding in E4M3).
constexpr bool isnan(gem5::AMDGPU::fp8_e4m3_info a)
{
    return a.exp == 0xF && a.mant == 0x7;
}
// Rejects only subnormals (exp == 0, mant != 0). NOTE: unlike
// std::isnormal(float), this returns true for zero and NaN.
constexpr bool isnormal(gem5::AMDGPU::fp8_e4m3_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// numeric_limits specialization so generic MX conversion code can query
// the canonical quiet_NaN/max bit patterns for this type.
template<>
class numeric_limits<gem5::AMDGPU::fp8_e4m3_info>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Canonical quiet NaN bit pattern (the spec does not distinguish
    // quiet from signaling NaN).
    static gem5::AMDGPU::fp8_e4m3_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp8_e4m3_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e4m3_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = false;
    // Must not be called: E4M3 has no infinity (asserts in debug builds).
    static gem5::AMDGPU::fp8_e4m3_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp8_e4m3_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e4m3_info::inf;
        return tmp;
    }
    // Largest finite value (448.0).
    static gem5::AMDGPU::fp8_e4m3_info max()
    {
        gem5::AMDGPU::fp8_e4m3_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e4m3_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the 8-bit E5M2 floating-point format
// (1 sign, 5 exponent, 2 mantissa bits). The value is stored
// left-aligned in a 32-bit word: bits 31:24 hold the FP8 value and
// bits 23:0 are zero padding.
typedef union
{
    enum bitSizes
    {
        ebits = 5,   // exponent field width
        mbits = 2,   // mantissa field width
        sbits = 1,   // sign field width
        zbits = 24,  // low-order zero padding width
        bias = 15,   // exponent bias
        // Key encodings, expressed as full left-aligned storage
        // patterns. Unsigned literals are required: 0xff << 24 does not
        // fit in a signed int.
        inf = (0x7cU << zbits),  // s=0 exp=11111 mant=00
        nan = (0xffU << zbits),  // s=1 exp=11111 mant=11 (a NaN)
        // Largest finite value: s=0 exp=11110 mant=11 (= 57344.0).
        // Exponent all-ones patterns such as 0x7f are Inf/NaN encodings
        // and must not be used as max.
        max = (0x7bU << zbits)
    };
    uint32_t storage;  // raw 32-bit view
    struct
    {
        unsigned zero : zbits;  // always-zero padding
        unsigned mant : mbits;  // mantissa
        unsigned exp : ebits;   // biased exponent
        unsigned sign : sbits;  // sign
    };
} fp8_e5m2_info;
static_assert(sizeof(fp8_e5m2_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// std::isinf overload for fp8_e5m2_info: exponent all ones with a zero
// mantissa encodes +/- infinity.
constexpr bool isinf(gem5::AMDGPU::fp8_e5m2_info a)
{
    return a.exp == 0x1F && a.mant == 0x0;
}
// std::isnan overload: exponent all ones with a non-zero mantissa.
constexpr bool isnan(gem5::AMDGPU::fp8_e5m2_info a)
{
    return a.exp == 0x1F && a.mant != 0x0;
}
// "Normal" here only excludes subnormals (exp == 0 with non-zero
// mantissa). NOTE(review): unlike std::isnormal this returns true for
// zero, Inf and NaN -- confirm callers rely only on the subnormal test.
constexpr bool isnormal(gem5::AMDGPU::fp8_e5m2_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// std::numeric_limits specialization for fp8_e5m2_info. Only the
// members queried by the MXFP conversion code are provided; unlike the
// primary template, the value accessors are not constexpr -- they
// assemble the bit pattern in a union at run time.
template<>
class numeric_limits<gem5::AMDGPU::fp8_e5m2_info>
{
  public:
    // E5M2 reserves exponent all-ones encodings for Inf/NaN.
    static constexpr bool has_quiet_NaN = true;
    static gem5::AMDGPU::fp8_e5m2_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp8_e5m2_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e5m2_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = true;
    static gem5::AMDGPU::fp8_e5m2_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp8_e5m2_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e5m2_info::inf;
        return tmp;
    }
    // Returns the fp8_e5m2_info::max bit pattern; used by convertMXFP
    // as the saturation value when the destination lacks Inf/NaN.
    static gem5::AMDGPU::fp8_e5m2_info max()
    {
        gem5::AMDGPU::fp8_e5m2_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e5m2_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__

View File

@@ -0,0 +1,329 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__
#include <cmath>
#include <cstdint>
#include <iostream>
#include "arch/amdgpu/common/dtype/mxfp_convert.hh"
namespace gem5
{
namespace AMDGPU
{
// Base class for all microscaling types. The sizes of everything are
// determined by the enum fields in the FMT struct. All of these share the
// same operator overloads which convert to float before arithmetic and
// convert back if assigned to a microscaling type.
template<typename FMT>
class mxfp
{
  public:
    mxfp() = default;

    // Construct from a host float using round-to-nearest-even.
    mxfp(float f) : mode(roundTiesToEven)
    {
        data = float_to_mxfp(f);
    }

    // Set raw bits, used by gem5 to set a raw value read from VGPRs.
    mxfp(const uint32_t& raw)
    {
        // The info unions end up being "left" aligned. For example, in FP4
        // only the bits 31:28 are used. Shift the input by the storage size
        // of 32 by the type size (sign + exponent + mantissa bits).
        data = raw;
        data <<= (32 - int(FMT::sbits) - int(FMT::ebits) - int(FMT::mbits));
    }

    mxfp(const mxfp& f)
    {
        FMT conv_out;
        conv_out = convertMXFP<FMT, decltype(f.getFmt())>(f.getFmt());
        data = conv_out.storage;
    }

    mxfp&
    operator=(const float& f)
    {
        data = float_to_mxfp(f);
        return *this;
    }

    mxfp&
    operator=(const mxfp& f)
    {
        FMT conv_out;
        conv_out = convertMXFP<FMT, decltype(f.getFmt())>(f.getFmt());
        data = conv_out.storage;
        return *this;
    }

    // Widen to an IEEE-754 binary32 using this value's rounding mode.
    operator float() const
    {
        binary32 out;
        FMT in;
        in.storage = data;
        out = convertMXFP<binary32, FMT>(in, mode);
        return out.fp32;
    }

    // Number of meaningful bits in this format (sign + exp + mantissa).
    constexpr static int
    size()
    {
        return int(FMT::mbits) + int(FMT::ebits) + int(FMT::sbits);
    }

    // Intentionally use storage > size() so that a storage type is not needed
    // as a template parameter.
    uint32_t data = 0;

    FMT
    getFmt() const
    {
        FMT out;
        out.storage = data;
        return out;
    }

    void
    setFmt(FMT in)
    {
        data = in.storage;
    }

    // Apply a block scale factor f: add f's unbiased exponent to this
    // value's biased exponent, clamping to the format's exponent range.
    void
    scale(const float& f)
    {
        binary32 bfp;
        bfp.fp32 = f;
        int scale_val = bfp.exp - bfp.bias;

        // Scale value of 0xFF is NaN. Scaling by NaN returns NaN.
        // In this implementation, types without NaN define it as zero.
        // The raw exponent field (bfp.exp) must be tested here: the
        // previous code compared scale_val -- the *unbiased* exponent,
        // which is at most 128 -- against 0xFF, so the NaN case could
        // never trigger and NaN scales were silently clamped below.
        if (bfp.exp == 0xFF) {
            data = FMT::nan;
            return;
        }

        FMT in = getFmt();
        int exp = in.exp;

        if (exp + scale_val > max_exp<FMT>()) {
            in.exp = max_exp<FMT>();
        } else if (exp + scale_val < min_exp<FMT>()) {
            in.exp = min_exp<FMT>();
        } else {
            in.exp = exp + scale_val;
        }

        data = in.storage;
    }

  private:
    // Rounding mode used when narrowing floats into this format.
    mxfpRoundingMode mode = roundTiesToEven;

    // Narrow a float, routing Inf/NaN to the format's encodings first.
    uint32_t
    float_to_mxfp(float f)
    {
        if (std::isinf(f)) {
            assert(std::numeric_limits<FMT>::has_infinity);
            return FMT::inf;
        }
        if (std::isnan(f)) {
            assert(std::numeric_limits<FMT>::has_quiet_NaN);
            return FMT::nan;
        }

        return float_to_mxfp_nocheck(f);
    }

    // Narrow a finite float via convertMXFP.
    uint32_t
    float_to_mxfp_nocheck(float f)
    {
        binary32 in;
        in.fp32 = f;

        FMT out;
        out.storage = 0;
        out = convertMXFP<FMT, binary32>(in, mode);

        return out.storage;
    }
};
// Unary operators
// Unary plus: returns the value unchanged.
template<typename T>
inline T operator+(T a)
{
    return a;
}

// Unary minus: flip the sign bit, which is bit 31 in the left-aligned
// MXFP storage layout.
template<typename T>
inline T operator-(T a)
{
    a.data ^= 0x80000000;
    return a;
}

// Pre-increment. The operand must be taken by reference: the previous
// pass-by-value version incremented only a local copy, so ++x left x
// unchanged.
template<typename T>
inline T& operator++(T& a)
{
    a = a + T(1.0f);
    return a;
}

// Pre-decrement (by reference, see operator++ above).
template<typename T>
inline T& operator--(T& a)
{
    a = a - T(1.0f);
    return a;
}

// Post-increment: increments the operand, returns the prior value.
template<typename T>
inline T operator++(T& a, int)
{
    T original = a;
    ++a;
    return original;
}

// Post-decrement: decrements the operand, returns the prior value.
template<typename T>
inline T operator--(T& a, int)
{
    T original = a;
    --a;
    return original;
}
// Math operators
// All arithmetic is performed by widening both operands to float,
// computing in float, and narrowing the result back through T's float
// constructor.
template<typename T>
inline T operator+(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs + rhs);
}

template<typename T>
inline T operator-(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs - rhs);
}

template<typename T>
inline T operator*(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs * rhs);
}

template<typename T>
inline T operator/(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs / rhs);
}

// Compound assignment: delegates to the binary operator above and
// returns a copy of the updated left-hand side.
template<typename T>
inline T operator+=(T &a, T b)
{
    return a = a + b;
}

template<typename T>
inline T operator-=(T &a, T b)
{
    return a = a - b;
}

template<typename T>
inline T operator*=(T &a, T b)
{
    return a = a * b;
}

template<typename T>
inline T operator/=(T &a, T b)
{
    return a = a / b;
}
// Comparison operators
// All comparisons are made on the float representations of the two
// operands, so they follow IEEE-754 float comparison semantics.
template<typename T>
inline bool operator<(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs < rhs;
}

template<typename T>
inline bool operator>(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs > rhs;
}

template<typename T>
inline bool operator<=(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs <= rhs;
}

template<typename T>
inline bool operator>=(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs >= rhs;
}

template<typename T>
inline bool operator==(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs == rhs;
}

template<typename T>
inline bool operator!=(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs != rhs;
}
} // namespace AMDGPU
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__

View File

@@ -0,0 +1,104 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <gtest/gtest.h>
#include "arch/amdgpu/common/dtype/mxfp_types.hh"
// Round-trip check for a single raw MXFP bit pattern: convert to float
// and back, then verify the bit pattern is unchanged. Inf/NaN values
// are exempt, as some MXFP types have multiple encodings for them.
//
// @param raw_mxfp  MXFP value with the raw pattern already installed.
// @param bits      total bit width of the type. Unused here, but kept
//                  (and marked) for signature compatibility with
//                  callers such as test_type.
template<typename T>
bool test_raw_mxfp(T raw_mxfp, [[maybe_unused]] int bits)
{
    float tmp = float(raw_mxfp);
    T from_float(tmp);

    // Pass if the bits round-trip exactly, or if the value is Inf/NaN
    // (exempt from the exact-match requirement).
    return raw_mxfp.data == from_float.data
        || std::isnan(tmp) || std::isinf(tmp);
}
// Exhaustively exercise every raw encoding of MXFP type T (2^bits
// patterns) through test_raw_mxfp and return the number of encodings
// that fail the round-trip check.
template<typename T>
int test_type(int bits)
{
    const int num_encodings = 1 << bits;
    int failures = 0;
    T value;

    for (int enc = 0; enc < num_encodings; ++enc) {
        // Raw data is aligned to the MSb in MXFP types; shift into place.
        value.data = enc << (32 - bits);
        if (!test_raw_mxfp(value, bits)) {
            ++failures;
        }
    }

    return failures;
}
// Exhaustive round-trip tests: every raw encoding of each MXFP type
// must survive a convert-to-float-and-back cycle (see test_type above).
// bfloat16 variant (mxbfloat16 = mxfp<fp16_e8m7_info>).
TEST(MxfpTest, MxBf16Test)
{
    using T = gem5::AMDGPU::mxbfloat16;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}
// float16 variant (mxfloat16 = mxfp<fp16_e5m10_info>).
TEST(MxfpTest, MxFp16Test)
{
    using T = gem5::AMDGPU::mxfloat16;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}
// bfloat8 variant (mxbfloat8 = mxfp<fp8_e5m2_info>).
TEST(MxfpTest, MxBf8Test)
{
    using T = gem5::AMDGPU::mxbfloat8;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}
// float8 variant (mxfloat8 = mxfp<fp8_e4m3_info>).
TEST(MxfpTest, MxFp8Test)
{
    using T = gem5::AMDGPU::mxfloat8;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}

View File

@@ -0,0 +1,309 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__
#include <cassert>
#include "arch/amdgpu/common/dtype/mxfp_type_info.hh"
#include "base/bitfield.hh"
namespace gem5
{
namespace AMDGPU
{
// The various rounding modes for microscaling formats. roundTiesToEven must
// be supported. Other rounding modes may be supported.
enum mxfpRoundingMode
{
    roundTiesToEven,  // round to nearest, ties to even (required mode)
    roundStochastic   // probabilistic rounding driven by a seed value
};
// Conversion functions - For instructions that convert from one microscaling
// format to another. We only need the conversion functions as there do not
// appear to be any instructions yet which operate directly on the MX formats.
//
// in - An MXFP info struct type
// mode - rounding mode
// seed - input value for stochastic rounding function
/**
 * Convert a value between two microscaling (MXFP) info formats.
 *
 * @tparam dFMT destination info union type
 * @tparam sFMT source info union type
 * @param in    source value
 * @param mode  rounding mode (roundTiesToEven is the required mode)
 * @param seed  input value for the stochastic rounding function
 * @return the converted value in the destination format
 */
template<typename dFMT, typename sFMT>
dFMT convertMXFP(sFMT in, mxfpRoundingMode mode = roundTiesToEven,
                 uint32_t seed = 0)
{
    // We assume that *both* exponent and mantissa bits are both >= or <=
    // the target type. Checkable at compile time.
    //
    // This is not necessarily a limitation, others just are not implemented.
    // Figuring this out would be interesting for converting FP8 <-> BF8 for
    // example. So far all GPU conversion instructions convert explicitly to
    // a larger type from a smaller type or smaller to larger.
    static_assert(((int(sFMT::mbits) >= int(dFMT::mbits)) &&
                   (int(sFMT::ebits) >= int(dFMT::ebits)))
               || ((int(sFMT::mbits) <= int(dFMT::mbits)) &&
                   (int(sFMT::ebits) <= int(dFMT::ebits))));

    dFMT out;
    out.storage = 0;

    if (int(sFMT::mbits) >= int(dFMT::mbits) &&
        int(sFMT::ebits) >= int(dFMT::ebits)) {
        // Input format is larger, truncate and round mantissa. MX formats
        // are subnormal if exp == 0. Zero out exp in that case.
        if (std::isnan(in)) {
            // For types with no NaN return max value.
            if (std::numeric_limits<dFMT>::has_quiet_NaN) {
                out = std::numeric_limits<dFMT>::quiet_NaN();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (std::isinf(in)) {
            // For types with no Inf return max value.
            if (std::numeric_limits<dFMT>::has_infinity) {
                out = std::numeric_limits<dFMT>::infinity();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (in.mant == 0 && in.exp == 0) {
            // All MX formats FP32, and FP64 encode 0 as all zeros. Keep sign.
            out.mant = 0;
            out.exp = 0;
            out.sign = in.sign;
        } else {
            // Extra bits are needed for the mantissa conversion.
            uint32_t mant = in.mant & mask(sFMT::mbits);
            int32_t exp = in.exp - sFMT::bias + dFMT::bias;
            out.sign = in.sign;

            // Input is not subnormal, add the implicit 1 bit.
            if (in.exp) {
                mant |= (1 << sFMT::mbits);
            }
            mant >>= (sFMT::mbits - dFMT::mbits);

            // Output became subnormal
            if (exp < 1) {
                int shift = 1 - exp;
                mant >>= shift;
                out.exp = 0;
            } else {
                out.exp = exp;
            }
            mant &= mask(dFMT::mbits);
            out.mant = mant;

            // roundTiesToEven is the only required rounding mode for MXFP
            // types. Here we take the original mantissa and check the final
            // bit which is shifted out when converting the mantissa. If that
            // value is one, then we should round up to the next representable
            // number. If the value is one and all other discarded mantissa
            // bits are zero, round towards the number which has an even (0)
            // bit value in the least significant mantissa bit.
            //
            // For denormals, the process is similar however we check the nth
            // bit of the converted mantissa, where n is the absolute value of
            // the converted exponent. If the value of |exp| is larger than
            // the max exponent, round to zero. If it is exactly equal, always
            // round up.
            //
            // If the number of destination and source format mantissa bits are
            // the same, the mantissa is unchanged.
            if (int(sFMT::mbits) > int(dFMT::mbits)
                    && mode == roundTiesToEven) {
                bool round_up = false;
                int check_shift = sFMT::mbits - dFMT::mbits - 1;
                uint32_t check_mant = in.mant & mask(sFMT::mbits);
                check_mant >>= check_shift;

                // out.exp == 0 means subnormal
                if (out.exp == 0) {
                    check_mant = in.mant >> (sFMT::mbits - dFMT::mbits);
                    uint32_t max_exp = mask(dFMT::ebits);
                    if (-exp > max_exp) {
                        // if exp < -(1 << dFMT::ebits), result should be 0
                        round_up = false;
                    } else if (-exp == max_exp) {
                        // if exp == -(1 << dFMT::ebits), round up
                        round_up = true;
                    } else {
                        // Use the |exp|'th bit to determine rounding
                        int check_bit = 1 << -exp;
                        round_up = (check_mant & check_bit);
                    }
                } else {
                    round_up = (check_mant & 0x1);
                }

                // For roundTiesToEven, if we are exactly between two
                // representable numbers, pick the one with an even least
                // significant mantissa bit. We are exactly between when
                // all discarded bits *below* the guard bit are 0 (i.e.,
                // !sticky). The mask must exclude the guard bit itself:
                // the previous mask(sFMT::mbits - dFMT::mbits) included
                // it, so whenever round_up was set sticky was non-zero
                // and the tie-to-even demotion below was unreachable,
                // yielding round-half-up behavior.
                // NOTE(review): on the subnormal path the effective
                // guard bit sits above this mask, so the tie test there
                // remains approximate -- confirm against the spec.
                int sticky = in.mant & mask(sFMT::mbits - dFMT::mbits - 1);
                if (round_up && !sticky) {
                    if (!(out.mant & 1)) {
                        round_up = false;
                    }
                }

                if (round_up) {
                    if (out.mant == mask(dFMT::mbits)) {
                        // mantissa at max value, increment exponent if not inf
                        if (out.exp != mask(dFMT::ebits)) {
                            out.exp++;
                        }
                        out.mant = 0;
                    } else {
                        out.mant++;
                    }
                }
            } else if (int(sFMT::mbits) > int(dFMT::mbits)
                    && mode == roundStochastic) {
                // Use the discarded mantissa divided by the max mantissa of
                // the source format to determine the probability of rounding
                // up. An alternate implementation of this would be to get a
                // random number and add that to the input mantissa. Then
                // follow the normal rounding path above.
                uint32_t discarded = in.mant & mask(sFMT::mbits - dFMT::mbits);
                uint32_t max_mant = mask(sFMT::mbits);
                float round_prob = float(discarded) / float(max_mant);

                // Use a stochastic rounding function with the seed value to
                // determine compare probability. This is implemented as a
                // "Galois LFSR."
                auto srFunc = [](uint32_t in) {
                    uint32_t bit = (in ^ (in >> 1) ^ (in >> 3) ^ (in >> 12));
                    return (in >> 1) | (bit << 15);
                };

                // Assume stochastic rounding returns up to max uint32_t.
                // This will return an FP value between 0.0f and 1.0f.
                float draw_prob = float(srFunc(seed))
                                / float(std::numeric_limits<uint32_t>::max());

                // Round up if the number we drew is less than the rounding
                // probability. E.g., if round_prob is 90% (0.9) we choose
                // values 0.0f - 0.90f to round up.
                if (round_prob >= draw_prob) {
                    if (out.mant == mask(dFMT::mbits)) {
                        // mantissa at max value, increment exponent if not inf
                        if (out.exp != mask(dFMT::ebits)) {
                            out.exp++;
                        }
                        out.mant = 0;
                    } else {
                        out.mant++;
                    }
                }
            }
        }
    } else if (int(sFMT::mbits) <= int(dFMT::mbits) &&
               int(sFMT::ebits) <= int(dFMT::ebits)) {
        // Input format is smaller. Extend mantissa / exponent and pad with 0.
        // Should be the same for all non-stochastic rounding modes.
        if (std::isnan(in)) {
            // For types with no NaN return max value.
            if (std::numeric_limits<dFMT>::has_quiet_NaN) {
                out = std::numeric_limits<dFMT>::quiet_NaN();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (std::isinf(in)) {
            // For types with no Inf return max value.
            if (std::numeric_limits<dFMT>::has_infinity) {
                out = std::numeric_limits<dFMT>::infinity();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (in.mant == 0 && in.exp == 0) {
            // All MX formats FP32, and FP64 encode 0 as all zeros. Keep sign.
            out.mant = 0;
            out.exp = 0;
            out.sign = in.sign;
        } else {
            out.mant = in.mant << (dFMT::mbits - sFMT::mbits);
            out.exp = in.exp + dFMT::bias - sFMT::bias;
            out.sign = in.sign;

            // Normalize input denormals
            if (!in.exp && int(sFMT::ebits) != int(dFMT::ebits)) {
                uint32_t m = out.mant;
                if (m != 0) {
                    out.exp++;
                    while (!(m >> dFMT::mbits)) {
                        m <<= 1;
                        out.exp--;
                    }
                    out.mant = m & mask(dFMT::mbits);
                }
            } else if (!in.exp) {
                // Exponent is the same, but output is not denorm, so add
                // implicit 1. This is specific mainly to bf16 -> f32.
                uint32_t m = out.mant;
                m <<= 1;
                out.mant = m & mask(dFMT::mbits);
            }
        }
    } else {
        assert(false);
    }

    return out;
}
// Minimum biased exponent of a normal value in format FMT. Biased
// exponent 0 is reserved for zero/subnormal encodings, so the minimum
// normal exponent is always 1.
template<typename FMT>
int min_exp()
{
    return 1;
}
// Maximum value representable in FMT's biased exponent field, i.e.,
// all exponent bits set.
template<typename FMT>
int max_exp()
{
    const int ebits = int(FMT::ebits);
    return (1 << ebits) - 1;
}
} // namespace AMDGPU
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__

View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__
#include "arch/amdgpu/common/dtype/binary32.hh"
#include "arch/amdgpu/common/dtype/fp16_e5m10.hh"
#include "arch/amdgpu/common/dtype/fp16_e8m7.hh"
#include "arch/amdgpu/common/dtype/fp8_e4m3.hh"
#include "arch/amdgpu/common/dtype/fp8_e5m2.hh"
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__

View File

@@ -0,0 +1,53 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__
#include "arch/amdgpu/common/dtype/mxfp.hh"
namespace gem5
{
namespace AMDGPU
{
// Convenience aliases mapping each MXFP info union onto the generic
// mxfp wrapper class.
using mxbfloat8 = mxfp<fp8_e5m2_info>;    // BF8: 1 sign, 5 exp, 2 mant
using mxfloat8 = mxfp<fp8_e4m3_info>;     // FP8: 1 sign, 4 exp, 3 mant
using mxbfloat16 = mxfp<fp16_e8m7_info>;  // BF16: 1 sign, 8 exp, 7 mant
using mxfloat16 = mxfp<fp16_e5m10_info>;  // FP16: 1 sign, 5 exp, 10 mant
using mxfloat32 = mxfp<binary32>;         // IEEE-754 single precision
} // namespace AMDGPU
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,106 +0,0 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_GPU_ISA_HH__
#define __ARCH_GCN3_GPU_ISA_HH__
#include <array>
#include <type_traits>
#include "arch/amdgpu/common/tlb.hh"
#include "arch/amdgpu/gcn3/gpu_registers.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/misc.hh"
namespace gem5
{
class Wavefront;
namespace Gcn3ISA
{
/**
 * Per-wavefront GCN3 ISA state: inline integer constant registers,
 * misc register access, and PC advancement.
 */
class GPUISA
{
  public:
    GPUISA(Wavefront &wf);

    /**
     * Read an inline integer constant operand. Returns the positive or
     * negative inline-constant value selected by opIdx, or 0 if opIdx
     * falls in neither range.
     */
    template<typename T> T
    readConstVal(int opIdx) const
    {
        // The integral-only contract depends only on T, a compile-time
        // property, so enforce it with static_assert rather than the
        // original run-time panic_if on a constant condition.
        static_assert(std::is_integral_v<T>,
                      "Constant values must be an integer.");

        T val(0);

        if (isPosConstVal(opIdx)) {
            val = (T)readPosConstReg(opIdx);
        }

        if (isNegConstVal(opIdx)) {
            val = (T)readNegConstReg(opIdx);
        }

        return val;
    }

    ScalarRegU32 readMiscReg(int opIdx) const;
    void writeMiscReg(int opIdx, ScalarRegU32 operandVal);
    bool hasScalarUnit() const { return true; }
    void advancePC(GPUDynInstPtr gpuDynInst);

  private:
    // Look up a positive inline constant; opIdx is offset by the base
    // of the positive constant register range.
    ScalarRegU32 readPosConstReg(int opIdx) const
    {
        return posConstRegs[opIdx - REG_INT_CONST_POS_MIN];
    }

    // Look up a negative inline constant; opIdx is offset by the base
    // of the negative constant register range.
    ScalarRegI32 readNegConstReg(int opIdx) const
    {
        return negConstRegs[opIdx - REG_INT_CONST_NEG_MIN];
    }

    static const std::array<const ScalarRegU32, NumPosConstRegs>
        posConstRegs;
    static const std::array<const ScalarRegI32, NumNegConstRegs>
        negConstRegs;

    // parent wavefront
    Wavefront &wavefront;

    // shader status bits
    StatusReg statusReg;

    // memory descriptor reg
    ScalarRegU32 m0;
};
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_GPU_ISA_HH__

View File

@@ -1,190 +0,0 @@
/*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/insts/op_encodings.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
namespace gem5
{
/**
* Helper function for instructions declared in op_encodings. This function
* takes in all of the arguments for a given memory request we are trying to
* initialize, then submits the request or requests depending on if the
* original request is aligned or unaligned.
*/
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
                 bool is_atomic=false)
{
    // local variables
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = 0, split_addr = 0;
    bool misaligned_acc = false;
    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;

    // One request (or request pair) per active lane of the wavefront.
    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vaddr = gpuDynInst->addr[lane];

            /**
             * the base address of the cache line where the last
             * byte of the request will be stored.
             */
            split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is
             * greater than the address of the first byte then we have
             * a misaligned access.
             */
            misaligned_acc = split_addr > vaddr;

            if (is_atomic) {
                // make sure request is word aligned
                assert((vaddr & 0x3) == 0);

                // a given lane's atomic can't cross cache lines
                assert(!misaligned_acc);

                // Atomic: request carries an atomic op functor built
                // from this lane's slots of a_data/x_data.
                req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId,
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
            } else {
                req = std::make_shared<Request>(vaddr, req_size, 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
            }

            if (misaligned_acc) {
                // Two outstanding packets for this lane: one per cache
                // line touched, split at the line boundary.
                gpuDynInst->setStatusVector(lane, 2);
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                pkt1 = new Packet(req1, mem_req_type);
                pkt2 = new Packet(req2, mem_req_type);
                // The second packet's data starts where the first
                // request's bytes end within this lane's d_data slots.
                pkt1->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                pkt2->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N +
                                         req1->getSize()/sizeof(T)]);
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                        "request for %#x\n", gpuDynInst->cu_id,
                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                        split_addr);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
            } else {
                // Aligned access: a single packet for this lane.
                gpuDynInst->setStatusVector(lane, 1);
                gpuDynInst->setRequestFlags(req);
                pkt = new Packet(req, mem_req_type);
                pkt->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
            }
        } else { // if lane is not active, then no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
/**
 * Helper function for scalar instructions declared in op_encodings. This
 * function takes in all of the arguments for a given memory request we are
 * trying to initialize, then submits the request or requests depending on if
 * the original request is aligned or unaligned.
 */
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    const int total_size = N * sizeof(T);
    const int line_size = gpuDynInst->computeUnit()->cacheLineSize();
    const Addr vaddr = gpuDynInst->scalarAddr;

    /**
     * Base address of the cache line that holds the last byte of the
     * request. If it lies above the address of the first byte, the
     * access straddles a cache-line boundary and must be split.
     */
    const Addr split_addr = roundDown(vaddr + total_size - 1, line_size);
    assert(split_addr <= vaddr || split_addr - vaddr < line_size);

    RequestPtr req = std::make_shared<Request>(vaddr, total_size, 0,
        gpuDynInst->computeUnit()->requestorId(), 0,
        gpuDynInst->wfDynId);

    if (split_addr <= vaddr) {
        // Aligned access: one request/packet covers the whole transfer.
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(req);
        PacketPtr pkt = new Packet(req, mem_req_type);
        pkt->dataStatic(gpuDynInst->scalar_data);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
    } else {
        // Misaligned access: split at the line boundary and issue two
        // requests, the second one picking up where the first ends.
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        gpuDynInst->numScalarReqs = 2;
        gpuDynInst->setRequestFlags(req1);
        gpuDynInst->setRequestFlags(req2);
        PacketPtr pkt1 = new Packet(req1, mem_req_type);
        PacketPtr pkt2 = new Packet(req2, mem_req_type);
        pkt1->dataStatic(gpuDynInst->scalar_data);
        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, split_addr);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
    }
}
} // namespace gem5
#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__

View File

@@ -1,233 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_REGISTERS_HH__
#define __ARCH_GCN3_REGISTERS_HH__
#include <array>
#include <cstdint>
#include <string>
#include "arch/generic/vec_reg.hh"
#include "base/intmath.hh"
#include "base/logging.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * Operand selector values used by the GCN3 instruction encodings. A
 * selector either names an architectural register (SGPR/VGPR ranges,
 * VCC, EXEC, trap temporaries, ...) or an inline constant value.
 */
enum OpSelector : int
{
    // scalar general-purpose registers
    REG_SGPR_MIN = 0,
    REG_SGPR_MAX = 101,
    // special scalar registers
    REG_FLAT_SCRATCH_LO = 102,
    REG_FLAT_SCRATCH_HI = 103,
    REG_XNACK_MASK_LO = 104,
    REG_XNACK_MASK_HI = 105,
    REG_VCC_LO = 106,
    REG_VCC_HI = 107,
    REG_TBA_LO = 108,
    REG_TBA_HI = 109,
    REG_TMA_LO = 110,
    REG_TMA_HI = 111,
    // trap-handler temporary registers
    REG_TTMP_0 = 112,
    REG_TTMP_1 = 113,
    REG_TTMP_2 = 114,
    REG_TTMP_3 = 115,
    REG_TTMP_4 = 116,
    REG_TTMP_5 = 117,
    REG_TTMP_6 = 118,
    REG_TTMP_7 = 119,
    REG_TTMP_8 = 120,
    REG_TTMP_9 = 121,
    REG_TTMP_10 = 122,
    REG_TTMP_11 = 123,
    REG_M0 = 124,
    REG_RESERVED_1 = 125,
    REG_EXEC_LO = 126,
    REG_EXEC_HI = 127,
    // inline integer constants: 128 is zero, 129-192 map to 1..64,
    // 193-208 map to -1..-16 (see NumPosConstRegs/NumNegConstRegs)
    REG_ZERO = 128,
    REG_INT_CONST_POS_MIN = 129,
    REG_INT_CONST_POS_MAX = 192,
    REG_INT_CONST_NEG_MIN = 193,
    REG_INT_CONST_NEG_MAX = 208,
    REG_RESERVED_2 = 209,
    REG_RESERVED_3 = 210,
    REG_RESERVED_4 = 211,
    REG_RESERVED_5 = 212,
    REG_RESERVED_6 = 213,
    REG_RESERVED_7 = 214,
    REG_RESERVED_8 = 215,
    REG_RESERVED_9 = 216,
    REG_RESERVED_10 = 217,
    REG_RESERVED_11 = 218,
    REG_RESERVED_12 = 219,
    REG_RESERVED_13 = 220,
    REG_RESERVED_14 = 221,
    REG_RESERVED_15 = 222,
    REG_RESERVED_16 = 223,
    REG_RESERVED_17 = 224,
    REG_RESERVED_18 = 225,
    REG_RESERVED_19 = 226,
    REG_RESERVED_20 = 227,
    REG_RESERVED_21 = 228,
    REG_RESERVED_22 = 229,
    REG_RESERVED_23 = 230,
    REG_RESERVED_24 = 231,
    REG_RESERVED_25 = 232,
    REG_RESERVED_26 = 233,
    REG_RESERVED_27 = 234,
    REG_RESERVED_28 = 235,
    REG_RESERVED_29 = 236,
    REG_RESERVED_30 = 237,
    REG_RESERVED_31 = 238,
    REG_RESERVED_32 = 239,
    // inline floating-point constants
    REG_POS_HALF = 240,
    REG_NEG_HALF = 241,
    REG_POS_ONE = 242,
    REG_NEG_ONE = 243,
    REG_POS_TWO = 244,
    REG_NEG_TWO = 245,
    REG_POS_FOUR = 246,
    REG_NEG_FOUR = 247,
    REG_PI = 248,
    /* NOTE: SDWA and SWDA both refer to sub d-word addressing */
    REG_SRC_SWDA = 249,
    REG_SRC_DPP = 250,
    REG_VCCZ = 251,
    REG_EXECZ = 252,
    REG_SCC = 253,
    REG_LDS_DIRECT = 254,
    // a 32-bit literal follows the instruction in the stream
    REG_SRC_LITERAL = 255,
    // vector general-purpose registers
    REG_VGPR_MIN = 256,
    REG_VGPR_MAX = 511
};
// Upper bound on the size of a single operand, in DWORDs.
constexpr size_t MaxOperandDwords(16);

// Number of lanes (work-items) per vector register / wavefront.
const int NumVecElemPerVecReg(64);

// op selector values 129 - 192 correspond to const values 1 - 64
const int NumPosConstRegs = REG_INT_CONST_POS_MAX
                          - REG_INT_CONST_POS_MIN + 1;

// op selector values 193 - 208 correspond to const values -1 - 16
const int NumNegConstRegs = REG_INT_CONST_NEG_MAX
                          - REG_INT_CONST_NEG_MIN + 1;

// bit-width constants used by the SDWA byte/word selection helpers
const int BITS_PER_BYTE = 8;
const int BITS_PER_WORD = 16;
const int MSB_PER_BYTE = (BITS_PER_BYTE - 1);
const int MSB_PER_WORD = (BITS_PER_WORD - 1);

// typedefs for the various sizes/types of scalar regs
typedef uint8_t ScalarRegU8;
typedef int8_t ScalarRegI8;
typedef uint16_t ScalarRegU16;
typedef int16_t ScalarRegI16;
typedef uint32_t ScalarRegU32;
typedef int32_t ScalarRegI32;
typedef float ScalarRegF32;
typedef uint64_t ScalarRegU64;
typedef int64_t ScalarRegI64;
typedef double ScalarRegF64;

// typedefs for the various sizes/types of vector reg elements
typedef uint8_t VecElemU8;
typedef int8_t VecElemI8;
typedef uint16_t VecElemU16;
typedef int16_t VecElemI16;
typedef uint32_t VecElemU32;
typedef int32_t VecElemI32;
typedef float VecElemF32;
typedef uint64_t VecElemU64;
typedef int64_t VecElemI64;
typedef double VecElemF64;

// one DWORD is the 4-byte unit all register sizes are expressed in
const int DWordSize = sizeof(VecElemU32);

/**
 * Size of a single-precision register in DWords.
 */
const int RegSizeDWords = sizeof(VecElemU32) / DWordSize;

// backing storage for one full vector register (64 x 32-bit lanes)
using VecRegContainerU32 =
    VecRegContainer<sizeof(VecElemU32) * NumVecElemPerVecReg>;
/**
 * Bitfield layout of the wavefront STATUS register; the declared field
 * widths sum to 32 bits. Field meanings follow the hardware STATUS
 * register definition -- confirm individual semantics against the GCN3
 * ISA manual before relying on them.
 */
struct StatusReg
{
    // all fields start cleared
    StatusReg() : SCC(0), SPI_PRIO(0), USER_PRIO(0), PRIV(0), TRAP_EN(0),
        TTRACE_EN(0), EXPORT_RDY(0), EXECZ(0), VCCZ(0), IN_TG(0),
        IN_BARRIER(0), HALT(0), TRAP(0), TTRACE_CU_EN(0), VALID(0),
        ECC_ERR(0), SKIP_EXPORT(0), PERF_EN(0), COND_DBG_USER(0),
        COND_DBG_SYS(0), ALLOW_REPLAY(0), INSTRUCTION_ATC(0), RESERVED(0),
        MUST_EXPORT(0), RESERVED_1(0)
    {
    }

    uint32_t SCC : 1;
    uint32_t SPI_PRIO : 2;
    uint32_t USER_PRIO : 2;
    uint32_t PRIV : 1;
    uint32_t TRAP_EN : 1;
    uint32_t TTRACE_EN : 1;
    uint32_t EXPORT_RDY : 1;
    uint32_t EXECZ : 1;
    uint32_t VCCZ : 1;
    uint32_t IN_TG : 1;
    uint32_t IN_BARRIER : 1;
    uint32_t HALT : 1;
    uint32_t TRAP : 1;
    uint32_t TTRACE_CU_EN : 1;
    uint32_t VALID : 1;
    uint32_t ECC_ERR : 1;
    uint32_t SKIP_EXPORT : 1;
    uint32_t PERF_EN : 1;
    uint32_t COND_DBG_USER : 1;
    uint32_t COND_DBG_SYS : 1;
    uint32_t ALLOW_REPLAY : 1;
    uint32_t INSTRUCTION_ATC : 1;
    uint32_t RESERVED : 3;
    uint32_t MUST_EXPORT : 1;
    uint32_t RESERVED_1 : 4;
};
// Translate an operand selector value to its assembly symbol.
std::string opSelectorToRegSym(int opIdx, int numRegs=0);

// Translate an operand selector value to a physical register index.
int opSelectorToRegIdx(int opIdx, int numScalarRegs);

// Predicates classifying an operand selector value; see the OpSelector
// enum above for the ranges each of these tests against.
bool isPosConstVal(int opIdx);
bool isNegConstVal(int opIdx);
bool isConstVal(int opIdx);
bool isLiteral(int opIdx);
bool isScalarReg(int opIdx);
bool isVectorReg(int opIdx);
bool isFlatScratchReg(int opIdx);
bool isExecMask(int opIdx);
bool isVccReg(int opIdx);
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_REGISTERS_HH__

View File

@@ -1,94 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__
#define __ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__
#include "arch/amdgpu/gcn3/gpu_registers.hh"
#include "arch/amdgpu/gcn3/operand.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * Base class for all GCN3 static (decoded) instructions; provides the
 * GCN3-specific pieces of the generic GPUStaticInst interface.
 */
class GCN3GPUStaticInst : public GPUStaticInst
{
public:
    GCN3GPUStaticInst(const std::string &opcode);
    ~GCN3GPUStaticInst();

    // default disassembly is just the opcode mnemonic
    void generateDisassembly() override { disassembly = _opcode; }

    // true if opIdx selects one of the FLAT_SCRATCH registers
    bool
    isFlatScratchRegister(int opIdx) override
    {
        return isFlatScratchReg(opIdx);
    }

    // true if opIdx selects EXEC_LO/EXEC_HI
    bool
    isExecMaskRegister(int opIdx) override
    {
        return isExecMask(opIdx);
    }

    // no per-instruction operand info is populated for GCN3
    void initOperandInfo() override { return; }

    // NOTE(review): always reports 0 -- confirm no caller depends on a
    // real operand size here.
    int getOperandSize(int opIdx) override { return 0; }

    /**
     * Return the number of tokens needed by the coalescer. In GCN3 there
     * is generally one packet per memory request per lane generated. In
     * HSAIL, the number of dest operands is used for loads and src
     * operands for stores. This method should be overriden on a per-inst
     * basis when this value differs.
     */
    int coalescerTokenCount() const override { return 1; }

    ScalarRegU32 srcLiteral() const override { return _srcLiteral; }

protected:
    void panicUnimplemented() const;

    /**
     * if the instruction has a src literal - an immediate
     * value that is part of the instruction stream - we
     * store that here
     */
    ScalarRegU32 _srcLiteral;
}; // class GCN3GPUStaticInst
} // namespace Gcn3ISA
} // namespace gem5
#endif //__ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__

View File

@@ -1,896 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
#define __ARCH_GCN3_INSTS_INST_UTIL_HH__
#include <cmath>
#include "arch/amdgpu/gcn3/gpu_registers.hh"
namespace gem5
{
// values for SDWA select operations -- which byte/word of a 32-bit
// lane an SDWA instruction reads or writes
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, /* select data[7:0] */
    SDWA_BYTE_1 = 1, /* select data[15:8] */
    SDWA_BYTE_2 = 2, /* select data[23:16] */
    SDWA_BYTE_3 = 3, /* select data[31:24] */
    SDWA_WORD_0 = 4, /* select data[15:0] */
    SDWA_WORD_1 = 5, /* select data[31:16] */
    SDWA_DWORD = 6 /* select data[31:0] */
};

// values for format of destination bits for SDWA operations -- how the
// non-selected bits of the destination are filled
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0, /* Pad all unused bits with 0 */
    SDWA_UNUSED_SEXT = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */
    SDWA_UNUSED_PRESERVE = 2 /* select data[31:0] */
};

// values for DPP operations (DPP_CTRL field); see dppInstImpl below for
// the meaning of each range
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,
    SQ_DPP_RESERVED = 0x100,
    SQ_DPP_ROW_SL1 = 0x101,
    SQ_DPP_ROW_SL15 = 0x10F,
    SQ_DPP_ROW_SR1 = 0x111,
    SQ_DPP_ROW_SR15 = 0x11F,
    SQ_DPP_ROW_RR1 = 0x121,
    SQ_DPP_ROW_RR15 = 0x12F,
    SQ_DPP_WF_SL1 = 0x130,
    SQ_DPP_WF_RL1 = 0x134,
    SQ_DPP_WF_SR1 = 0x138,
    SQ_DPP_WF_RR1 = 0x13C,
    SQ_DPP_ROW_MIRROR = 0x140,
    SQ_DPP_ROW_HALF_MIRROR = 0x141,
    SQ_DPP_ROW_BCAST15 = 0x142,
    SQ_DPP_ROW_BCAST31 = 0x143
};

static const int ROW_SIZE = 16; /* 16 registers per row */
static const int NUM_BANKS = 4; /* 64 registers, 16/bank */
namespace Gcn3ISA
{
/**
 * Expand a mask so that any set bit within a 4-bit quad sets the entire
 * quad in the result (whole-quad-mode execution mask).
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T result = 0;

    // Walk one nibble at a time; copy the whole nibble into the result
    // whenever the input has at least one bit set inside it.
    for (T quad = T(0xF); quad != 0; quad <<= 4) {
        if (val & quad) {
            result |= quad;
        }
    }

    return result;
}
/**
 * Compress a lane mask quad-wise: bit i of the result is set iff the
 * i-th 4-bit group of val contains at least one set bit.
 */
template<typename T>
inline T
quadMask(T val)
{
    T result = 0;
    int quad = 0;

    for (T group = T(0xF); group != 0; group <<= 4, ++quad) {
        if (val & group) {
            result |= T(1) << quad;
        }
    }

    return result;
}
/**
 * Count the zero bits in val: the type's bit width minus its popcount.
 */
template<typename T>
inline ScalarRegI32
countZeroBits(T val)
{
    return std::numeric_limits<T>::digits - popCount(val);
}
/**
 * Position of the least-significant clear bit of val, or -1 when every
 * bit is set.
 */
template<typename T>
inline ScalarRegI32
findFirstZero(T val)
{
    // The lowest clear bit of val is the lowest set bit of its
    // complement; an all-zero complement means there is none.
    const T inverted = static_cast<T>(~val);
    return inverted ? findLsbSet(inverted) : -1;
}
/**
 * Position of the least-significant set bit of val, or -1 for zero.
 */
template<typename T>
inline ScalarRegI32
findFirstOne(T val)
{
    return val ? findLsbSet(val) : -1;
}
/**
 * Position of the most-significant set bit of val, or -1 for zero.
 */
template<typename T>
inline ScalarRegI32
findFirstOneMsb(T val)
{
    return val ? findMsbSet(val) : -1;
}
/**
 * Count the leading zero bits above the most-significant set bit; an
 * all-zero input has no set bit and is reported as -1.
 */
template<typename T>
inline ScalarRegI32
countZeroBitsMsb(T val)
{
    if (!val) {
        return -1;
    }

    return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
}
/**
 * Scanning from the MSB downward, return the position of the first bit
 * that differs from the sign bit; -1 when no such bit exists (i.e., the
 * value is all zeros or all ones).
 */
inline ScalarRegI32
firstOppositeSignBit(ScalarRegI32 val)
{
    // All-zeros and all-ones patterns contain no opposite-sign bit.
    if (!val || val == -1) {
        return -1;
    }

    const bool sign_bit = (val & 0x80000000) != 0;

    for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
        const bool bit_set = (val & (0x80000000 >> i)) != 0;
        if (bit_set != sign_bit) {
            return i;
        }
    }

    return -1;
}
/**
 * 64-bit overload: scanning from the MSB downward, return the position
 * of the first bit that differs from the sign bit; -1 when the value is
 * all zeros or all ones.
 */
inline ScalarRegI32
firstOppositeSignBit(ScalarRegI64 val)
{
    // All-zeros and all-ones patterns contain no opposite-sign bit.
    if (!val || val == -1) {
        return -1;
    }

    const bool sign_bit = (val & 0x8000000000000000ULL) != 0;

    for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
        const bool bit_set = (val & (0x8000000000000000ULL >> i)) != 0;
        if (bit_set != sign_bit) {
            return i;
        }
    }

    return -1;
}
/**
 * Median of three values.
 *
 * Floating-point types use the fmin/fmax family (NaN-aware per IEEE
 * semantics); integral types use std::min/std::max. `if constexpr`
 * discards the branch that does not apply to T at compile time, so the
 * floating-point path is never instantiated for integer operands (the
 * original runtime `if` instantiated both branches for every T).
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if constexpr (std::is_floating_point_v<T>) {
        return std::fmax(std::fmin(val_0, val_1),
                         std::fmin(std::fmax(val_0, val_1), val_2));
    } else {
        return std::max(std::min(val_0, val_1),
                        std::min(std::max(val_0, val_1), val_2));
    }
}
/**
 * Round val to the nearest integer, with ties (exact .5 fractions)
 * rounded to the even neighbor (banker's rounding).
 */
template <typename T>
inline T roundNearestEven(T val)
{
    T int_part = 0;
    T rounded = std::floor(val + 0.5);

    // Exactly halfway between two integers with an even lower neighbor:
    // floor(val + 0.5) picked the odd upper value, so step back down to
    // the even one.
    if (std::modf(std::abs(val), &int_part) == 0.5
        && (int)std::floor(val) % 2 == 0) {
        rounded = rounded - 1;
    }

    return rounded;
}
/**
 * dst = val_0 * val_1 + val_2 computed in 128 bits so the 32x32
 * multiply plus the 64-bit addend cannot overflow; returns 1 if the
 * result carried out of bit 63, else 0.
 */
inline VecElemU32
muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1,
       VecElemU64 val_2)
{
    __uint128_t product = (__uint128_t)val_0 * val_1 + val_2;
    dst = (VecElemU64)product;
    return (product >> 64) ? 1 : 0;
}
/**
 * Signed variant: dst = val_0 * val_1 + val_2 computed in 128 bits;
 * returns 1 if any bits above bit 63 are set in the wide result
 * (overflow out of the 64-bit destination), else 0.
 */
inline VecElemU32
muladd(VecElemI64 &dst, VecElemI32 val_0, VecElemI32 val_1,
       VecElemI64 val_2)
{
    __int128_t product = (__int128_t)val_0 * val_1 + val_2;
    dst = (VecElemI64)product;
    return (product >> 64) ? 1 : 0;
}
/**
 * dppInstImpl is a helper function that performs the inputted operation
 * on the inputted vector register lane. The returned output lane
 * represents the input lane given the destination lane and DPP_CTRL word.
 *
 * Currently the values are:
 * 0x0 - 0xFF: full permute of four threads
 * 0x100: reserved
 * 0x101 - 0x10F: row shift left by 1-15 threads
 * 0x111 - 0x11F: row shift right by 1-15 threads
 * 0x121 - 0x12F: row rotate right by 1-15 threads
 * 0x130: wavefront left shift by 1 thread
 * 0x134: wavefront left rotate by 1 thread
 * 0x138: wavefront right shift by 1 thread
 * 0x13C: wavefront right rotate by 1 thread
 * 0x140: mirror threads within row
 * 0x141: mirror threads within 1/2 row (8 threads)
 * 0x142: broadcast 15th thread of each row to next row
 * 0x143: broadcast thread 31 to rows 2 and 3
 */
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
                int rowOffset, bool & outOfBounds)
{
    // local variables
    // newLane will be the same as the input lane unless swizzling happens
    int newLane = currLane;
    // for shift/rotate permutations; positive values are LEFT rotates
    int count = 1;
    int localRowOffset = rowOffset;
    int localRowNum = rowNum;

    if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
        // each 2-bit field of dppCtrl selects the source pixel within
        // this lane's quad of four threads
        int quadBase = (currLane & ~(3));
        int quadPix = (currLane & 3);
        quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
        newLane = (quadBase | quadPix);
    } else if (dppCtrl == SQ_DPP_RESERVED) {
        panic("ERROR: instruction using reserved DPP_CTRL value\n");
    } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
               (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
        // NOTE(review): this produces the same (non-positive) count as
        // the SR case below, so row-shift-left and row-shift-right
        // currently behave identically; per the comment above, LEFT
        // shifts should use a positive count -- verify against the
        // DPP_CTRL spec.
        count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1);

        if ((localRowOffset + count >= 0) &&
            (localRowOffset + count < ROW_SIZE)) {
            localRowOffset += count;
            // NOTE(review): this OR assumes rowNum is the row's base
            // lane (a multiple of ROW_SIZE), but the caller passes
            // lane / ROW_SIZE -- confirm.
            newLane = (rowNum | localRowOffset);
        } else {
            outOfBounds = true;
        }
    } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
               (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
        count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1);

        if ((localRowOffset + count >= 0) &&
            (localRowOffset + count < ROW_SIZE)) {
            localRowOffset += count;
            newLane = (rowNum | localRowOffset);
        } else {
            outOfBounds = true;
        }
    } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
               (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
        // rotate wraps within the row instead of going out of bounds
        count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1);
        localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
        newLane = (rowNum | localRowOffset);
    } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
        count = 1;

        // NOTE(review): this bound check can never fail (currLane is
        // always within [0, NumVecElemPerVecReg)), so lane 63 reads
        // lane 64; the SR1 case below checks currLane + count instead
        // -- confirm intended.
        if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
            newLane += count;
        } else {
            outOfBounds = true;
        }
    } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
        count = 1;
        newLane = (currLane + count + NumVecElemPerVecReg) %
            NumVecElemPerVecReg;
    } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
        count = -1;
        int currVal = (currLane + count);

        if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
            newLane += count;
        } else {
            outOfBounds = true;
        }
    } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
        count = -1;
        newLane = (currLane + count + NumVecElemPerVecReg) %
            NumVecElemPerVecReg;
    } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
        localRowOffset = (15 - localRowOffset);
        newLane = (rowNum | localRowOffset);
    } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
        // NOTE(review): '& -0x7' looks like it should be '& ~0x7' (half
        // -row base), and '7 - localRowNum' like '7 - localRowOffset'
        // (mirror the offset within the half row) -- confirm against
        // the spec before relying on this case.
        localRowNum = (currLane & -0x7);
        localRowOffset = (currLane & 0x7);
        localRowOffset = (7 - localRowNum);
        newLane = (localRowNum | localRowOffset);
    } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
        // lanes above 15 read the last lane of the previous row
        count = 15;

        if (currLane > count) {
            newLane = (currLane & ~count) - 1;
        }
    } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
        // lanes above 31 read lane 31
        count = 31;

        if (currLane > count) {
            newLane = (currLane & ~count) - 1;
        }
    } else {
        panic("Unimplemented DPP control operation: %d\n", dppCtrl);
    }

    return newLane;
}
/**
 * processDPP is a helper function for implementing Data Parallel Primitive
 * instructions. This function may be called by many different VOP1
 * instructions to do operations within a register.
 *
 * @param gpuDynInst dynamic instruction state; only its exec mask is read.
 * @param dppInst decoded DPP word (DPP_CTRL, masks, modifiers).
 * @param src0 vector source operand, permuted/zeroed in place.
 */
template<typename T>
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
                T & src0)
{
    // local variables
    SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
    int boundCtrl = dppInst.BOUND_CTRL;
    int bankMask = dppInst.BANK_MASK;
    int rowMask = dppInst.ROW_MASK;
    // row, bank info to be calculated per lane
    int rowNum = 0, bankNum = 0, rowOffset = 0;
    // outLane will be the same as the input lane unless swizzling happens
    int outLane = 0;
    bool laneDisabled = false;
    // flags used for determining if a lane should be written to/reset/etc.
    // NOTE(review): outOfBounds/zeroSrc are sticky across lanes (only
    // laneDisabled is reset per iteration) -- confirm intended.
    bool outOfBounds = false, zeroSrc = false;
    long long threadValid = 0;

    /**
     * STEP 1a: check if the absolute value (ABS) or negation (NEG) tags
     * are set. If so, do the appropriate action(s) on src0.
     *
     * NOTE: ABS takes priority over NEG (it is applied last).
     */
    if (dppInst.SRC0_NEG) {
        src0.negModifier();
    }

    if (dppInst.SRC0_ABS) {
        src0.absModifier();
    }

    // iterate over all register lanes, performing steps 2-4
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        threadValid = (0x1LL << lane);
        /**
         * STEP 2: check the row and bank mask values. These determine
         * which threads are enabled for the subsequent DPP_CTRL
         * operations.
         *
         * A masked-off lane must still fall through to STEP 4 so bound
         * control is applied and laneDisabled is reset at the bottom of
         * the loop; skipping the rest of the iteration here would leak
         * the disabled state into the next lane.
         */
        rowNum = (lane / ROW_SIZE);
        rowOffset = (lane % ROW_SIZE);
        bankNum = (rowOffset / NUM_BANKS);

        if (((rowMask & (0x1 << rowNum)) == 0)   /* row mask */   ||
            ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
            laneDisabled = true;
        }

        /**
         * STEP 3: Handle the potential values of DPP_CTRL:
         * 0x0 - 0xFF: full permute of four threads
         * 0x100: reserved
         * 0x101 - 0x10F: row shift left by 1-15 threads
         * 0x111 - 0x11F: row shift right by 1-15 threads
         * 0x121 - 0x12F: row rotate right by 1-15 threads
         * 0x130: wavefront left shift by 1 thread
         * 0x134: wavefront left rotate by 1 thread
         * 0x138: wavefront right shift by 1 thread
         * 0x13C: wavefront right rotate by 1 thread
         * 0x140: mirror threads within row
         * 0x141: mirror threads within 1/2 row (8 threads)
         * 0x142: broadcast 15th thread of each row to next row
         * 0x143: broadcast thread 31 to rows 2 and 3
         */
        if (!laneDisabled) {
            outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
                                  outOfBounds);
        }

        /**
         * STEP 4: Implement bound control for disabled threads. If thread
         * is disabled but boundCtrl is set, then we need to set the source
         * data to 0 (i.e., set this lane to 0).
         */
        if (laneDisabled) {
            threadValid = 0;
        } else if (outOfBounds) {
            if (boundCtrl == 1) {
                zeroSrc = true;
            } else {
                threadValid = 0;
            }
        } else if (!gpuDynInst->exec_mask[lane]) {
            if (boundCtrl == 1) {
                zeroSrc = true;
            } else {
                threadValid = 0;
            }
        }

        if (threadValid != 0 && !outOfBounds && !zeroSrc) {
            assert(!laneDisabled);
            // dppInstImpl returned the lane this destination lane
            // should read from.
            src0[lane] = src0[outLane];
        } else if (zeroSrc) {
            src0[lane] = 0;
        }

        // reset for next iteration
        laneDisabled = false;
    }
}
/**
 * processDPP is a helper function for implementing Data Parallel Primitive
 * instructions. This function may be called by many different
 * VOP2/VOPC instructions to do operations within a register.
 */
template<typename T>
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
                T & src0, T & src1)
{
    /**
     * STEP 1b: check if the absolute value (ABS) or negation (NEG) tags
     * are set. If so, do the appropriate action(s) on src0 and/or src1.
     *
     * NOTE: ABS takes priority over NEG (it is applied last).
     */
    if (dppInst.SRC1_NEG) {
        src1.negModifier();
    }

    if (dppInst.SRC1_ABS) {
        src1.absModifier();
    }

    // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
    // which is only used for negation/absolute value, call other version
    // to do everything else.
    processDPP(gpuDynInst, dppInst, src0);
}
/**
 * sdwaInstSrcImpl_helper contains the per-lane code for selecting the
 * appropriate bytes/words of the lane and doing the appropriate
 * masking/padding/sign extending. It returns the value after these
 * operations are done on it.
 *
 * @param currOperVal this lane's current operand value.
 * @param origOperVal this lane's pre-modifier operand value, used only
 *        to cross-check that updates propagated.
 * @param sel which byte/word of the lane to select (see SDWASelVals).
 * @param signExt if true, sign-extend the selected byte/word into the
 *        upper bits; otherwise the upper bits are zeroed.
 */
template<typename T>
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
                         const SDWASelVals sel, const bool signExt)
{
    // local variables
    int low_bit = 0, high_bit = 0;
    bool signExt_local = signExt;
    T retVal = 0;

    // if we're preserving all of the bits, then we can immediately return
    if (sel == SDWA_DWORD) {
        return currOperVal;
    }

    if (sel < SDWA_WORD_0) { // we are selecting 1 byte
        /*
          Process byte 0 first. This code eiter selects the original bits
          of byte 0, or makes the bits of the selected byte be byte 0 (and
          next either sign extends or zero's out upper bits).
        */
        low_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
        high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
        retVal = bits(currOperVal, high_bit, low_bit);

        // make sure update propagated, since used next
        fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) !=
                 bits(origOperVal, high_bit),
                 "ERROR: SDWA byte update not propagated: retVal: %d, "
                 "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE),
                 bits(origOperVal, high_bit));
        // sign extended value depends on upper-most bit of the new byte 0
        signExt_local = (signExt &&
                         (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));

        // process all other bytes -- if sign extending, make them 1, else
        // all 0's so leave as is
        // NOTE(review): sext's template argument here is the MSB *index*
        // (7); confirm gem5's sext<N> expects this rather than the bit
        // count (8).
        if (signExt_local) {
            retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
        }
    } else if (sel < SDWA_DWORD) { // we are selecting 1 word
        /*
          Process word 0 first. This code eiter selects the original bits
          of word 0, or makes the bits of the selected word be word 0 (and
          next either sign extends or zero's out upper bits).
        */
        low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
        high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
        retVal = bits(currOperVal, high_bit, low_bit);

        // make sure update propagated, since used next
        fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) !=
                 bits(origOperVal, high_bit),
                 "ERROR: SDWA word update not propagated: retVal: %d, "
                 "orig: %d\n",
                 bits(retVal, Gcn3ISA::MSB_PER_WORD),
                 bits(origOperVal, high_bit));
        // sign extended value depends on upper-most bit of the new word 0
        signExt_local = (signExt &&
                         (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) &
                          0x8000));

        // process other word -- if sign extending, make them 1, else all
        // 0's so leave as is
        if (signExt_local) {
            retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
        }
    } else {
        assert(sel != SDWA_DWORD); // should have returned earlier
        panic("Unimplemented SDWA select operation: %d\n", sel);
    }

    return retVal;
}
/**
* sdwaInstSrcImpl is a helper function that selects the appropriate
* bits/bytes for each lane of the inputted source operand of an SDWA
* instruction, does the appropriate masking/padding/sign extending for the
* non-selected bits/bytes, and updates the operands values with the
* resultant value.
*
* The desired behavior is:
* 1. Select the appropriate bits/bytes based on sel:
* 0 (SDWA_BYTE_0): select data[7:0]
* 1 (SDWA_BYTE_1): select data[15:8]
* 2 (SDWA_BYTE_2): select data[23:16]
* 3 (SDWA_BYTE_3): select data[31:24]
* 4 (SDWA_WORD_0): select data[15:0]
* 5 (SDWA_WORD_1): select data[31:16]
* 6 (SDWA_DWORD): select data[31:0]
* 2. if sign extend is set, then sign extend the value
*/
/**
 * Apply the SDWA byte/word selection to every lane of the source
 * operand, overwriting the operand in place.
 */
template<typename T>
void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
                     const SDWASelVals sel, const bool signExt)
{
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        currOper[ln] =
            sdwaInstSrcImpl_helper(currOper[ln], origCurrOper[ln],
                                   sel, signExt);
    }
}
/**
 * sdwaInstDstImpl_helper contains the per-lane code for selecting the
 * appropriate bytes/words of the lane and doing the appropriate
 * masking/padding/sign extending. It returns the value after these
 * operations are done on it.
 *
 * @param currDstVal this lane's newly computed (post-op) value.
 * @param origDstVal this lane's destination register value before the op.
 * @param clamp unused here; kept for interface symmetry with callers.
 * @param sel which byte/word of the destination receives the result.
 * @param unusedBits_format how non-selected bits are filled
 *        (pad / sign-extend / preserve).
 */
template<typename T>
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
                         const bool clamp, const SDWASelVals sel,
                         const SDWADstVals unusedBits_format)
{
    // local variables
    int low_bit = 0, high_bit = 0;
    bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
    //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
    bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
    T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
        origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;

    // if we're preserving all of the bits, then we can immediately return
    if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
        assert(sel == SDWA_DWORD);
        return currDstVal;
    } else if (sel == SDWA_DWORD) {
        // NOTE: users may set the unused bits variable to anything in this
        // scenario, because it will be ignored
        return currDstVal;
    }

    if (sel < SDWA_WORD_0) { // we are selecting 1 byte
        // if we sign extended depends on upper-most bit of byte 0
        signExt = (signExt &&
                   (bits(currDstVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));

        for (int byte = 0; byte < 4; ++byte) {
            low_bit = byte * Gcn3ISA::BITS_PER_BYTE;
            high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
            /*
              Options:
                1.  byte == sel: this is the selected byte, so it takes
                the newly computed (post-op) bits
                2.  preserve is set: keep this byte's original (pre-op)
                register bits
                3.  byte > sel && signExt: we're sign extending and
                this byte is one of the bytes we need to sign extend
             */
            origBits_thisByte = bits(origDstVal, high_bit, low_bit);
            currBits_thisByte = bits(currDstVal, high_bit, low_bit);
            newBits = ((byte == sel) ? currBits_thisByte :
                       ((preserve) ? origBits_thisByte :
                        (((byte > sel) && signExt) ? 0xff : 0)));
            retVal = insertBits(retVal, high_bit, low_bit, newBits);
        }
    } else if (sel < SDWA_DWORD) { // we are selecting 1 word
        low_bit = 0;
        high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
        // if we sign extended depends on upper-most bit of word 0
        signExt = (signExt &&
                   (bits(currDstVal, high_bit, low_bit) & 0x8000));

        for (int word = 0; word < 2; ++word) {
            low_bit = word * Gcn3ISA::BITS_PER_WORD;
            high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
            /*
              Options:
                1.  word == sel & 1: this is the selected word, so it
                takes the newly computed (post-op) bits
                2.  preserve is set: keep this word's original (pre-op)
                register bits
                3.  word > (sel & 1) && signExt: we're sign extending and
                this word is one of the words we need to sign extend
             */
            origBits_thisWord = bits(origDstVal, high_bit, low_bit);
            currBits_thisWord = bits(currDstVal, high_bit, low_bit);
            newBits = ((word == (sel & 0x1)) ? currBits_thisWord :
                       ((preserve) ? origBits_thisWord :
                        (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
            retVal = insertBits(retVal, high_bit, low_bit, newBits);
        }
    } else {
        assert(sel != SDWA_DWORD); // should have returned earlier
        panic("Unimplemented SDWA select operation: %d\n", sel);
    }

    return retVal;
}
/**
* sdwaInstDestImpl is a helper function that selects the appropriate
* bits/bytes for the inputted dest operand of an SDWA instruction, does
* the appropriate masking/padding/sign extending for the non-selected
* bits/bytes, and updates the operands values with the resultant value.
*
* The desired behavior is:
* 1. Select the appropriate bits/bytes based on sel:
* 0 (SDWA_BYTE_0): select data[7:0]
* 1 (SDWA_BYTE_1): select data[15:8]
* 2 (SDWA_BYTE_2): select data[23:16]
* 3 (SDWA_BYTE_3): select data[31:24]
* 4 (SDWA_WORD_0): select data[15:0]
* 5 (SDWA_WORD_1): select data[31:16]
* 6 (SDWA_DWORD): select data[31:0]
* 2. either pad, sign extend, or select all bits based on the value of
* unusedBits_format:
* 0 (SDWA_UNUSED_PAD): pad all unused bits with 0
* 1 (SDWA_UNUSED_SEXT): sign-extend upper bits; pad lower bits w/ 0
* 2 (SDWA_UNUSED_PRESERVE): select data[31:0]
*/
/**
 * Apply SDWA destination-operand processing to every lane of a vector
 * destination. For each lane, sdwaInstDstImpl_helper selects the
 * bits/bytes named by sel and pads/sign-extends/preserves the unused
 * bits according to unusedBits_format; the lane is overwritten with the
 * resulting value.
 */
template<typename T>
void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
                     const SDWASelVals sel,
                     const SDWADstVals unusedBits_format)
{
    // rewrite each lane with its SDWA-processed value
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        dstOper[ln] = sdwaInstDstImpl_helper(dstOper[ln], origDstOper[ln],
                                             clamp, sel,
                                             unusedBits_format);
    }
}
/**
* processSDWA_src_helper is a helper function for implementing sub d-word
* addressing instructions for the src operands. This function may be
* called by many different VOP1/VOP2/VOPC instructions to do operations
* within a register. This function is also agnostic of which operand it
* is operating on, so that it can be called for any src operand.
*/
/**
 * Apply the SDWA source modifiers and sub-dword selection to a single
 * source operand. This is shared by the VOP1/VOP2/VOPC paths and is
 * agnostic of which source operand it is handed.
 */
template<typename T>
void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
                            const SDWASelVals src_sel,
                            const bool src_signExt, const bool src_abs,
                            const bool src_neg)
{
    /**
     * STEP 1: apply the NEG and/or ABS modifiers. NEG is applied first,
     * so when both are set ABS wins -- per the CSim implementation, ABS
     * takes priority over NEG.
     */
    if (src_neg) {
        currSrc.negModifier();
    }

    if (src_abs) {
        currSrc.absModifier();
    }

    /**
     * STEP 2: per-lane selection of the requested bits/bytes of the
     * source operand.
     */
    sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
}
/**
* processSDWA_src is a helper function for implementing sub d-word
* addressing instructions for the src operands. This function may be
* called by many different VOP1 instructions to do operations within a
* register. processSDWA_dst is called after the math, while
* processSDWA_src is called before the math.
*/
/**
 * VOP1 variant of SDWA source processing: only src0 exists. Decodes the
 * SDWA control fields for src0 and forwards to the shared helper.
 * Called before the instruction's math (processSDWA_dst runs after).
 */
template<typename T>
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
{
    // decode src0's SDWA control fields
    const SDWASelVals sel = (SDWASelVals)sdwaInst.SRC0_SEL;
    const bool sign_ext = sdwaInst.SRC0_SEXT;
    const bool neg_mod = sdwaInst.SRC0_NEG;
    const bool abs_mod = sdwaInst.SRC0_ABS;

    // VOP1 has no src1 operand, so its SDWA modifier fields must be
    // clear; only src0 is processed.
    assert(!sdwaInst.SRC1_SEXT);
    assert(!sdwaInst.SRC1_NEG);
    assert(!sdwaInst.SRC1_ABS);

    processSDWA_src_helper(src0, origSrc0, sel, sign_ext, abs_mod,
                           neg_mod);
}
/**
* processSDWA_src is a helper function for implementing sub d-word
* addressing instructions. This function may be called by many different
* VOP2/VOPC instructions to do operations within a register.
* processSDWA_dst is called after the math, while processSDWA_src is
* called before the math.
*/
/**
 * VOP2/VOPC variant of SDWA source processing: decodes the SDWA control
 * fields for both src0 and src1 and runs the shared helper on each.
 * Called before the instruction's math (processSDWA_dst runs after).
 */
template<typename T>
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
                     T & src1, T & origSrc1)
{
    // process each source operand independently, decoding its SDWA
    // control fields directly at the call site
    processSDWA_src_helper(src0, origSrc0,
                           (SDWASelVals)sdwaInst.SRC0_SEL,
                           sdwaInst.SRC0_SEXT, sdwaInst.SRC0_ABS,
                           sdwaInst.SRC0_NEG);

    processSDWA_src_helper(src1, origSrc1,
                           (SDWASelVals)sdwaInst.SRC1_SEL,
                           sdwaInst.SRC1_SEXT, sdwaInst.SRC1_ABS,
                           sdwaInst.SRC1_NEG);
}
/**
* processSDWA_dst is a helper function for implementing sub d-word
* addressing instructions for the dst operand. This function may be
* called by many different VOP1/VOP2/VOPC instructions to do operations
* within a register. processSDWA_dst is called after the math, while
* processSDWA_src is called before the math.
*/
/**
 * Apply SDWA destination processing after the instruction's math: pick
 * the destination bits named by DST_SEL and pad/sign-extend/preserve
 * the remaining bits per DST_UNUSED, honoring the CLAMP flag.
 */
template<typename T>
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
{
    // decode the destination's SDWA control fields
    const SDWADstVals unused_fmt = (SDWADstVals)sdwaInst.DST_UNUSED;
    const SDWASelVals sel = (SDWASelVals)sdwaInst.DST_SEL;
    const bool clamp = sdwaInst.CLAMP;

    /**
     * STEP 1: select the appropriate bits for dst and pad/sign-extend
     * as appropriate.
     */
    sdwaInstDstImpl(dst, origDst, clamp, sel, unused_fmt);
}
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,925 +0,0 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#include "arch/amdgpu/gcn3/gpu_decoder.hh"
#include "arch/amdgpu/gcn3/gpu_mem_helpers.hh"
#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/operand.hh"
#include "debug/GCN3.hh"
#include "debug/GPUExec.hh"
#include "mem/ruby/system/RubySystem.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * In-memory layout of a GCN3 buffer resource descriptor (V#). The
 * bit-field widths mirror the hardware descriptor so raw scalar
 * register data can be memcpy'd straight into this struct (see
 * Inst_SMEM::calcAddr and Inst_MUBUF::calcAddr below).
 * NOTE(review): field semantics follow the AMD GCN3 ISA manual --
 * confirm against that document before relying on them.
 */
struct BufferRsrcDescriptor
{
    uint64_t baseAddr : 48;     // base address of the buffer
    uint32_t stride : 14;       // bytes between records (0 => unstrided)
    uint32_t cacheSwizzle : 1;
    uint32_t swizzleEn : 1;     // enables swizzled addressing in calcAddr
    uint32_t numRecords : 32;   // buffer extent, used for range checking
    uint32_t dstSelX : 3;
    uint32_t dstSelY : 3;
    uint32_t dstSelZ : 3;
    uint32_t dstSelW : 3;
    uint32_t numFmt : 3;
    uint32_t dataFmt : 4;
    uint32_t elemSize : 2;      // element size is (2 << elemSize) bytes
    uint32_t idxStride : 2;     // index stride is (8 << idxStride)
    uint32_t addTidEn : 1;      // add lane id to the buffer index
    uint32_t atc : 1;
    uint32_t hashEn : 1;
    uint32_t heap : 1;
    uint32_t mType : 3;
    uint32_t type : 2;
};
// --- purely virtual instruction classes ---
/**
 * Base class for instructions decoded from the SOP2 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one. NOTE(review): unlike the sibling
 * encoding classes, no destructor is declared here -- confirm that is
 * intentional.
 */
class Inst_SOP2 : public GCN3GPUStaticInst
{
  public:
    Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOP2 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOP2 *);
}; // Inst_SOP2
/**
 * Base class for instructions decoded from the SOPK format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_SOPK : public GCN3GPUStaticInst
{
  public:
    Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
    ~Inst_SOPK();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPK instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOPK *);
}; // Inst_SOPK
/**
 * Base class for instructions decoded from the SOP1 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_SOP1 : public GCN3GPUStaticInst
{
  public:
    Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
    ~Inst_SOP1();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOP1 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOP1 *);
}; // Inst_SOP1
/**
 * Base class for instructions decoded from the SOPC format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_SOPC : public GCN3GPUStaticInst
{
  public:
    Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
    ~Inst_SOPC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPC instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOPC *);
}; // Inst_SOPC
/**
 * Base class for instructions decoded from the SOPP format. Unlike the
 * other scalar formats this one is a single DWORD with no optional
 * second DWORD.
 */
class Inst_SOPP : public GCN3GPUStaticInst
{
  public:
    Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
    ~Inst_SOPP();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPP instData;
}; // Inst_SOPP
/**
 * Base class for scalar memory (SMEM) instructions. Provides helpers
 * to initiate scalar reads/writes of N dwords and to compute the
 * dword-aligned scalar address for both the plain and the buffer
 * (s_buffer_*) address forms.
 */
class Inst_SMEM : public GCN3GPUStaticInst
{
  public:
    Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
    ~Inst_SMEM();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    /**
     * initiate a memory read access for N dwords
     */
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::ReadReq);
    }

    /**
     * initiate a memory write access for N dwords
     */
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::WriteReq);
    }

    /**
     * For normal s_load_dword/s_store_dword instruction addresses.
     */
    void
    calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
             ScalarRegU32 offset)
    {
        // mask off the low two bits: the scalar access is dword-aligned
        Addr vaddr = ((addr.rawData() + offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
    }

    /**
     * For s_buffer_load_dword/s_buffer_store_dword instruction addresses.
     * The s_buffer instructions use the same buffer resource descriptor
     * as the MUBUF instructions.
     */
    void
    calcAddr(GPUDynInstPtr gpu_dyn_inst,
             ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
    {
        BufferRsrcDescriptor rsrc_desc;
        ScalarRegU32 clamped_offset(offset);
        // reinterpret the raw 128-bit SRF data as a buffer descriptor
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        /**
         * The address is clamped if:
         *     Stride is zero: clamp if offset >= num_records
         *     Stride is non-zero: clamp if offset > (stride * num_records)
         */
        if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
            clamped_offset = rsrc_desc.numRecords;
        } else if (rsrc_desc.stride && offset
                   > (rsrc_desc.stride * rsrc_desc.numRecords)) {
            clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
        }

        // final scalar address, dword-aligned
        Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
    }

    // first instruction DWORD
    InFmt_SMEM instData;
    // second instruction DWORD
    InFmt_SMEM_1 extData;
}; // Inst_SMEM
/**
 * Base class for instructions decoded from the VOP2 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_VOP2 : public GCN3GPUStaticInst
{
  public:
    Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
    ~Inst_VOP2();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP2 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_VOP2 *);
}; // Inst_VOP2
/**
 * Base class for instructions decoded from the VOP1 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_VOP1 : public GCN3GPUStaticInst
{
  public:
    Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
    ~Inst_VOP1();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP1 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_VOP1 *);
}; // Inst_VOP1
/**
 * Base class for instructions decoded from the VOPC format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_VOPC : public GCN3GPUStaticInst
{
  public:
    Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
    ~Inst_VOPC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOPC instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_VOPC *);
}; // Inst_VOPC
/**
 * Base class for instructions decoded from the VINTRP format. A single
 * DWORD; note that no disassembly/operand-info overrides are declared
 * for this format.
 */
class Inst_VINTRP : public GCN3GPUStaticInst
{
  public:
    Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
    ~Inst_VINTRP();

    int instSize() const override;

  protected:
    // first instruction DWORD
    InFmt_VINTRP instData;
}; // Inst_VINTRP
/**
 * Base class for instructions decoded from the two-DWORD VOP3 format.
 * sgpr_dst (see comment below) distinguishes the few VOP3 instructions
 * whose VDST field names a scalar rather than vector destination.
 */
class Inst_VOP3 : public GCN3GPUStaticInst
{
  public:
    Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst);
    ~Inst_VOP3();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3 instData;
    // second instruction DWORD
    InFmt_VOP3_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3 *);
    /**
     * the v_cmp and readlane instructions in the VOP3
     * encoding are unique because they are the only
     * instructions that use the VDST field to specify
     * a scalar register destination. for VOP3::V_CMP insts
     * VDST specifies the arbitrary SGPR pair used to write
     * VCC. for V_READLANE VDST specifies the SGPR to return
     * the value of the selected lane in the source VGPR
     * from which we are reading.
     */
    const bool sgprDst;
}; // Inst_VOP3
/**
 * Base class for the VOP3 encoding variant that carries an explicit
 * scalar destination field (SDST). Two instruction DWORDs.
 */
class Inst_VOP3_SDST_ENC : public GCN3GPUStaticInst
{
  public:
    Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode);
    ~Inst_VOP3_SDST_ENC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3_SDST_ENC instData;
    // second instruction DWORD
    InFmt_VOP3_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3_SDST_ENC *);
}; // Inst_VOP3_SDST_ENC
/**
 * Base class for DS (data-share / LDS) instructions. The helpers below
 * move data between the wavefront's LDS chunk and the instruction's
 * d_data staging buffer, one lane at a time, honoring the exec mask.
 */
class Inst_DS : public GCN3GPUStaticInst
{
  public:
    Inst_DS(InFmt_DS*, const std::string &opcode);
    ~Inst_DS();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // read one value of type T per active lane from LDS at
    // addr[lane] + offset into d_data
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                    = wf->ldsChunk->read<T>(vaddr);
            }
        }
    }

    // read N consecutive dwords per active lane from LDS into d_data
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    (reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N + i]
                        = wf->ldsChunk->read<VecElemU32>(
                            vaddr + i*sizeof(VecElemU32));
                }
            }
        }
    }

    // two independent reads per lane (at offset0 and offset1); results
    // are interleaved in d_data as [lane*2] and [lane*2 + 1]
    template<typename T>
    void
    initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                    = wf->ldsChunk->read<T>(vaddr0);
                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                    = wf->ldsChunk->read<T>(vaddr1);
            }
        }
    }

    // write one value of type T per active lane from d_data to LDS at
    // addr[lane] + offset
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                wf->ldsChunk->write<T>(vaddr,
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
            }
        }
    }

    // write N consecutive dwords per active lane from d_data to LDS
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    wf->ldsChunk->write<VecElemU32>(
                        vaddr + i*sizeof(VecElemU32),
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]);
                }
            }
        }
    }

    // two independent writes per lane (at offset0 and offset1) from the
    // interleaved d_data layout used by initDualMemRead
    template<typename T>
    void
    initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * 2]);
                wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * 2 + 1]);
            }
        }
    }

    // latch each active lane's 32-bit address into the dyn inst.
    // NOTE(review): this uses wf->execMask(lane) where the helpers
    // above read gpuDynInst->exec_mask -- confirm the two are
    // equivalent at this point in execution.
    void
    calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                gpuDynInst->addr.at(lane) = (Addr)addr[lane];
            }
        }
    }

    // first instruction DWORD
    InFmt_DS instData;
    // second instruction DWORD
    InFmt_DS_1 extData;
}; // Inst_DS
/**
 * Base class for MUBUF (untyped buffer) instructions. Addresses are
 * formed through a buffer resource descriptor (see calcAddr); lanes
 * that fall outside the buffer are recorded in oobMask so that loads
 * return zero and stores are suppressed for those lanes.
 */
class Inst_MUBUF : public GCN3GPUStaticInst
{
  public:
    Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
    ~Inst_MUBUF();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // issue a read of one T per lane, skipping out-of-bounds lanes
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }


    // issue a read of N dwords per lane, skipping out-of-bounds lanes
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // issue a write of one T per lane, skipping out-of-bounds lanes
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // issue a write of N dwords per lane, skipping out-of-bounds lanes
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // build a zero-length request and hand it to the compute unit as a
    // global memory fence
    void
    injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
    {
        // create request and set flags
        gpuDynInst->resetEntireStatusVector();
        gpuDynInst->setStatusVector(0, 1);
        RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                                   gpuDynInst->computeUnit()->
                                                   requestorId(), 0,
                                                   gpuDynInst->wfDynId);
        gpuDynInst->setRequestFlags(req);
        gpuDynInst->computeUnit()->
            injectGlobalMemFence(gpuDynInst, false, req);
    }

    /**
     * MUBUF insructions calculate their addresses as follows:
     *
     *     index  = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
     *     offset = (OFFEN ? vgpr_off : 0) + inst_off
     *
     *     / ====================== LINEAR ADDRESSING ====================== /
     *     VADDR = base + sgpr_off + offset + stride * index
     *
     *     / ===================== SWIZZLED ADDRESSING ===================== /
     *     index_msb  = index / const_index_stride
     *     index_lsb  = index % const_index_stride
     *     offset_msb = offset / const_element_size
     *     offset_lsb = offset % const_element_size
     *     buffer_offset = ((index_msb * stride + offset_msb *
     *                     const_element_size) * const_index_stride +
     *                     index_lsb * const_element_size + offset_lsb)
     *
     *     VADDR = base + sgpr_off + buffer_offset
     */
    template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
    void
    calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
             SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
    {
        Addr vaddr = 0;
        Addr base_addr = 0;
        Addr stride = 0;
        Addr buf_idx = 0;
        Addr buf_off = 0;
        Addr buffer_offset = 0;
        BufferRsrcDescriptor rsrc_desc;

        // reinterpret the raw SRF data as a buffer descriptor
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        base_addr = rsrc_desc.baseAddr;

        stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                                       + rsrc_desc.stride) : rsrc_desc.stride;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                vaddr = base_addr + s_offset.rawData();
                /**
                 * first we calculate the buffer's index and offset.
                 * these will be used for either linear or swizzled
                 * buffers.
                 */
                buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                buf_off = v_off[lane] + inst_offset;

                if (rsrc_desc.swizzleEn) {
                    Addr idx_stride = 8 << rsrc_desc.idxStride;
                    Addr elem_size = 2 << rsrc_desc.elemSize;
                    Addr idx_msb = buf_idx / idx_stride;
                    Addr idx_lsb = buf_idx % idx_stride;
                    Addr off_msb = buf_off / elem_size;
                    Addr off_lsb = buf_off % elem_size;
                    DPRINTF(GCN3, "mubuf swizzled lane %d: "
                            "idx_stride = %llx, elem_size = %llx, "
                            "idx_msb = %llx, idx_lsb = %llx, "
                            "off_msb = %llx, off_lsb = %llx\n",
                            lane, idx_stride, elem_size, idx_msb, idx_lsb,
                            off_msb, off_lsb);

                    buffer_offset =(idx_msb * stride + off_msb * elem_size)
                        * idx_stride + idx_lsb * elem_size + off_lsb;
                } else {
                    buffer_offset = buf_off + stride * buf_idx;
                }


                /**
                 * Range check behavior causes out of range accesses to
                 * to be treated differently. Out of range accesses return
                 * 0 for loads and are ignored for stores. For
                 * non-formatted accesses, this is done on a per-lane
                 * basis.
                 */
                if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                    if (buffer_offset >=
                        rsrc_desc.numRecords - s_offset.rawData()) {
                        DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
                                "lane = %d, buffer_offset = %llx, "
                                "const_stride = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off + stride * buf_idx,
                                rsrc_desc.stride, rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }

                if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                    if (buf_idx >= rsrc_desc.numRecords ||
                        buf_off >= stride) {
                        DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
                                "lane = %d, offset = %llx, "
                                "index = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off, buf_idx,
                                rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }

                vaddr += buffer_offset;

                DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
                        "vaddr = %llx, base_addr = %llx, "
                        "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                        lane, vaddr, base_addr, stride,
                        buf_idx, buf_off);
                gpuDynInst->addr.at(lane) = vaddr;
            }
        }
    }

    // first instruction DWORD
    InFmt_MUBUF instData;
    // second instruction DWORD
    InFmt_MUBUF_1 extData;
    // Mask of lanes with out-of-bounds accesses. Needs to be tracked
    // seperately from the exec_mask so that we remember to write zero
    // to the registers associated with out of bounds lanes.
    VectorMask oobMask;
}; // Inst_MUBUF
/**
 * Base class for instructions decoded from the two-DWORD MTBUF (typed
 * buffer) format.
 */
class Inst_MTBUF : public GCN3GPUStaticInst
{
  public:
    Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
    ~Inst_MTBUF();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_MTBUF instData;
    // second instruction DWORD
    InFmt_MTBUF_1 extData;

  private:
    bool hasSecondDword(InFmt_MTBUF *);
}; // Inst_MTBUF
/**
 * Base class for instructions decoded from the two-DWORD MIMG (image
 * memory) format.
 */
class Inst_MIMG : public GCN3GPUStaticInst
{
  public:
    Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
    ~Inst_MIMG();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_MIMG instData;
    // second instruction DWORD
    InFmt_MIMG_1 extData;
}; // Inst_MIMG
/**
 * Base class for instructions decoded from the two-DWORD EXP (export)
 * format.
 */
class Inst_EXP : public GCN3GPUStaticInst
{
  public:
    Inst_EXP(InFmt_EXP*, const std::string &opcode);
    ~Inst_EXP();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_EXP instData;
    // second instruction DWORD
    InFmt_EXP_1 extData;
}; // Inst_EXP
/**
 * Base class for FLAT instructions. Accesses dispatch on how the
 * address resolves (executedAs()): SC_GLOBAL requests go through the
 * memory-request helpers, while SC_GROUP accesses read/write the
 * wavefront's LDS chunk directly, per lane.
 */
class Inst_FLAT : public GCN3GPUStaticInst
{
  public:
    Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
    ~Inst_FLAT();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // read one T per active lane from global memory or LDS
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }
    }

    // read N dwords per active lane from global memory or LDS
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    for (int i = 0; i < N; ++i) {
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]
                            = wf->ldsChunk->read<VecElemU32>(
                                vaddr + i*sizeof(VecElemU32));
                    }
                }
            }
        }
    }

    // write one T per active lane to global memory or LDS
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }
    }

    // write N dwords per active lane to global memory or LDS
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    for (int i = 0; i < N; ++i) {
                        wf->ldsChunk->write<VecElemU32>(
                            vaddr + i*sizeof(VecElemU32),
                            (reinterpret_cast<VecElemU32*>(
                                gpuDynInst->d_data))[lane * N + i]);
                    }
                }
            }
        }
    }

    // atomic access: global memory goes through a SwapReq; LDS applies
    // the atomic functor in place (read-modify-write) and returns the
    // old value in d_data
    template<typename T>
    void
    initAtomicAccess(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    auto amo_op =
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(
                                gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]);

                    T tmp = wf->ldsChunk->read<T>(vaddr);
                    (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
                    wf->ldsChunk->write<T>(vaddr, tmp);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
                }
            }
        }
    }

    // latch each active lane's 64-bit address, then resolve whether the
    // instruction targets global memory or LDS
    void
    calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr)
    {
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                gpuDynInst->addr.at(lane) = addr[lane];
            }
        }
        gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
    }

    // first instruction DWORD
    InFmt_FLAT instData;
    // second instruction DWORD
    InFmt_FLAT_1 extData;
}; // Inst_FLAT
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

View File

@@ -1,103 +0,0 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/gcn3/gpu_isa.hh"
#include <numeric>
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
namespace Gcn3ISA
{
// Tie the per-wavefront ISA state to its owning wavefront; M0 starts
// at zero.
GPUISA::GPUISA(Wavefront &wf) : wavefront(wf), m0(0)
{
}
/**
 * Read a miscellaneous (non-register-file) register selected by opIdx:
 * M0, the constant zero register, or the SCC status bit. Any other
 * selector is a fatal error.
 */
ScalarRegU32
GPUISA::readMiscReg(int opIdx) const
{
    if (opIdx == REG_M0) {
        return m0;
    } else if (opIdx == REG_ZERO) {
        return 0;
    } else if (opIdx == REG_SCC) {
        return statusReg.SCC;
    }

    fatal("attempting to read from unsupported or non-readable "
          "register. selector val: %i\n", opIdx);
    return 0;
}
/**
 * Write a miscellaneous (non-register-file) register selected by
 * opIdx: M0 or SCC (stored as 0/1). Any other selector is a fatal
 * error.
 */
void
GPUISA::writeMiscReg(int opIdx, ScalarRegU32 operandVal)
{
    if (opIdx == REG_M0) {
        m0 = operandVal;
    } else if (opIdx == REG_SCC) {
        // SCC is a single bit; any non-zero value sets it
        statusReg.SCC = operandVal ? 1 : 0;
    } else {
        fatal("attempting to write to an unsupported or non-writable "
              "register. selector val: %i\n", opIdx);
    }
}
/**
 * Advance the wavefront's PC past the given (variable-size)
 * instruction.
 */
void
GPUISA::advancePC(GPUDynInstPtr gpuDynInst)
{
    const int inst_bytes = gpuDynInst->staticInstruction()->instSize();
    wavefront.pc(wavefront.pc() + inst_bytes);
}
// Values of the positive inline-constant registers (1..64).
const std::array<const ScalarRegU32, NumPosConstRegs>
    GPUISA::posConstRegs = { {
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
    } };

// Values of the negative inline-constant registers (-1..-16).
const std::array<const ScalarRegI32, NumNegConstRegs>
    GPUISA::negConstRegs = { {
        -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
        -16
    } };
} // namespace Gcn3ISA
} // namespace gem5

View File

@@ -1,752 +0,0 @@
/*
* Copyright (c) 2017-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_OPERAND_HH__
#define __ARCH_GCN3_OPERAND_HH__
#include <array>
#include "arch/amdgpu/gcn3/gpu_registers.hh"
#include "arch/generic/vec_reg.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
/**
* classes that represnt vector/scalar operands in GCN3 ISA. these classes
* wrap the generic vector register type (i.e., src/arch/generic/vec_reg.hh)
* and allow them to be manipulated in ways that are unique to GCN3 insts.
*/
namespace Gcn3ISA
{
/**
* convenience traits so we can automatically infer the correct FP type
* without looking at the number of dwords (i.e., to determine if we
* need a float or a double when creating FP constants).
*/
/**
 * Trait that selects the floating-point type wide enough to hold an
 * operand's inline FP constants: the 64b scalar register types map to
 * double, every other operand type maps to float.
 */
template<typename T> struct OpTraits { using FloatT = float; };
template<> struct OpTraits<ScalarRegF64> { using FloatT = double; };
template<> struct OpTraits<ScalarRegU64> { using FloatT = double; };
/**
 * Abstract base for all GCN3 instruction operands. It records which
 * instruction owns the operand and the raw op selector value from the
 * instruction encoding; concrete subclasses (VecOperand/ScalarOperand)
 * decide how that selector maps onto register file storage.
 */
class Operand
{
  public:
    Operand() = delete;

    /**
     * Bind this operand to its owning instruction and op selector.
     * Both are sanity-checked here; selector-to-register translation
     * is left to the subclasses.
     */
    Operand(GPUDynInstPtr gpuDynInst, int opIdx)
        : _gpuDynInst(gpuDynInst), _opIdx(opIdx)
    {
        assert(_gpuDynInst);
        assert(_opIdx >= 0);
    }

    /**
     * read from and write to the underlying register(s) that
     * this operand is referring to.
     */
    virtual void read() = 0;
    virtual void write() = 0;

  protected:
    /**
     * instruction object that owns this operand
     */
    GPUDynInstPtr _gpuDynInst;

    /**
     * op selector value for this operand. note that this is not
     * the same as the register file index, be it scalar or vector.
     * this could refer to inline constants, system regs, or even
     * special values.
     */
    int _opIdx;
};
// forward declaration; VecOperand embeds a ScalarOperand to handle the
// case where a VSRC selector actually names a scalar register/constant.
template<typename DataType, bool Const, size_t NumDwords>
class ScalarOperand;

/**
 * A vector (per-lane) operand of a GCN3 instruction. DataType is the
 * per-lane element type, Const marks read-only (source) operands, and
 * NumDwords is the operand width in 32b registers (derived from
 * DataType by default). Data is staged in a local vector-register copy
 * (vecReg) between the register file and the instruction's execute()
 * method.
 */
template<typename DataType, bool Const,
         size_t NumDwords = sizeof(DataType) / sizeof(VecElemU32)>
class VecOperand final : public Operand
{
    static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
                  "Incorrect number of DWORDS for GCN3 operand.");

  public:
    VecOperand() = delete;

    VecOperand(GPUDynInstPtr gpuDynInst, int opIdx)
        : Operand(gpuDynInst, opIdx), scalar(false), absMod(false),
          negMod(false), scRegData(gpuDynInst, _opIdx),
          vrfData{{ nullptr }}
    {
        // start from a clean staging register
        vecReg.zero();
    }

    ~VecOperand()
    {
    }

    /**
     * certain vector operands can read from the vrf/srf or constants.
     * we use this method to first determine the type of the operand,
     * then we read from the appropriate source. if vector we read
     * directly from the vrf. if scalar, we read in the data through
     * the scalar operand component. this should only be used for VSRC
     * operands.
     */
    void
    readSrc()
    {
        if (isVectorReg(_opIdx)) {
            // translate the op selector into a physical VGPR index
            // before reading from the VRF
            _opIdx = opSelectorToRegIdx(_opIdx, _gpuDynInst->wavefront()
                ->reservedScalarRegs);
            read();
        } else {
            readScalar();
        }
    }

    /**
     * read from the vrf. this should only be used by vector inst
     * source operands that are explicitly vector (i.e., VSRC).
     */
    void
    read() override
    {
        assert(_gpuDynInst);
        assert(_gpuDynInst->wavefront());
        assert(_gpuDynInst->computeUnit());
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        // grab a pointer to each underlying physical VGPR dword
        for (auto i = 0; i < NumDwords; ++i) {
            int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
            vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);

            DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
        }

        if (NumDwords == 1) {
            // single-dword operand: copy each lane's element directly
            assert(vrfData[0]);
            auto vgpr = vecReg.template as<DataType>();
            auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                std::memcpy((void*)&vgpr[lane],
                    (void*)&reg_file_vgpr[lane], sizeof(DataType));
            }
        } else if (NumDwords == 2) {
            // 64b operand: stitch each lane's value together from the
            // low (vrfData[0]) and high (vrfData[1]) dword registers
            assert(vrfData[0]);
            assert(vrfData[1]);
            auto vgpr = vecReg.template as<VecElemU64>();
            auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
            auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                VecElemU64 tmp_val(0);
                ((VecElemU32*)&tmp_val)[0] = reg_file_vgpr0[lane];
                ((VecElemU32*)&tmp_val)[1] = reg_file_vgpr1[lane];
                vgpr[lane] = tmp_val;
            }
        }
    }

    /**
     * write to the vrf. we maintain a copy of the underlying vector
     * reg(s) for this operand (i.e., vrfData/scRegData), as well as a
     * temporary vector register representation (i.e., vecReg) of the
     * vector register, which allows the execute() methods of instructions
     * to easily write their operand data using operator[] regardless of
     * their size. after the result is calculated we use write() to write
     * the data to the actual register file storage. this allows us to do
     * type conversion, etc., in a single call as opposed to doing it
     * in each execute() method.
     */
    void
    write() override
    {
        assert(_gpuDynInst);
        assert(_gpuDynInst->wavefront());
        assert(_gpuDynInst->computeUnit());
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();
        // loads commit under the exec mask captured by the instruction
        // itself; all other ops use the wavefront's current exec mask
        VectorMask &exec_mask = _gpuDynInst->isLoad()
            ? _gpuDynInst->exec_mask : wf->execMask();

        if (NumDwords == 1) {
            int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
            vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
            assert(vrfData[0]);
            auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
            auto vgpr = vecReg.template as<DataType>();

            // only lanes enabled by the exec mask (or all lanes, if the
            // instruction ignores exec) update the register file
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
                    std::memcpy((void*)&reg_file_vgpr[lane],
                        (void*)&vgpr[lane], sizeof(DataType));
                }
            }

            DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
        } else if (NumDwords == 2) {
            // 64b operand: split each lane's value back into its low
            // and high dword registers
            int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
            int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
            vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
            vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
            assert(vrfData[0]);
            assert(vrfData[1]);
            auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
            auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();
            auto vgpr = vecReg.template as<VecElemU64>();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
                    reg_file_vgpr0[lane] = ((VecElemU32*)&vgpr[lane])[0];
                    reg_file_vgpr1[lane] = ((VecElemU32*)&vgpr[lane])[1];
                }
            }

            DPRINTF(GPUVRF, "Write v[%d:%d]\n", vgprIdx0, vgprIdx1);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx0);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx1);
        }
    }

    // request that reads through operator[] negate the value
    void
    negModifier()
    {
        negMod = true;
    }

    // request that reads through operator[] take the absolute value
    void
    absModifier()
    {
        absMod = true;
    }

    /**
     * getter [] operator. only enable if this operand is constant
     * (i.e, a source operand) and if it can be represented using
     * primitive types (i.e., 8b to 64b primitives).
     */
    template<bool Condition = (NumDwords == 1 || NumDwords == 2) && Const>
    typename std::enable_if_t<Condition, const DataType>
    operator[](size_t idx) const
    {
        assert(idx < NumVecElemPerVecReg);

        if (scalar) {
            // scalar source: every lane observes the same value,
            // optionally run through the abs/neg modifiers
            DataType ret_val = scRegData.rawData();

            if (absMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = std::fabs(ret_val);
            }

            if (negMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = -ret_val;
            }

            return ret_val;
        } else {
            auto vgpr = vecReg.template as<DataType>();
            DataType ret_val = vgpr[idx];

            if (absMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = std::fabs(ret_val);
            }

            if (negMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = -ret_val;
            }

            return ret_val;
        }
    }

    /**
     * setter [] operator. only enable if this operand is non-constant
     * (i.e, a destination operand) and if it can be represented using
     * primitive types (i.e., 8b to 64b primitives).
     */
    template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
    typename std::enable_if_t<Condition, DataType&>
    operator[](size_t idx)
    {
        // destinations are always vector registers, never scalars
        assert(!scalar);
        assert(idx < NumVecElemPerVecReg);

        return vecReg.template as<DataType>()[idx];
    }

  private:
    /**
     * if we determine that this operand is a scalar (reg or constant)
     * then we read the scalar data into the scalar operand data member.
     */
    void
    readScalar()
    {
        scalar = true;
        scRegData.read();
    }

    // staging register sized to hold DataType for every lane
    using VecRegCont =
        VecRegContainer<sizeof(DataType) * NumVecElemPerVecReg>;

    /**
     * whether this operand a scalar or not.
     */
    bool scalar;

    /**
     * absolute value and negative modifiers. VOP3 instructions
     * may indicate that their input/output operands must be
     * modified, either by taking the absolute value or negating
     * them. these bools indicate which modifier, if any, to use.
     */
    bool absMod;
    bool negMod;

    /**
     * this holds all the operand data in a single vector register
     * object (i.e., if an operand is 64b, this will hold the data
     * from both registers the operand is using).
     */
    VecRegCont vecReg;

    /**
     * for src operands that read scalars (i.e., scalar regs or
     * a scalar constant).
     */
    ScalarOperand<DataType, Const, NumDwords> scRegData;

    /**
     * pointers to the underlyding registers (i.e., the actual
     * registers in the register file).
     */
    std::array<VecRegContainerU32*, NumDwords> vrfData;
};
/**
 * A scalar operand of a GCN3 instruction. The op selector may name an
 * SGPR, a special register (EXEC, VCC, FLAT_SCRATCH, M0, ...), an
 * inline constant, or a literal; read()/write() dispatch accordingly.
 * Data is staged dword-by-dword in srfData between the register file
 * and the instruction's execute() method.
 */
template<typename DataType, bool Const,
         size_t NumDwords = sizeof(DataType) / sizeof(ScalarRegU32)>
class ScalarOperand final : public Operand
{
    static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
                  "Incorrect number of DWORDS for GCN3 operand.");

  public:
    ScalarOperand() = delete;

    ScalarOperand(GPUDynInstPtr gpuDynInst, int opIdx)
        : Operand(gpuDynInst, opIdx)
    {
        // start with zeroed staging storage
        std::memset(srfData.data(), 0, NumDwords * sizeof(ScalarRegU32));
    }

    ~ScalarOperand()
    {
    }

    /**
     * we store scalar data in a std::array, however if we need the
     * full operand data we use this method to copy all elements of
     * the scalar operand data to a single primitive container. only
     * useful for 8b to 64b primitive types, as they are the only types
     * that we need to perform computation on.
     */
    template<bool Condition = NumDwords == 1 || NumDwords == 2>
    typename std::enable_if_t<Condition, DataType>
    rawData() const
    {
        assert(sizeof(DataType) <= sizeof(srfData));
        DataType raw_data((DataType)0);
        std::memcpy((void*)&raw_data, (void*)srfData.data(),
            sizeof(DataType));

        return raw_data;
    }

    // raw pointer to the staged dwords; used when the caller needs to
    // move operand data in bulk (e.g., for wide operands)
    void*
    rawDataPtr()
    {
        return (void*)srfData.data();
    }

    /**
     * Read the operand into srfData: either dword-by-dword from the
     * SRF for real scalar registers, or via readSpecialVal() for
     * constants and special registers.
     */
    void
    read() override
    {
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        if (!isScalarReg(_opIdx)) {
            readSpecialVal();
        } else {
            for (auto i = 0; i < NumDwords; ++i) {
                int sgprIdx = regIdx(i);
                srfData[i] = cu->srf[wf->simdId]->read(sgprIdx);

                DPRINTF(GPUSRF, "Read s[%d]\n", sgprIdx);
                cu->srf[wf->simdId]->printReg(wf, sgprIdx);
            }
        }
    }

    /**
     * Write the staged operand back: EXEC writes update the wavefront's
     * exec mask directly, other non-SGPR selectors go through the misc
     * register interface, and real SGPRs are written dword-by-dword to
     * the SRF (loads pull their data from the instruction instead).
     */
    void
    write() override
    {
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        if (!isScalarReg(_opIdx)) {
            if (_opIdx == REG_EXEC_LO) {
                // start from the current mask so a 1-dword write only
                // replaces the low half
                ScalarRegU64 new_exec_mask_val
                    = wf->execMask().to_ullong();
                if (NumDwords == 1) {
                    std::memcpy((void*)&new_exec_mask_val,
                        (void*)srfData.data(), sizeof(VecElemU32));
                } else if (NumDwords == 2) {
                    std::memcpy((void*)&new_exec_mask_val,
                        (void*)srfData.data(), sizeof(VecElemU64));
                } else {
                    panic("Trying to write more than 2 DWORDS to EXEC\n");
                }
                VectorMask new_exec_mask(new_exec_mask_val);
                wf->execMask() = new_exec_mask;
                DPRINTF(GPUSRF, "Write EXEC\n");
                DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
            } else if (_opIdx == REG_EXEC_HI) {
                /**
                 * If we're writing only the upper half of the EXEC mask
                 * this ought to be a single dword operand.
                 */
                assert(NumDwords == 1);
                ScalarRegU32 new_exec_mask_hi_val(0);
                ScalarRegU64 new_exec_mask_val
                    = wf->execMask().to_ullong();
                std::memcpy((void*)&new_exec_mask_hi_val,
                    (void*)srfData.data(), sizeof(new_exec_mask_hi_val));
                // splice the new upper 32b into the existing mask
                replaceBits(new_exec_mask_val, 63, 32,
                            new_exec_mask_hi_val);
                VectorMask new_exec_mask(new_exec_mask_val);
                wf->execMask() = new_exec_mask;
                DPRINTF(GPUSRF, "Write EXEC\n");
                DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
            } else {
                _gpuDynInst->writeMiscReg(_opIdx, srfData[0]);
            }
        } else {
            for (auto i = 0; i < NumDwords; ++i) {
                int sgprIdx = regIdx(i);
                auto &sgpr = cu->srf[wf->simdId]->readWriteable(sgprIdx);
                if (_gpuDynInst->isLoad()) {
                    // loads deposit data fetched by the instruction
                    assert(sizeof(DataType) <= sizeof(ScalarRegU64));
                    sgpr = reinterpret_cast<ScalarRegU32*>(
                        _gpuDynInst->scalar_data)[i];
                } else {
                    sgpr = srfData[i];
                }
                DPRINTF(GPUSRF, "Write s[%d]\n", sgprIdx);
                cu->srf[wf->simdId]->printReg(wf, sgprIdx);
            }
        }
    }

    /**
     * bit access to scalar data. primarily used for setting vcc bits.
     */
    template<bool Condition = NumDwords == 1 || NumDwords == 2>
    typename std::enable_if_t<Condition, void>
    setBit(int bit, int bit_val)
    {
        DataType &sgpr = *((DataType*)srfData.data());
        replaceBits(sgpr, bit, bit_val);
    }

    // assignment into the staged operand data; only enabled for
    // writable (destination) operands of primitive size
    template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
    typename std::enable_if_t<Condition, ScalarOperand&>
    operator=(DataType rhs)
    {
        std::memcpy((void*)srfData.data(), (void*)&rhs, sizeof(DataType));
        return *this;
    }

  private:
    /**
     * we have determined that we are not reading our scalar operand data
     * from the register file, so here we figure out which special value
     * we are reading (i.e., float constant, int constant, inline
     * constant, or various other system registers (e.g., exec mask).
     */
    void
    readSpecialVal()
    {
        assert(NumDwords == 1 || NumDwords == 2);

        switch(_opIdx) {
          case REG_EXEC_LO:
            {
                if (NumDwords == 1) {
                    // 1-dword read observes only the low half of EXEC
                    ScalarRegU32 exec_mask = _gpuDynInst->wavefront()->
                        execMask().to_ulong();
                    std::memcpy((void*)srfData.data(), (void*)&exec_mask,
                        sizeof(exec_mask));
                    DPRINTF(GPUSRF, "Read EXEC\n");
                    DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
                } else {
                    assert(NumDwords == 2);
                    ScalarRegU64 exec_mask = _gpuDynInst->wavefront()->
                        execMask().to_ullong();
                    std::memcpy((void*)srfData.data(), (void*)&exec_mask,
                        sizeof(exec_mask));
                    DPRINTF(GPUSRF, "Read EXEC\n");
                    DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
                }
            }
            break;
          case REG_EXEC_HI:
            {
                /**
                 * If we're reading only the upper half of the EXEC mask
                 * this ought to be a single dword operand.
                 */
                assert(NumDwords == 1);
                ScalarRegU64 exec_mask = _gpuDynInst->wavefront()
                    ->execMask().to_ullong();

                ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32);
                std::memcpy((void*)srfData.data(), (void*)&exec_mask_hi,
                    sizeof(exec_mask_hi));
                DPRINTF(GPUSRF, "Read EXEC_HI\n");
                DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi);
            }
            break;
          case REG_SRC_SWDA:
          case REG_SRC_DPP:
          case REG_SRC_LITERAL:
            // literal/modifier selectors carry their value in the
            // instruction stream
            assert(NumDwords == 1);
            srfData[0] = _gpuDynInst->srcLiteral();
            break;
          case REG_POS_HALF:
            {
                // inline FP constants; OpTraits picks float vs. double
                typename OpTraits<DataType>::FloatT pos_half = 0.5;
                std::memcpy((void*)srfData.data(), (void*)&pos_half,
                    sizeof(pos_half));
            }
            break;
          case REG_NEG_HALF:
            {
                typename OpTraits<DataType>::FloatT neg_half = -0.5;
                std::memcpy((void*)srfData.data(), (void*)&neg_half,
                    sizeof(neg_half));
            }
            break;
          case REG_POS_ONE:
            {
                typename OpTraits<DataType>::FloatT pos_one = 1.0;
                std::memcpy(srfData.data(), &pos_one, sizeof(pos_one));
            }
            break;
          case REG_NEG_ONE:
            {
                typename OpTraits<DataType>::FloatT neg_one = -1.0;
                std::memcpy(srfData.data(), &neg_one, sizeof(neg_one));
            }
            break;
          case REG_POS_TWO:
            {
                typename OpTraits<DataType>::FloatT pos_two = 2.0;
                std::memcpy(srfData.data(), &pos_two, sizeof(pos_two));
            }
            break;
          case REG_NEG_TWO:
            {
                typename OpTraits<DataType>::FloatT neg_two = -2.0;
                std::memcpy(srfData.data(), &neg_two, sizeof(neg_two));
            }
            break;
          case REG_POS_FOUR:
            {
                typename OpTraits<DataType>::FloatT pos_four = 4.0;
                std::memcpy(srfData.data(), &pos_four, sizeof(pos_four));
            }
            break;
          case REG_NEG_FOUR:
            {
                typename OpTraits<DataType>::FloatT neg_four = -4.0;
                std::memcpy((void*)srfData.data(), (void*)&neg_four ,
                    sizeof(neg_four));
            }
            break;
          case REG_PI:
            {
                // 1/(2*pi) encodings: pick the 32b or 64b bit pattern
                // to match the operand's width
                assert(sizeof(DataType) == sizeof(ScalarRegF64)
                    || sizeof(DataType) == sizeof(ScalarRegF32));

                const ScalarRegU32 pi_u32(0x3e22f983UL);
                const ScalarRegU64 pi_u64(0x3fc45f306dc9c882ULL);

                if (sizeof(DataType) == sizeof(ScalarRegF64)) {
                    std::memcpy((void*)srfData.data(),
                        (void*)&pi_u64, sizeof(pi_u64));
                } else {
                    std::memcpy((void*)srfData.data(),
                        (void*)&pi_u32, sizeof(pi_u32));
                }
            }
            break;
          default:
            {
                // remaining selectors are inline integer constants or
                // misc registers read through the instruction
                assert(sizeof(DataType) <= sizeof(srfData));
                DataType misc_val(0);
                if (isConstVal(_opIdx)) {
                    misc_val = (DataType)_gpuDynInst
                        ->readConstVal<DataType>(_opIdx);
                } else {
                    misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx);
                }
                std::memcpy((void*)srfData.data(), (void*)&misc_val,
                    sizeof(DataType));
            }
        }
    }

    /**
     * for scalars we need to do some extra work to figure out how to
     * map the op selector to the sgpr idx because some op selectors
     * do not map directly to the srf (i.e., vcc/flat_scratch).
     */
    int
    regIdx(int dword) const
    {
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();
        int sgprIdx(-1);

        // VCC and FLAT_SCRATCH occupy the highest reserved SGPRs; see
        // opSelectorToRegIdx() for the layout
        if (_opIdx == REG_VCC_HI) {
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 1 + dword);
        } else if (_opIdx == REG_VCC_LO) {
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
        } else if (_opIdx == REG_FLAT_SCRATCH_HI) {
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
        } else if (_opIdx == REG_FLAT_SCRATCH_LO) {
            assert(NumDwords == 1);
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
        } else {
            sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
        }

        assert(sgprIdx > -1);

        return sgprIdx;
    }

    /**
     * in GCN3 each register is represented as a 32b unsigned value,
     * however operands may require up to 16 registers, so we store
     * all the individual 32b components here. for sub-dword operand
     * we still consider them to be 1 dword because the minimum size
     * of a register is 1 dword. this class will take care to do the
     * proper packing/unpacking of sub-dword operands.
     */
    std::array<ScalarRegU32, NumDwords> srfData;
};
// typedefs for the various sizes/types of scalar operands.
// the trailing template argument, where present, overrides the
// default dword count (sub-dword types still occupy 1 dword; the
// U128/U256/U512 forms span 4/8/16 dwords of U32 registers).
using ScalarOperandU8 = ScalarOperand<ScalarRegU8, false, 1>;
using ScalarOperandI8 = ScalarOperand<ScalarRegI8, false, 1>;
using ScalarOperandU16 = ScalarOperand<ScalarRegU16, false, 1>;
using ScalarOperandI16 = ScalarOperand<ScalarRegI16, false, 1>;
using ScalarOperandU32 = ScalarOperand<ScalarRegU32, false>;
using ScalarOperandI32 = ScalarOperand<ScalarRegI32, false>;
using ScalarOperandF32 = ScalarOperand<ScalarRegF32, false>;
using ScalarOperandU64 = ScalarOperand<ScalarRegU64, false>;
using ScalarOperandI64 = ScalarOperand<ScalarRegI64, false>;
using ScalarOperandF64 = ScalarOperand<ScalarRegF64, false>;
using ScalarOperandU128 = ScalarOperand<ScalarRegU32, false, 4>;
using ScalarOperandU256 = ScalarOperand<ScalarRegU32, false, 8>;
using ScalarOperandU512 = ScalarOperand<ScalarRegU32, false, 16>;
// non-writeable versions of scalar operands (Const = true enables the
// read-only accessors only)
using ConstScalarOperandU8 = ScalarOperand<ScalarRegU8, true, 1>;
using ConstScalarOperandI8 = ScalarOperand<ScalarRegI8, true, 1>;
using ConstScalarOperandU16 = ScalarOperand<ScalarRegU16, true, 1>;
using ConstScalarOperandI16 = ScalarOperand<ScalarRegI16, true, 1>;
using ConstScalarOperandU32 = ScalarOperand<ScalarRegU32, true>;
using ConstScalarOperandI32 = ScalarOperand<ScalarRegI32, true>;
using ConstScalarOperandF32 = ScalarOperand<ScalarRegF32, true>;
using ConstScalarOperandU64 = ScalarOperand<ScalarRegU64, true>;
using ConstScalarOperandI64 = ScalarOperand<ScalarRegI64, true>;
using ConstScalarOperandF64 = ScalarOperand<ScalarRegF64, true>;
using ConstScalarOperandU128 = ScalarOperand<ScalarRegU32, true, 4>;
using ConstScalarOperandU256 = ScalarOperand<ScalarRegU32, true, 8>;
using ConstScalarOperandU512 = ScalarOperand<ScalarRegU32, true, 16>;
// typedefs for the various sizes/types of vector operands
using VecOperandU8 = VecOperand<VecElemU8, false, 1>;
using VecOperandI8 = VecOperand<VecElemI8, false, 1>;
using VecOperandU16 = VecOperand<VecElemU16, false, 1>;
using VecOperandI16 = VecOperand<VecElemI16, false, 1>;
using VecOperandU32 = VecOperand<VecElemU32, false>;
using VecOperandI32 = VecOperand<VecElemI32, false>;
using VecOperandF32 = VecOperand<VecElemF32, false>;
using VecOperandU64 = VecOperand<VecElemU64, false>;
using VecOperandF64 = VecOperand<VecElemF64, false>;
using VecOperandI64 = VecOperand<VecElemI64, false>;
using VecOperandU96 = VecOperand<VecElemU32, false, 3>;
using VecOperandU128 = VecOperand<VecElemU32, false, 4>;
using VecOperandU256 = VecOperand<VecElemU32, false, 8>;
using VecOperandU512 = VecOperand<VecElemU32, false, 16>;
// non-writeable versions of vector operands
using ConstVecOperandU8 = VecOperand<VecElemU8, true, 1>;
using ConstVecOperandI8 = VecOperand<VecElemI8, true, 1>;
using ConstVecOperandU16 = VecOperand<VecElemU16, true, 1>;
using ConstVecOperandI16 = VecOperand<VecElemI16, true, 1>;
using ConstVecOperandU32 = VecOperand<VecElemU32, true>;
using ConstVecOperandI32 = VecOperand<VecElemI32, true>;
using ConstVecOperandF32 = VecOperand<VecElemF32, true>;
using ConstVecOperandU64 = VecOperand<VecElemU64, true>;
using ConstVecOperandI64 = VecOperand<VecElemI64, true>;
using ConstVecOperandF64 = VecOperand<VecElemF64, true>;
using ConstVecOperandU96 = VecOperand<VecElemU32, true, 3>;
using ConstVecOperandU128 = VecOperand<VecElemU32, true, 4>;
using ConstVecOperandU256 = VecOperand<VecElemU32, true, 8>;
using ConstVecOperandU512 = VecOperand<VecElemU32, true, 16>;
}
} // namespace gem5
#endif // __ARCH_GCN3_OPERAND_HH__

View File

@@ -1,241 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/gcn3/gpu_registers.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * Translate an operand selector value into its assembly-language
 * register symbol (e.g., "s0", "v[2:3]", "vcc_lo", or an inline
 * constant). numRegs gives the operand width in registers and is used
 * to print multi-register ranges.
 */
std::string
opSelectorToRegSym(int idx, int numRegs)
{
    // scalar GPRs: s<n>, or s[<lo>:<hi>] for multi-register operands
    if (idx <= REG_SGPR_MAX) {
        if (numRegs > 1) {
            return "s[" + std::to_string(idx) + ":"
                + std::to_string(idx + numRegs - 1) + "]";
        }
        return "s" + std::to_string(idx);
    }

    // vector GPRs: v<n>, or v[<lo>:<hi>] for multi-register operands
    if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) {
        int vgpr = idx - REG_VGPR_MIN;
        if (numRegs > 1) {
            return "v[" + std::to_string(vgpr) + ":"
                + std::to_string(vgpr + numRegs - 1) + "]";
        }
        return "v" + std::to_string(vgpr);
    }

    // positive inline integer constants
    if (idx >= REG_INT_CONST_POS_MIN && idx <= REG_INT_CONST_POS_MAX) {
        return std::to_string(idx - REG_INT_CONST_POS_MIN + 1);
    }

    // negative inline integer constants
    if (idx >= REG_INT_CONST_NEG_MIN && idx <= REG_INT_CONST_NEG_MAX) {
        return std::to_string(-1 - (idx - REG_INT_CONST_NEG_MIN));
    }

    // named special registers and inline FP constants
    switch (idx) {
      case REG_FLAT_SCRATCH_LO:
        return "flat_scratch_lo";
      case REG_FLAT_SCRATCH_HI:
        return "flat_scratch_hi";
      case REG_VCC_LO:
        return "vcc_lo";
      case REG_VCC_HI:
        return "vcc_hi";
      case REG_M0:
        return "m0";
      case REG_EXEC_LO:
        return "exec";
      case REG_ZERO:
        return "0";
      case REG_POS_HALF:
        return "0.5";
      case REG_NEG_HALF:
        return "-0.5";
      case REG_POS_ONE:
        return "1";
      case REG_NEG_ONE:
        return "-1";
      case REG_POS_TWO:
        return "2";
      case REG_NEG_TWO:
        return "-2";
      case REG_POS_FOUR:
        return "4";
      case REG_NEG_FOUR:
        return "-4";
      default:
        fatal("GCN3 ISA instruction has unknown register index %u\n", idx);
    }

    // not reached; fatal() terminates the simulation
    return std::string();
}
/**
 * Map an operand selector value onto a physical register file index.
 * SGPR and VGPR selectors map directly (VGPRs are offset by the start
 * of the VGPR selector range); VCC and FLAT_SCRATCH alias the highest
 * reserved SGPRs. Returns -1 for selectors that do not name a
 * register-file entry.
 */
int
opSelectorToRegIdx(int idx, int numScalarRegs)
{
    // plain SGPR selector: identity mapping
    if (idx <= REG_SGPR_MAX) {
        return idx;
    }

    // VGPR selector: remove the VGPR aperture offset
    if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) {
        return idx - REG_VGPR_MIN;
    }

    switch (idx) {
      case REG_VCC_LO:
        /**
         * the VCC register occupies the two highest numbered
         * SRF entries. VCC is typically indexed by specifying
         * VCC_LO (simply called VCC) in the instruction encoding
         * and reading it as a 64b value so we only return the
         * index to the lower half of the VCC register.
         *
         * VCC_LO = s[NUM_SGPRS - 2]
         * VCC_HI = s[NUM_SGPRS - 1]
         */
        return numScalarRegs - 2;
      case REG_VCC_HI:
        return numScalarRegs - 1;
      case REG_FLAT_SCRATCH_LO:
        /**
         * the FLAT_SCRATCH register occupies the two SRF entries
         * just below VCC. FLAT_SCRATCH is typically indexed by
         * specifying FLAT_SCRATCH_LO (simply called FLAT_SCRATCH)
         * in the instruction encoding and reading it as a 64b value
         * so we only return the index to the lower half of the
         * FLAT_SCRATCH register.
         *
         * FLAT_SCRATCH_LO = s[NUM_SGPRS - 4]
         * FLAT_SCRATCH_HI = s[NUM_SGPRS - 3]
         */
        return numScalarRegs - 4;
      case REG_FLAT_SCRATCH_HI:
        return numScalarRegs - 3;
      default:
        // selector does not correspond to an SRF/VRF entry
        return -1;
    }
}
// true if the selector names a positive inline integer constant
bool
isPosConstVal(int opIdx)
{
    return opIdx >= REG_INT_CONST_POS_MIN
        && opIdx <= REG_INT_CONST_POS_MAX;
}
// true if the selector names a negative inline integer constant
bool
isNegConstVal(int opIdx)
{
    return opIdx >= REG_INT_CONST_NEG_MIN
        && opIdx <= REG_INT_CONST_NEG_MAX;
}
// true if the selector names any inline integer constant
bool
isConstVal(int opIdx)
{
    return isPosConstVal(opIdx) || isNegConstVal(opIdx);
}
// true if the selector indicates a literal constant taken from the
// instruction stream
bool
isLiteral(int opIdx)
{
    return opIdx == REG_SRC_LITERAL;
}
// true if the selector names either half of the EXEC mask
bool
isExecMask(int opIdx)
{
    return opIdx == REG_EXEC_LO || opIdx == REG_EXEC_HI;
}
// true if the selector names either half of the VCC register
bool
isVccReg(int opIdx)
{
    return opIdx == REG_VCC_LO || opIdx == REG_VCC_HI;
}
// true if the selector names either half of FLAT_SCRATCH
bool
isFlatScratchReg(int opIdx)
{
    return opIdx == REG_FLAT_SCRATCH_LO || opIdx == REG_FLAT_SCRATCH_HI;
}
bool
isScalarReg(int opIdx)
{
// FLAT_SCRATCH and VCC are stored in an SGPR pair
if (opIdx <= REG_SGPR_MAX || opIdx == REG_FLAT_SCRATCH_LO ||
opIdx == REG_FLAT_SCRATCH_HI || opIdx == REG_VCC_LO ||
opIdx == REG_VCC_HI) {
return true;
}
return false;
}
// true if the selector falls within the VGPR selector range
bool
isVectorReg(int opIdx)
{
    return opIdx >= REG_VGPR_MIN && opIdx <= REG_VGPR_MAX;
}
} // namespace Gcn3ISA
} // namespace gem5

View File

@@ -49,11 +49,31 @@ Source('tlb_coalescer.cc')
DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker')
if env['CONF']['TARGET_GPU_ISA'] == 'vega':
Source('decoder.cc')
Source('gpu_decoder.cc')
Source('insts/gpu_static_inst.cc')
Source('insts/instructions.cc')
Source('insts/op_encodings.cc')
Source('isa.cc')
Source('registers.cc')
Source('gpu_isa.cc')
Source('gpu_registers.cc')
Source('insts/sop2.cc')
Source('insts/sopk.cc')
Source('insts/sop1.cc')
Source('insts/sopc.cc')
Source('insts/sopp.cc')
Source('insts/smem.cc')
Source('insts/vop2.cc')
Source('insts/vop1.cc')
Source('insts/vopc.cc')
Source('insts/vinterp.cc')
Source('insts/vop3.cc')
Source('insts/vop3_cmp.cc')
Source('insts/ds.cc')
Source('insts/mubuf.cc')
Source('insts/mtbuf.cc')
Source('insts/mimg.cc')
Source('insts/exp.cc')
Source('insts/flat.cc')
Source('insts/vop3p.cc')
Source('insts/vop3p_mai.cc')
DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA')

View File

@@ -325,6 +325,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
@@ -470,6 +471,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);
@@ -508,6 +510,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_ADD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUB_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_PACK_B32_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst);
GPUStaticInst* decode_OP_DS__DS_ADD_U32(MachInst);
GPUStaticInst* decode_OP_DS__DS_SUB_U32(MachInst);
GPUStaticInst* decode_OP_DS__DS_RSUB_U32(MachInst);
@@ -698,6 +701,9 @@ namespace VegaISA
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_XOR(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_INC(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_DEC(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MIN_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MAX_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_X2(MachInst);
@@ -746,6 +752,11 @@ namespace VegaISA
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_XOR(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_INC(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_DEC(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_PK_ADD_F16(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_SWAP_X2(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_CMPSWAP_X2(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_X2(MachInst);
@@ -1279,6 +1290,7 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP1__V_FREXP_MANT_F32(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CLREXCP(MachInst);
GPUStaticInst* decode_OP_VOP1__V_SCREEN_PARTITION_4SE_B32(MachInst);
GPUStaticInst* decode_OP_VOP1__V_MOV_B64(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CVT_F16_U16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CVT_F16_I16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CVT_U16_F16(MachInst);
@@ -1303,6 +1315,7 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP1__V_CVT_NORM_U16_F16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_SAT_PK_U8_I16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_SWAP_B32(MachInst);
GPUStaticInst* decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst);
GPUStaticInst* decode_OP_VOP2__V_CNDMASK_B32(MachInst);
GPUStaticInst* decode_OP_VOP2__V_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP2__V_SUB_F32(MachInst);
@@ -1585,6 +1598,65 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X1_2B_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X1_4B_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X1_16B_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X2_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X8_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X16_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X4_2B_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X4_4B_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_4X4X4_16B_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X8_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X16_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X8_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X16_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_I32_16X16X64_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_I32_32X32X32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4_F64(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_4X4X4_4B_F64(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst);
GPUStaticInst* subDecode_OPU_VOP3(MachInst);
GPUStaticInst* subDecode_OP_DS(MachInst);
GPUStaticInst* subDecode_OP_FLAT(MachInst);
@@ -1642,7 +1714,7 @@ namespace VegaISA
struct InFmt_FLAT {
unsigned int OFFSET : 13;
unsigned int LDS : 1;
unsigned int SVE : 1;
unsigned int SEG : 2;
unsigned int GLC : 1;
unsigned int SLC : 1;
@@ -1908,7 +1980,27 @@ namespace VegaISA
unsigned int NEG : 3;
};
union InstFormat {
struct InFmt_VOP3P_MAI
{
unsigned int VDST : 8;
unsigned int CBSZ : 3;
unsigned int ABID : 4;
unsigned int ACC_CD : 1;
unsigned int OP : 7;
unsigned int ENCODING : 9;
};
struct InFmt_VOP3P_MAI_1
{
unsigned int SRC0 : 9;
unsigned int SRC1 : 9;
unsigned int SRC2 : 9;
unsigned int ACC : 2;
unsigned int BLGP : 3;
};
union InstFormat
{
InFmt_DS iFmt_DS;
InFmt_DS_1 iFmt_DS_1;
InFmt_EXP iFmt_EXP;
@@ -1941,6 +2033,8 @@ namespace VegaISA
InFmt_VOP_SDWAB iFmt_VOP_SDWAB;
InFmt_VOP3P iFmt_VOP3P;
InFmt_VOP3P_1 iFmt_VOP3P_1;
InFmt_VOP3P_MAI iFmt_VOP3P_MAI;
InFmt_VOP3P_MAI_1 iFmt_VOP3P_MAI_1;
uint32_t imm_u32;
float imm_f32;
}; // union InstFormat

View File

@@ -89,6 +89,18 @@ namespace VegaISA
case REG_ZERO:
reg_sym = "0";
break;
case REG_SHARED_BASE:
reg_sym = "src_shared_base";
break;
case REG_SHARED_LIMIT:
reg_sym = "src_shared_limit";
break;
case REG_PRIVATE_BASE:
reg_sym = "src_private_base";
break;
case REG_PRIVATE_LIMIT:
reg_sym = "src_private_limit";
break;
case REG_POS_HALF:
reg_sym = "0.5";
break;

View File

@@ -106,10 +106,10 @@ namespace VegaISA
REG_RESERVED_25 = 232,
REG_RESERVED_26 = 233,
REG_RESERVED_27 = 234,
REG_RESERVED_28 = 235,
REG_RESERVED_29 = 236,
REG_RESERVED_30 = 237,
REG_RESERVED_31 = 238,
REG_SHARED_BASE = 235,
REG_SHARED_LIMIT = 236,
REG_PRIVATE_BASE = 237,
REG_PRIVATE_LIMIT = 238,
REG_RESERVED_32 = 239,
REG_POS_HALF = 240,
REG_NEG_HALF = 241,
@@ -129,7 +129,7 @@ namespace VegaISA
REG_LDS_DIRECT = 254,
REG_SRC_LITERAL = 255,
REG_VGPR_MIN = 256,
REG_VGPR_MAX = 511
REG_VGPR_MAX = 767
};
constexpr size_t MaxOperandDwords(16);

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,31 +29,30 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/gpu_decoder.hh"
#include "arch/amdgpu/gcn3/insts/instructions.hh"
#include "debug/GPUExec.hh"
#include "gpu-compute/shader.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace Gcn3ISA
namespace VegaISA
{
GCN3GPUStaticInst::GCN3GPUStaticInst(const std::string &opcode)
: GPUStaticInst(opcode), _srcLiteral(0)
{
}
// --- Inst_EXP__EXP class methods ---
GCN3GPUStaticInst::~GCN3GPUStaticInst()
Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt)
: Inst_EXP(iFmt, "exp")
{
}
} // Inst_EXP__EXP
Inst_EXP__EXP::~Inst_EXP__EXP()
{
} // ~Inst_EXP__EXP
// --- description from .arch file ---
// Export through SX.
void
GCN3GPUStaticInst::panicUnimplemented() const
Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst)
{
fatal("Encountered unimplemented GCN3 instruction: %s\n", _opcode);
}
} // namespace Gcn3ISA
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

View File

@@ -35,6 +35,7 @@
#include <cmath>
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
namespace gem5
{
@@ -315,7 +316,8 @@ namespace VegaISA
* 0x142: broadcast 15th thread of each row to next row
* 0x143: broadcast thread 31 to rows 2 and 3
*/
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
inline int
dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
int rowOffset, bool & outOfBounds)
{
// local variables
@@ -699,7 +701,7 @@ namespace VegaISA
if (sel < SDWA_WORD_0) { // we are selecting 1 byte
// if we sign extended depends on upper-most bit of byte 0
signExt = (signExt &&
(bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80));
(bits(currDstVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
for (int byte = 0; byte < 4; ++byte) {
low_bit = byte * VegaISA::BITS_PER_BYTE;
@@ -712,7 +714,7 @@ namespace VegaISA
3. byte > sel && signExt: we're sign extending and
this byte is one of the bytes we need to sign extend
*/
origBits_thisByte = bits(origDstVal, high_bit, low_bit);
origBits_thisByte = bits(origDstVal, VegaISA::MSB_PER_BYTE, 0);
currBits_thisByte = bits(currDstVal, high_bit, low_bit);
newBits = ((byte == sel) ? origBits_thisByte :
((preserve) ? currBits_thisByte :
@@ -737,7 +739,7 @@ namespace VegaISA
3. word > (sel & 1) && signExt: we're sign extending and
this word is one of the words we need to sign extend
*/
origBits_thisWord = bits(origDstVal, high_bit, low_bit);
origBits_thisWord = bits(origDstVal, VegaISA::MSB_PER_WORD, 0);
currBits_thisWord = bits(currDstVal, high_bit, low_bit);
newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
((preserve) ? currBits_thisWord :

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More