misc: Merge v24.0 release staging branch to stable (#1274)

This merge officially marks the release of gem5 v24.0.
This commit is contained in:
Bobby R. Bruce
2024-06-27 23:22:40 -07:00
committed by GitHub
624 changed files with 78553 additions and 156412 deletions

View File

@@ -0,0 +1,37 @@
{
"name": "gem5 Development Container",
"image": "ghcr.io/gem5/devcontainer:latest",
"hostRequirements": {
"cpus": 8,
"memory": "16gb",
"storage": "32gb"
},
"customizations": {
"vscode": {
"extensions": [
"eamodio.gitlens",
"GitHub.copilot",
"GitHub.copilot-chat",
"GitHub.vscode-pull-request-github",
"ms-python.debugpy",
"ms-python.isort",
"ms-python.python",
"ms-python.vscode-pylance",
"ms-vscode.cpptools",
"ms-vscode.cpptools-extension-pack",
"ms-vscode.cpptools-themes",
"ms-vscode.makefile-tools",
"ms-vscode-remote.remote-containers",
"Tsinghua-Hexin-Joint-Institute.gem5-slicc",
"VisualStudioExptTeam.vscodeintellicode"
]
}
},
"features": {
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/devcontainers-contrib/features/actionlint:1": {},
"ghcr.io/devcontainers-contrib/features/vscode-cli:1": {}
},
"onCreateCommand": "./.devcontainer/on-create.sh"
}

38
.devcontainer/on-create.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/bin/bash
# Copyright (c) 2024 The Regents of the University of California
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This script is run when the Docker container specified in devcontainer.json
# is created (see the "onCreateCommand" entry in devcontainer.json).

# Abort immediately if any command fails.
set -e

# Refresh the git index. Note: `git update-index` without `--refresh` (and
# without explicit paths) is a no-op; the flag is required to re-stat the
# working tree and clear stale cached stat information after the checkout
# is bind-mounted/copied into the container.
git update-index --refresh

# Install the pre-commit checks.
./util/pre-commit-install.sh

1
.gitignore vendored
View File

@@ -34,3 +34,4 @@ configs/dram/lowp_sweep.cfg
.pyenv
.vscode
typings
.DS_Store

132
.mailmap
View File

@@ -1,8 +1,11 @@
Abdul Mutaal Ahmad <abdul.mutaal@gmail.com>
adarshpatil <adarshpatil123@gmail.com>
Aditya K Kamath <a_kamath@hotmail.com> aditya <a_kamath@hotmail.com>
Adrià Armejach <adria.armejach@bsc.es> Adrià Armejach <adria.armejach@gmail.com>
Adrià Armejach <adria.armejach@bsc.es> Adrià Armejach <66964292+aarmejach@users.noreply.github.com>
Adrian Herrera <adrian.herrera@arm.com>
Adrien Pesle <adrien.pesle@arm.com>
Adwaith R Krishna <adwaithrk19@gmail.com>
Akash Bagdia <akash.bagdia@ARM.com> Akash Bagdia <akash.bagdia@arm.com>
Alec Roelke <alec.roelke@gmail.com> Alec Roelke <ar4jc@virginia.edu>
Alexander Klimov <Alexander.Klimov@arm.com>
@@ -10,21 +13,19 @@ Alexandru Dutu <alexandru.dutu@amd.com> Alexandru <alexandru.dutu@amd.com>
Alex Richardson <alexrichardson@google.com>
Ali Jafri <ali.jafri@arm.com>
Ali Saidi <Ali.Saidi@arm.com> Ali Saidi <ali.saidi@arm.com>
Ali Saidi <Ali.Saidi@arm.com> Ali Saidi <Ali.Saidi@ARM.com>
Ali Saidi <Ali.Saidi@arm.com> Ali Saidi <saidi@eecs.umich.edu>
Alistair Delva <adelva@google.com>
Alvaro Moreno <alvaro.moreno@bsc.es>
Amin Farmahini <aminfar@gmail.com>
Anders Handler <s052838@student.dtu.dk>
Andrea Mondelli <andrea.mondelli@huawei.com> Andrea Mondelli <andrea.mondelli@ucf.edu>
Andrea Mondelli <andrea.mondelli@huawei.com> Andrea Mondelli <Andrea.Mondelli@ucf.edu>
Andrea Pellegrini <andrea.pellegrini@gmail.com>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <andreas.hansson>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <andreas.hansson@arm.com>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <Andreas.Hansson@ARM.com>
Andreas Hansson <andreas.hanson@arm.com> Andreas Hansson <andreas.hansson@armm.com>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <andreas.sandberg@arm.com>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <Andreas.Sandberg@ARM.com>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <andreas@sandberg.pp.se>
Andreas Sandberg <Andreas.Sandberg@arm.com> Andreas Sandberg <andreas@sandberg.uk>
Andrew Bardsley <Andrew.Bardsley@arm.com> Andrew Bardsley <Andreas.Bardsley@arm.com>
Andrew Lukefahr <lukefahr@umich.edu>
Andrew Schultz <alschult@umich.edu>
@@ -32,11 +33,14 @@ Andriani Mappoura <andriani.mappoura@arm.com>
Angie Lee <peiyinglee@google.com>
Anis Peysieux <anis.peysieux@inria.fr>
Ani Udipi <ani.udipi@arm.com>
anoop <mysanoop@gmail.com>
Anouk Van Laer <anouk.vanlaer@arm.com>
ARM gem5 Developers <none@none>
Arthur Perais <Arthur.Perais@univ-grenoble-alpes.fr> Arthur Perais <arthur.perais@inria.fr>
Arun Rodrigues <afrodri@gmail.com>
Ashkan Tousi <ashkan.tousimojarad@arm.com>
atrah22 <atul.rahman@outlook.com>
Atri Bhattacharyya <atri.bhattacharyya@epfl.ch>
Austin Harris <austinharris@utexas.edu> Austin Harris <mail@austin-harris.com>
Avishai Tvila <avishai.tvila@gmail.com>
Ayaz Akram <yazakram@ucdavis.edu>
@@ -48,6 +52,7 @@ Bjoern A. Zeeb <baz21@cam.ac.uk>
Blake Hechtman <bah13@duke.edu> Blake Hechtman <blake.hechtman@amd.com>
Blake Hechtman <bah13@duke.edu> Blake Hechtman ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <bah13@duke.edu>
Bobby R. Bruce <bbruce@ucdavis.edu> Bobby Bruce <bbruce@amarillo.cs.ucdavis.edu>
Bobby R. Bruce <bbruce@ucdavis.edu> Bobby Bruce <bbruce@ucdavis.edu>
Boris Shingarov <shingarov@gmail.com> Boris Shingarov <shingarov@labware.com>
Brad Beckmann <brad.beckmann@amd.com> Brad Beckmann <Brad.Beckmann@amd.com>
Brad Beckmann <brad.beckmann@amd.com> Brad Beckmann ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <Brad.Beckmann@amd.com>
@@ -60,15 +65,13 @@ Brian Grayson <b.grayson@samsung.com>
Cagdas Dirik <cdirik@micron.com> cdirik <cdirik@micron.com>
Carlos Falquez <c.falquez@fz-juelich.de>
Chander Sudanthi <chander.sudanthi@arm.com> Chander Sudanthi <Chander.Sudanthi@arm.com>
Chander Sudanthi <chander.sudanthi@arm.com> Chander Sudanthi <Chander.Sudanthi@ARM.com>
Charles Jamieson <cjamieson2@wisc.edu>
CHEN Meng <tundriolaxy@gmail.com>
Chen Meng <tundriolaxy@gmail.com>
Chen Zou <chenzou@uchicago.edu>
Chia-You Chen <hortune@google.com>
Chow, Marcus <marcus.chow@amd.com>
Marcus Chow <marcus.chow@amd.com>
Chris Adeniyi-Jones <Chris.Adeniyi-Jones@arm.com>
Chris Emmons <chris.emmons@arm.com> Chris Emmons <Chris.Emmons@arm.com>
Chris Emmons <chris.emmons@arm.com> Chris Emmons <Chris.Emmons@ARM.com>
Chris January <chris.january@arm.com>
Christian Menard <christian.menard@tu-dresden.de> Christian Menard <Christian.Menard@tu-dresden.de>
Christopher Torng <clt67@cornell.edu>
@@ -83,17 +86,19 @@ Daecheol You <daecheol.you@samsung.com>
Dam Sunwoo <dam.sunwoo@arm.com>
Dan Gibson <gibson@cs.wisc.edu>
Daniel Carvalho <odanrc@yahoo.com.br> Daniel <odanrc@yahoo.com.br>
Daniel Carvalho <odanrc@yahoo.com.br> Daniel Carvalho <odanrc@users.noreply.github.com>
Daniel Carvalho <odanrc@yahoo.com.br> Daniel R. Carvalho <odanrc@yahoo.com.br>
Daniel Gerzhoy <daniel.gerzhoy@gmail.com>
Daniel Johnson <daniel.johnson@arm.com>
Daniel Kouchekinia <DanKouch@users.noreply.github.com>
Daniel Sanchez <sanchezd@stanford.edu>
Davide Basilio Bartolini <davide.basilio.bartolini@huawei.com>
David Guillen-Fandos <david.guillen@arm.com> David Guillen <david.guillen@arm.com>
David Guillen-Fandos <david.guillen@arm.com> David Guillen Fandos <david.guillen@arm.com>
David Hashe <david.hashe@amd.com> David Hashe <david.j.hashe@gmail.com>
David Oehmke <doehmke@umich.edu>
David Schall <david.schall2@arm.com>
Derek Christ <dchrist@rhrk.uni-kl.de>
David Schall <david.schall@ed.ac.uk> David Schall <david.schall2@arm.com>
Derek Christ <dchrist@rhrk.uni-kl.de> Derek Christ <44267643+derchr@users.noreply.github.com>
Derek Hower <drh5@cs.wisc.edu>
Deyaun Guo <guodeyuan@tsinghua.org.cn> Deyuan Guo ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <guodeyuan@tsinghua.org.cn>
Deyaun Guo <guodeyuan@tsinghua.org.cn> Deyuan Guo <guodeyuan@tsinghua.org.cn>
@@ -107,11 +112,12 @@ Earl Ou <shunhsingou@google.com>
eavivi <eavivi@ucdavis.edu>
Éder F. Zulian <zulian@eit.uni-kl.de>
Edmund Grimley Evans <Edmund.Grimley-Evans@arm.com>
Eduardo José Gómez Hernández <eduardojose.gomez@um.es>
Eduardo José Gómez Hernández <eduardojose.gomez@um.es> Eduardo José Gómez Hernández <git@edujgh.net>
Eliot Moss <moss@cs.umass.edu>
Emilio Castillo <castilloe@unican.es> Emilio Castillo <ecastill@bsc.es>
Emilio Castillo <castilloe@unican.es> Emilio Castillo ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <castilloe@unican.es>
Emily Brickey <esbrickey@ucdavis.edu>
Emin Gadzhiev <e.gadzhiev.mhk@gmail.com>
Erfan Azarkhish <erfan.azarkhish@unibo.it>
Erhu <fengerhu.ipads@gmail.com>
Eric Van Hensbergen <eric.vanhensbergen@arm.com> Eric Van Hensbergen <Eric.VanHensbergen@ARM.com>
@@ -125,11 +131,12 @@ Gabe Black <gabe.black@gmail.com> Gabe Black <gabeblack@google.com>
Gabe Black <gabe.black@gmail.com> Gabe Black <gblack@eecs.umich.edu>
Gabe Loh <gabriel.loh@amd.com> gloh <none@none>
Gabor Dozsa <gabor.dozsa@arm.com>
Gabriel Busnot <gabriel.busnot@arteris.com>
Gabriel Busnot <gabriel.busnot@arteris.com> Gabriel Busnot <gabriel.busnot@cea.fr>
Gabriel Busnot <gabriel.busnot@arteris.com> Gabriel Busnot <gabibusnot@gmail.com>
gauravjain14 <gjain6@wisc.edu>
Gautham Pathak <gspathak@gitlab.uwaterloo.ca>
Gedare Bloom <gedare@rtems.org> Gedare Bloom <gedare@gwmail.gwu.edu>
Gene Wu <gene.wu@arm.com> Gene WU <gene.wu@arm.com>
Gene WU <gene.wu@arm.com> Gene Wu <Gene.Wu@arm.com>
Geoffrey Blake <geoffrey.blake@arm.com> Geoffrey Blake <blakeg@umich.edu>
Geoffrey Blake <geoffrey.blake@arm.com> Geoffrey Blake <Geoffrey.Blake@arm.com>
Georg Kotheimer <georg.kotheimer@mailbox.tu-dresden.de>
@@ -140,10 +147,14 @@ GWDx <gwdx@mail.ustc.edu.cn>
Hamid Reza Khaleghzadeh <khaleghzadeh@gmail.com> Hamid Reza Khaleghzadeh ext:(%2C%20Lluc%20Alvarez%20%3Clluc.alvarez%40bsc.es%3E%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) <khaleghzadeh@gmail.com>
handsomeliu <handsomeliu@google.com>
Hanhwi Jang <jang.hanhwi@gmail.com>
Hoa Nguyen <hoanguyen@ucdavis.edu>
Harshil Patel <hpppatel@ucdavis.edu> Harshil Patel <harshilp2107@gmail.com>
Harshil Patel <hpppatel@ucdavis.edu> Harshil Patel <91860903+Harshil2107@users.noreply.github.com>
Wenjian He <wheac@connect.ust.hk>
HJikram <humzajahangirikram@gmail.com>
Hoa Nguyen <hn@hnpl.org> Hoa Nguyen <hoanguyen@ucdavis.edu>
Hongil Yoon <ongal@cs.wisc.edu>
Hsuan Hsu <hsuan.hsu@mediatek.com>
huangjs <jiasen.hjs@alibaba-inc.com>
hungweihsu <hungweihsu@google.com> hungweihsuG <145444687+hungweihsuG@users.noreply.github.com>
Hussein Elnawawy <hussein.elnawawy@gmail.com>
Ian Jiang <ianjiang.ict@gmail.com>
IanJiangICT <ianjiang.ict@gmail.com>
@@ -152,9 +163,13 @@ Iru Cai <mytbk920423@gmail.com>
Isaac Richter <isaac.richter@rochester.edu>
Isaac Sánchez Barrera <isaac.sanchez@bsc.es>
Ivan Pizarro <ivan.pizarro@metempsy.com>
Jack Whitham <jack-m5ml2@cs.york.ac.uk> Jack Whitman <jack-m5ml2@cs.york.ac.uk>
Ivan Turasov <turasov.ivan@gmail.com>
Ivana Mitrovic <imitrovic@ucdavis.edu> Ivana Mitrovic <ivanamit91@gmail.com>
Ivana Mitrovic <imitrovic@ucdavis.edu> ivanaamit <ivanamit91@gmail.com>
Jack Whitham <jack-m5ml2@cs.york.ac.uk>
Jairo Balart <jairo.balart@metempsy.com>
Jakub Jermar <jakub@jermar.eu>
James Braun <jebraun3@wisc.edu>
James Clarkson <james.clarkson@arm.com>
Jan-Peter Larsson <jan-peter.larsson@arm.com>
Jan Vrany <jan.vrany@labware.com>
@@ -174,8 +189,8 @@ Jayneel Gandhi <jayneel@cs.wisc.edu>
Jennifer Treichler <jtreichl@umich.edu>
Jerin Joy <joy@rivosinc.com>
Jiajie Chen <c@jia.je>
Jiasen Huang <jiasen.hjs@alibaba-inc.com>
Jiasen <jiasen.hjs@alibaba-inc.com>
Jiasen Huang <jiasen.hjs@alibaba-inc.com> Jiasen <jiasen.hjs@alibaba-inc.com>
Jiasen Huang <jiasen.hjs@alibaba-inc.com> huangjs <jiasen.hjs@alibaba-inc.com>
Jiayi Huang <jyhuang91@gmail.com>
jiegec <noc@jiegec.ac.cn>
Jieming Yin <jieming.yin@amd.com> jiemingyin <bjm419@gmail.com>
@@ -188,14 +203,17 @@ Joel Hestness <jthestness@gmail.com> Joel Hestness <hestness@cs.wisc.edu>
Joël Porquet-Lupine <joel@porquet.org>
John Alsop <johnathan.alsop@amd.com>
John Kalamatianos <john.kalamatianos@amd.com> jkalamat <john.kalamatianos@amd.com>
Johnny <johnnyko@google.com>
Jordi Vaquero <jordi.vaquero@metempsy.com>
Jose Marinho <jose.marinho@arm.com>
Juan M. Cebrian <jm.cebriangonzalez@gmail.com>
Jui-min Lee <fcrh@google.com>
kai.ren <kai.ren@streamcomputing.com> Kai Ren <binarystar2006@outlook.com>
Kai Ren <kai.ren@streamcomputing.com> kai.ren <kai.ren@streamcomputing.com>
Kai Ren <kai.ren@streamcomputing.com> Kai Ren <binarystar2006@outlook.com>
KaiBatley <68886332+KaiBatley@users.noreply.github.com>
Kanishk Sugand <kanishk.sugand@arm.com>
Karthik Sangaiah <karthik.sangaiah@arm.com>
Kaustav Goswami <kggoswami@ucdavis.edu>
Kaustav Goswami <kggoswami@ucdavis.edu> Kaustav Goswami <39310478+kaustav-goswami@users.noreply.github.com>
Kelly Nguyen <klynguyen@ucdavis.edu>
Ke Meng <mengke97@hotmail.com>
Kevin Brodsky <kevin.brodsky@arm.com>
@@ -206,11 +224,16 @@ Koan-Sin Tan <koansin.tan@gmail.com>
Korey Sewell <ksewell@umich.edu>
Krishnendra Nathella <Krishnendra.Nathella@arm.com> Krishnendra Nathella <krinat01@arm.com>
ksco <numbksco@gmail.com>
kunpai <kunpai@ucdavis.edu>
Kunal Pai <kunpai@ucdavis.edu> Kunal Pai <62979320+kunpai@users.noreply.github.com>
Kunal Pai <kunpai@ucdavis.edu> kunpai <kunpai@ucdavis.edu>
Kunal Pai <kunpai@ucdavis.edu> paikunal <kunpai@ucdavis.edu>
Kunal Pai <kunpai@ucdavis.edu> KUNAL PAI <kunpai@ucdavis.edu>
Kyle Roarty <kyleroarty1716@gmail.com> Kyle Roarty <Kyle.Roarty@amd.com>
Laura Hinman <llhinman@ucdavis.edu>
Lena Olson <leolson@google.com> Lena Olson <lena@cs.wisc,edu>
Lena Olson <leolson@google.com> Lena Olson <lena@cs.wisc.edu>
Leo Redivo <lredivo@ucdavis.edu> leoredivo <94771718+leoredivo@users.noreply.github.com>
Lingkang <karlzhu12@gmail.com>
Lisa Hsu <Lisa.Hsu@amd.com> Lisa Hsu <hsul@eecs.umich.edu>
Lluc Alvarez <lluc.alvarez@bsc.es>
Lluís Vilanova <vilanova@ac.upc.edu> Lluis Vilanova <vilanova@ac.upc.edu>
@@ -221,9 +244,11 @@ Mahyar Samani <msamani@ucdavis.edu>
Majid Jalili <majid0jalili@gmail.com>
Malek Musleh <malek.musleh@gmail.com> Nilay Vaish ext:(%2C%20Malek%20Musleh%20%3Cmalek.musleh%40gmail.com%3E) <nilay@cs.wisc.edu>
Marc Mari Barcelo <marc.maribarcelo@arm.com>
Marco Balboni <Marco.Balboni@ARM.com>
Marco Elver <Marco.Elver@ARM.com> Marco Elver <marco.elver@ed.ac.uk>
Marc Orr <marc.orr@gmail.com> Marc Orr <morr@cs.wisc.edu>
Marco Balboni <Marco.Balboni@ARM.com>
Marco Chen <mc@soc.pub>
Marco Elver <Marco.Elver@ARM.com> Marco Elver <marco.elver@ed.ac.uk>
Marco Kurzynski <marcokurzynski@icloud.com>
Marjan Fariborz <mfariborz@ucdavis.edu> marjanfariborz <mfariborz@ucdavis.edu>
Mark Hildebrand <mhildebrand@ucdavis.edu>
Marton Erdos <marton.erdos@arm.com>
@@ -233,20 +258,18 @@ Matteo Andreozzi <matteo.andreozzi@arm.com> Matteo Andreozzi <Matteo.Andreozzi@a
Matteo M. Fusi <matteo.fusi@bsc.es>
Matt Evans <matt.evans@arm.com> Matt Evans <Matt.Evans@arm.com>
Matthew Poremba <matthew.poremba@amd.com> Matthew Poremba <Matthew.Poremba@amd.com>
Matthias Boettcher <matthias.boettcher@arm.com>
Matthias Hille <matthiashille8@gmail.com>
Matthias Jung <jungma@eit.uni-kl.de>
Matthias Jung <matthias.jung@iese.fraunhofer.de>
Matt Horsnell <matt.horsnell@arm.com> Matt Horsnell <matt.horsnell@ARM.com>
Matthias Jung <matthias.jung@iese.fraunhofer.de> Matthias Jung <jungma@eit.uni-kl.de>
Matt Horsnell <matt.horsnell@arm.com> Matt Horsnell <Matt.Horsnell@arm.com>
Matt Horsnell <matt.horsnell@arm.com>Matt Horsnell <Matt.Horsnell@ARM.com>
Matt Poremba <matthew.poremba@amd.com> Matt Poremba <Matthew.Poremba@amd.com>
Matt Sinclair <mattdsinclair@gmail.com> Matthew Sinclair <matthew.sinclair@amd.com>
Matt Sinclair <mattdsinclair.wisc@gmail.com> Matt Sinclair <Matthew.Sinclair@amd.com>
Matt Sinclair <mattdsinclair.wisc@gmail.com> Matt Sinclair <mattdsinclair@gmail.com>
Matt Sinclair <mattdsinclair.wisc@gmail.com> Matthew Sinclair <matthew.sinclair@amd.com>
Maurice Becker <madnaurice@googlemail.com>
Maxime Martinasso <maxime.cscs@gmail.com>
Maximilian Stein <maximilian.stein@tu-dresden.de>Maximilian Stein <m@steiny.biz>
Maximilien Breughe <maximilien.breughe@elis.ugent.be> Maximilien Breughe <Maximilien.Breughe@elis.ugent.be>
Melissa Jost <melissakjost@gmail.com>
Melissa Jost <melissakjost@gmail.com> Melissa Jost <50555529+mkjost0@users.noreply.github.com>
Michael Adler <Michael.Adler@intel.com>
Michael Boyer <Michael.Boyer@amd.com>
Michael LeBeane <michael.lebeane@amd.com> Michael LeBeane <Michael.Lebeane@amd.com>
@@ -262,7 +285,6 @@ Min Kyu Jeong <minkyu.jeong@arm.com> Min Kyu Jeong <MinKyu.Jeong@arm.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitchell Hayenga <Mitchell.Hayenga@ARM.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga ext:(%2C%20Amin%20Farmahini%20%3Caminfar%40gmail.com%3E) <mitch.hayenga+gem5@gmail.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga <Mitch.Hayenga@arm.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga <Mitch.Hayenga@ARM.com>
Mitch Hayenga <mitch.hayenga@arm.com> Mitch Hayenga <mitch.hayenga+gem5@gmail.com>
Mohammad Alian <m.alian1369@gmail.com>
Monir Mozumder <monir.mozumder@amd.com>
@@ -279,13 +301,17 @@ Nathan Binkert <nate@binkert.org> Nathan Binkert <binkertn@umich.edu>
Nayan Deshmukh <nayan26deshmukh@gmail.com>
Neha Agarwal <neha.agarwal@arm.com>
Neil Natekar <nanatekar@ucdavis.edu>
Nicholas Lindsay <nicholas.lindsay@arm.com>
Nicholas Lindsay <nicholas.lindsay@arm.com> Nicholas Lindsay <Nicholas.Lindsey@arm.com>
Nicholas Mosier <nmosier@stanford.edu> Nicholas Mosier <nh.mosier@gmail.com>
Nicolas Boichat <drinkcat@google.com>
Nicolas Derumigny <nderumigny@gmail.com>
Nicolas Zea <nicolas.zea@gmail.com>
Nikolaos Kyparissas <nikolaos.kyparissas@arm.com>
Nikos Nikoleris <nikos.nikoleris@arm.com> Nikos Nikoleris <nikos.nikoleris@gmail.com>
Nilay Vaish ext:(%2C%20Timothy%20Jones%20%3Ctimothy.jones%40cl.cam.ac.uk%3E) <nilay@cs.wisc.edu>
Nils Asmussen <nils.asmussen@barkhauseninstitut.org> Nils Asmussen <nilsasmussen7@gmail.com>
Nitesh Narayana <nitesh.dps@gmail.com>
Nitish Arya <42148385+aryanitish@users.noreply.github.com>
Noah Katz <nkatz@rivosinc.com>
ntampouratzis <ntampouratzis@isc.tuc.gr>
Nuwan Jayasena <Nuwan.Jayasena@amd.com>
@@ -293,7 +319,6 @@ Ola Jeppsson <ola.jeppsson@gmail.com>
Omar Naji <Omar.Naji@arm.com>
Onur Kayiran <onur.kayiran@amd.com>
Pablo Prieto <pablo.prieto@unican.es>
paikunal <kunpai@ucdavis.edu>
Palle Lyckegaard <palle@lyckegaard.dk>
Pau Cabre <pau.cabre@metempsy.com>
Paul Rosenfeld <prosenfeld@micron.com> Paul Rosenfeld <dramninjas@gmail.com>
@@ -308,29 +333,39 @@ Po-Hao Su <supohaosu@gmail.com>
Polina Dudnik <pdudnik@cs.wisc.edu> Polina Dudnik <pdudnik@gmail.com>
Polydoros Petrakis <ppetrak@ics.forth.gr>
Pouya Fotouhi <pfotouhi@ucdavis.edu> Pouya Fotouhi <Pouya.Fotouhi@amd.com>
Prajwal Hegde <prhegde@wisc.edu>
Prakash Ramrakhyani <prakash.ramrakhyani@arm.com> Prakash Ramrakhani <Prakash.Ramrakhani@arm.com>
Prakash Ramrakhyani <prakash.ramrakhyani@arm.com> Prakash Ramrakhyani <Prakash.Ramrakhyani@arm.com>
Pritha Ghoshal <pritha9987@tamu.edu>
Pu (Luke) Yi <lukeyi@stanford.edu>
Quentin Forcioli <quentin.forcioli@telecom-paris.fr>
Radhika Jagtap <radhika.jagtap@arm.com> Radhika Jagtap <radhika.jagtap@ARM.com>
Rahul Thakur <rjthakur@google.com>
Reiley Jeapaul <Reiley.Jeyapaul@arm.com>
Rajarshi Das <drajarsh@gmail.com>
Ranganath (Bujji) Selagamsetty <bujji.selagamsetty@amd.com> BujSet <ranganath1000@gmail.com>
Razeza <borisov.dn@phystech.edu>
Reiley Jeapaul <reiley.jeyapaul@arm.com> Reiley Jeapaul <Reiley.Jeyapaul@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai Gonzalez Alberquilla <rekai.gonzalezalberquilla@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai Gonzalez Alberquilla <Rekai.GonzalezAlberquilla@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai Gonzalez-Alberquilla <Rekai.GonzalezAlberquilla@arm.com>
Rekai Gonzalez-Alberquilla <rekai.gonzalezalberquilla@arm.com> Rekai <Rekai.GonzalezAlberquilla@arm.com>
Rene de Jong <rene.dejong@arm.com>
Ricardo Alves <ricardo.alves@arm.com>
Richard Cooper <richard.cooper@arm.com>
Richard D. Strong <r.d.strong@gmail.com>
Richard Strong <rstrong@hp.com> Richard D. Strong <r.d.strong@gmail.com>
Richard Strong <rstrong@hp.com> Richard Strong <r.d.strong@gmail.com>
Richard Strong <rstrong@hp.com> Richard Strong <rstrong@cs.ucsd.edu>
Richard Strong <rstrong@hp.com> Rick Strong <rstrong@cs.ucsd.edu>
Rico Amslinger <rico.amslinger@informatik.uni-augsburg.de>
Riken Gohil <Riken.Gohil@arm.com>
Rizwana Begum <rb639@drexel.edu>
Robert Hauser <85344819+robhau@users.noreply.github.com>
Robert Kovacsics <rmk35@cl.cam.ac.uk>
Robert Scheffel <robert.scheffel1@tu-dresden.de> Robert <robert.scheffel1@tu-dresden.de>
Rocky Tatiefo <rtatiefo@google.com>
Roger Chang <rogerycchang@google.com> rogerchang23424 <rogerycchang@google.com>
Roger Chang <rogerycchang@google.com> rogerchang23424 <32214817+rogerchang23424@users.noreply.github.com>
Roger Chang <rogerycchang@google.com> rogerchang23424 <aucixw45876@gmail.com>
Roger Chang <rogerycchang@google.com> Yu-Cheng Chang <rogerycchang@google.com>
Rohit Kurup <rohit.kurup@arm.com>
Ron Dreslinski <rdreslin@umich.edu> Ronald Dreslinski <rdreslin@umich.edu>
Ruben Ayrapetyan <ruben.ayrapetyan@arm.com>
@@ -342,23 +377,21 @@ sacak32 <byrakocalan99@gmail.com>
Sampad Mohapatra <sampad.mohapatra@gmail.com>
Samuel Grayson <sam@samgrayson.me>
Samuel Stark <samuel.stark2@arm.com>
Sandipan Das <31861871+sandip4n@users.noreply.github.com>
Sandipan Das <sandipan@linux.ibm.com> Sandipan Das <31861871+sandip4n@users.noreply.github.com>
Santi Galan <santi.galan@metempsy.com>
Sascha Bischoff <sascha.bischoff@arm.com> Sascha Bischoff <sascha.bischoff@ARM.com>
Sascha Bischoff <sascha.bischoff@arm.com> Sascha Bischoff <Sascha.Bischoff@ARM.com>
Saúl Adserias <33020671+saul44203@users.noreply.github.com>
Sean McGoogan <Sean.McGoogan@arm.com>
Sean Wilson <spwilson2@wisc.edu>
Sergei Trofimov <sergei.trofimov@arm.com>
Severin Wischmann <wiseveri@student.ethz.ch> Severin Wischmann ext:(%2C%20Ioannis%20Ilkos%20%3Cioannis.ilkos09%40imperial.ac.uk%3E) <wiseveri@student.ethz.ch>
Shawn Rosti <shawn.rosti@gmail.com>
Sherif Elhabbal <elhabbalsherif@gmail.com>
Shivani Parekh <shparekh@ucdavis.edu>
Shivani <shparekh@ucdavis.edu>
Shivani Parekh <shparekh@ucdavis.edu> Shivani <shparekh@ucdavis.edu>
Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Simon Park <seminpark@google.com>
Somayeh Sardashti <somayeh@cs.wisc.edu>
Sooraj Puthoor <puthoorsooraj@gmail.com>
Sooraj Puthoor <Sooraj.Puthoor@amd.com>
Sooraj Puthoor <puthoorsooraj@gmail.com> Sooraj Puthoor <Sooraj.Puthoor@amd.com>
Sophiane Senni <sophiane.senni@gmail.com>
Soumyaroop Roy <sroy@cse.usf.edu>
Srikant Bharadwaj <srikant.bharadwaj@amd.com>
@@ -370,7 +403,6 @@ Steve Raasch <sraasch@umich.edu>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E%2C%20Ali%20Saidi%20%3CAli.Saidi%40ARM.com%3E) <stever@gmail.com>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt <stever@eecs.umich.edu>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt <steve.reinhardt@amd.com>
Steve Reinhardt <stever@gmail.com> Steve Reinhardt <Steve.Reinhardt@amd.com>
Stian Hvatum <stian@dream-web.no>
Sudhanshu Jha <sudhanshu.jha@arm.com>
Sujay Phadke <electronicsguy123@gmail.com>
@@ -378,16 +410,18 @@ Sungkeun Kim <ksungkeun84@tamu.edu>
Swapnil Haria <swapnilster@gmail.com> Swapnil Haria <swapnilh@cs.wisc.edu>
Taeho Kgil <tkgil@umich.edu>
Tao Zhang <tao.zhang.0924@gmail.com>
Thilo Vörtler <thilo.voertler@coseda-tech.com> root <thilo.voertler@coseda-tech.com>
Thomas Grass <Thomas.Grass@ARM.com>
Tiago Mück <tiago.muck@arm.com> Tiago Muck <tiago.muck@arm.com>
Tiberiu Bucur <36485854+TiberiuBucur@users.noreply.github.com>
Tim Harris <tharris@microsoft.com>
Timothy Hayes <timothy.hayes@arm.com>
Timothy M. Jones <timothy.jones@arm.com> Timothy Jones <timothy.jones@cl.cam.ac.uk>
Timothy M. Jones <timothy.jones@arm.com> Timothy M. Jones <timothy.jones@cl.cam.ac.uk>
Timothy M. Jones <timothy.jones@arm.com> Timothy M. Jones <tjones1@inf.ed.ac.uk>
Tom Jablin <tjablin@gmail.com>
Tommaso Marinelli <tommarin@ucm.es>
Tom Rollet <tom.rollet@huawei.com>
Tommaso Marinelli <tommarin@ucm.es>
Tong Shen <endlessroad@google.com>
Tony Gutierrez <anthony.gutierrez@amd.com> Anthony Gutierrez <atgutier@umich.edu>
Travis Boraten <travis.boraten@amd.com>
@@ -401,6 +435,7 @@ Victor Garcia <victor.garcia@arm.com>
Vilas Sridharan <vilas.sridharan@gmail.com>
Vincentius Robby <acolyte@umich.edu>
Vince Weaver <vince@csl.cornell.edu>
Vishnu Ramadas <vramadas@outlook.com>
vramadas95 <vramadas@wisc.edu>
vsoria <victor.soria@bsc.es>
Wade Walker <wade.walker@arm.com>
@@ -409,14 +444,16 @@ Weiping Liao <weipingliao@google.com>
Wende Tan <twd2@163.com>
Wendy Elsasser <wendy.elsasser@arm.com>
William Wang <william.wang@arm.com> William Wang <William.Wang@arm.com>
William Wang <william.wang@arm.com> William Wang <William.Wang@ARM.com>
Willy Wolff <willy.mh.wolff.ml@gmail.com>
Wing Li <wingers@google.com>
wmin0 <wmin0@hotmail.com>
Xiangyu Dong <rioshering@gmail.com>
Xianwei Zhang <xianwei.zhang.@amd.com> Xianwei Zhang <xianwei.zhang@amd.com>
Xiaoyu Ma <xiaoyuma@google.com>
Xin Ouyang <xin.ouyang@streamcomputing.com>
Xiongfei <xiongfei.liao@gmail.com>
Xuan Hu <huxuan@bosc.ac.cn>
Yan Lee <yanlee@google.com>
Yasuko Eckert <yasuko.eckert@amd.com>
Yen-lin Lai <yenlinlai@google.com>
Yifei Liu <liu.ad2039@gmail.com>
@@ -426,7 +463,10 @@ Yuan Yao <yuanyao@seas.harvard.edu>
Yuetsu Kodama <yuetsu.kodama@riken.jp> yuetsu.kodama <yuetsu.kodama@riken.jp>
Yu-hsin Wang <yuhsingw@google.com>
Zhang Zheng <perise@gmail.com>
Zhantong Qiu <ztqiu@ucdavis.edu>
Zhantong Qiu <ztqiu@ucdavis.edu> studyztp <studyztp@gmail.com>
Zhengrong Wang <seanzw@ucla.edu> seanzw <seanyukigeek@gmail.com>
Zhengrong Wang <seanzw@ucla.edu> Zhengrong Wang <seanyukigeek@gmail.com>
zhongchengyong <zhongcy93@gmail.com>
Zicong Wang <wangzicong@nudt.edu.cn>
Zixian Cai <2891235+caizixian@users.noreply.github.com>
zmckevitt <zack.mckevitt@gmail.com>

View File

@@ -1,3 +1,161 @@
# Version 24.0
gem5 Version 24.0 is the first major release of 2024.
During this time there have been 298 pull requests merged, comprising over 600 commits, from 56 unique contributors.
## API and user-facing changes
* The GCN3 GPU model has been removed in favor of the newer VEGA GPU model.
* gem5 now supports building, running, and simulating Ubuntu 24.04.
### Compiler and OS support
As of this release gem5 supports Clang versions 6 to 16 and GCC versions 10 to 13.
While other compilers and versions may work, they are not regularly tested.
gem5 now supports building, running, and simulating on Ubuntu 24.04.
We continue to support 22.04 with 20.04 being deprecated in the coming year.
The majority of our testing is done on Ubuntu LTS systems though Apple Silicon machines and other Linux distributions have also been used regularly during development.
Improvements have been made to ensure a wider support of operating systems.
## New features
### gem5 MultiSim: Multiprocessing for gem5
The gem5 "MultiSim" module allows for multiple simulations to be run from a single gem5 execution via a single gem5 configuration script.
This allows for multiple simulations to be run in parallel in a structured manner.
To use MultiSim first create multiple simulators and add them to the MultiSim with the `add_simulator` function.
If needed, limit the maximum number of parallel processes with the `set_num_processes` function.
Then run the simulations in parallel with the `gem5` binary using `-m gem5.utils.multisim`.
Here is an example of how to use MultiSim:
```python
import gem5.utils.multisim as multisim
# Set the maximum number of processes to run in parallel
multisim.set_num_processes(4)
# Create multiple simulators.
# In this case, one for each workload in the benchmark suite.
for workload in benchmark_suite:
board = X86Board(
# ...
)
board.set_workload(workload)
    # Useful to set the ID here. This is used to create unique output
    # directories for each gem5 process and can be used to identify and
    # run gem5 processes individually.
simulator = Simulator(board, id=f"{workload.get_id()}")
multisim.add_simulator(simulator)
```
Then to run the simulations in parallel:
```sh
<gem5 binary> -m gem5.utils.multisim <config script>
```
The output directory ("m5out" by default) will contain sub-directories for each simulation run.
The sub-directory will be named after the simulator ID set in the configuration script.
We therefore recommend setting the simulator ID to something meaningful to help identify the output directories (i.e., the workload run or something identifying the meaningful characteristics of the simulated system in comparison to others).
If only one of the simulations specified in the config needs to be run, you can do so with:
```sh
<gem5 binary> <config script> --list # Lists the simulations by ID
<gem5 binary> <config script> <ID> # Run the simulation with the specified ID.
```
Example scripts of using MultiSim can be found in "configs/example/gem5_library/multisim".
### RISC-V Vector Extension Support
There have been significant improvements to the RVV support in gem5 including
* Fixed viota (#1137)
* Fixed vrgather (#1134)
* Added RVV FP16 support (#1123)
* Fixed widening and narrowing instructions (#1079)
* Fixed bug in vfmv.f.s (#863)
* Add unit stride segment loads and stores (#851) (#913)
* Fix vl in masked load/store (#830)
* Add unit-stride loads (#794)
* Fix many RVV instructions (#814) (#805) (#715)
### General RISC-V bugfixes
* Fixed problem in TLB lookup (#1264)
* Fixed sign-extended branch target (#1173)
* Fixed compressed jump instructions (#1163)
* Fixed GDB connection (#1152)
* Fixed CSR behavior (#1099)
* Add Integer conditional operations Zicond (#1078)
* Add RISC-V Semihosting support (#681)
* Added more detailed instruction types (#589)
* Fixed 32-bit m5op arguments (#900)
* Fixed c.fswsp and c.fsw (#998) (#1005)
* Update PLIC implementation (#886)
* Fix fflags behavior in O3 (#868)
* Add support for local interrupts (#813)
* Removed bit 63 of physical address (#756)
## Improvements
* Added a new generator which can generate requests based on [spatter](https://github.com/hpcgarage/spatter) patterns.
* KVM is now supported in the gem5 Standard Library ARM Board.
* Generic Cache template added to the Standard Library: https://github.com/gem5/gem5/pull/745
* Support added for partitioning caches.
* The Standard Library `obtain_resources` function can request multiple resources at once thus reducing delay associated with multiple requests.
* An official gem5 DevContainer has been added to the gem5 repository.
This can be used to build and run gem5 in a consistent environment and enables GitHub Codespaces support.
### gem5 Python Statistics
The gem5 Python statistics API has been improved.
The gem5 Project's general intent with this improvement is to make it easier and more desirable to obtain and interact with gem5 simulation statistics via Python.
For example, the following code snippet demonstrates how to obtain statistics from a gem5 simulation:
```python
from m5.stats.gem5stats import get_simstat
## Setup and run the configuration ...
simstat = get_simstat(board)
# Print the number of cycles the CPU at index 0 has executed.
print(simstat.cpu[0].numCycles)
# Strings can also be used to access statistics.
print(simstat['cpu'][0]['numCycles'])
# Print the total number of cycles executed by all CPUs.
print(sum(simstat.cpu[i].numCycles for i in range(len(simstat.cpu))))
```
We hope the usage of the gem5 Python statistics API will be more intuitive and easier to use while allowing better processing of statistical data.
### GPU Model
* Support for MI300X and MI200 GPU models including their features and most instructions.
* ROCm 6.1 disk image and compile docker files have been added. ROCm 5.4.2 and 4.2 resources are removed.
* The deprecated GCN3 ISA has been removed. Use VEGA instead.
## Bug Fixes
* An integer overflow error known to affect the `AddrRange` class has been fixed.
* Fix fflags behavior of floating point instruction in RISC-V for Out-of-Order CPUs.
### Arm FEAT_MPAM Support
An initial implementation of FEAT_MPAM has been introduced in gem5 with the capability to statically partition
classic caches. Guidance on how to use this is available on an Arm community [blog post](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/gem5-cache-partitioning)
# Version 23.1
gem5 Version 23.1 is our first release where the development has been on GitHub.

View File

@@ -117,6 +117,8 @@ AddOption('--no-compress-debug', action='store_true',
help="Don't compress debug info in build files")
AddOption('--with-lto', action='store_true',
help='Enable Link-Time Optimization')
AddOption('--with-libcxx', action='store_true',
help='Use libc++ as the C++ standard library (requires Clang)')
AddOption('--verbose', action='store_true',
help='Print full tool command lines')
AddOption('--without-python', action='store_true',
@@ -550,11 +552,6 @@ for variant_path in variant_paths:
env.Append(CCFLAGS=['-pipe'])
env.Append(CCFLAGS=['-fno-strict-aliasing'])
# Enable -Wall and -Wextra and then disable the few warnings that
# we consistently violate
env.Append(CCFLAGS=['-Wall', '-Wundef', '-Wextra',
'-Wno-sign-compare', '-Wno-unused-parameter'])
# We always compile using C++17
env.Append(CXXFLAGS=['-std=c++17'])
@@ -567,6 +564,16 @@ for variant_path in variant_paths:
with gem5_scons.Configure(env) as conf:
conf.CheckLinkFlag('-Wl,--as-needed')
want_libcxx = GetOption('with_libcxx')
if want_libcxx:
with gem5_scons.Configure(env) as conf:
# Try using libc++ if it supports the <filesystem> library.
code = '#include <filesystem>\nint main() { return 0; }'
if (not conf.CheckCxxFlag('-stdlib=libc++') or
not conf.CheckLinkFlag('-stdlib=libc++', code=code)
):
error('Requested libc++ but it is not usable')
linker = GetOption('linker')
if linker:
with gem5_scons.Configure(env) as conf:
@@ -597,6 +604,13 @@ for variant_path in variant_paths:
env.Append(LINKFLAGS=['-Wl,--no-keep-memory'])
else:
error("Unable to use --no-keep-memory with the linker")
# Treat warnings as errors but white list some warnings that we
# want to allow (e.g., deprecation warnings).
env.Append(CCFLAGS=['-Werror',
'-Wno-error=deprecated-declarations',
'-Wno-error=deprecated',
])
else:
error('\n'.join((
"Don't know what compiler options to use for your compiler.",
@@ -612,8 +626,8 @@ for variant_path in variant_paths:
"src/SConscript to support that compiler.")))
if env['GCC']:
if compareVersions(env['CXXVERSION'], "7") < 0:
error('gcc version 7 or newer required.\n'
if compareVersions(env['CXXVERSION'], "10") < 0:
error('gcc version 10 or newer required.\n'
'Installed version:', env['CXXVERSION'])
# Add the appropriate Link-Time Optimization (LTO) flags if
@@ -637,17 +651,6 @@ for variant_path in variant_paths:
'-fno-builtin-malloc', '-fno-builtin-calloc',
'-fno-builtin-realloc', '-fno-builtin-free'])
if compareVersions(env['CXXVERSION'], "9") < 0:
# `libstdc++fs`` must be explicitly linked for `std::filesystem``
# in GCC version 8. As of GCC version 9, this is not required.
#
# In GCC 7 the `libstdc++fs`` library explicit linkage is also
# required but the `std::filesystem` is under the `experimental`
# namespace(`std::experimental::filesystem`).
#
# Note: gem5 does not support GCC versions < 7.
env.Append(LIBS=['stdc++fs'])
elif env['CLANG']:
if compareVersions(env['CXXVERSION'], "6") < 0:
error('clang version 6 or newer required.\n'
@@ -665,7 +668,7 @@ for variant_path in variant_paths:
env.Append(TCMALLOC_CCFLAGS=['-fno-builtin'])
if compareVersions(env['CXXVERSION'], "11") < 0:
if not want_libcxx and compareVersions(env['CXXVERSION'], "11") < 0:
# `libstdc++fs`` must be explicitly linked for `std::filesystem``
# in clang versions 6 through 10.
#
@@ -679,7 +682,7 @@ for variant_path in variant_paths:
# On Mac OS X/Darwin we need to also use libc++ (part of XCode) as
# opposed to libstdc++, as the later is dated.
if sys.platform == "darwin":
if not want_libcxx and sys.platform == "darwin":
env.Append(CXXFLAGS=['-stdlib=libc++'])
env.Append(LIBS=['c++'])
@@ -688,20 +691,26 @@ for variant_path in variant_paths:
if GetOption('with_ubsan'):
sanitizers.append('undefined')
if GetOption('with_asan'):
# Available for gcc >= 5 or llvm >= 3.1 both a requirement
# by the build system
sanitizers.append('address')
suppressions_file = Dir('util').File('lsan-suppressions').get_abspath()
suppressions_opt = 'suppressions=%s' % suppressions_file
suppressions_opts = ':'.join([suppressions_opt,
'print_suppressions=0'])
env['ENV']['LSAN_OPTIONS'] = suppressions_opts
print()
warning('To suppress false positive leaks, set the LSAN_OPTIONS '
'environment variable to "%s" when running gem5' %
suppressions_opts)
warning('LSAN_OPTIONS=%s' % suppressions_opts)
print()
if env['GCC']:
# Address sanitizer is not supported with GCC. Please see Github
# Issue https://github.com/gem5/gem5/issues/916 for more details.
warning("Address Sanitizer is not supported with GCC. "
"This option will be ignored.")
else:
# Available for llvm >= 3.1. A requirement by the build system.
sanitizers.append('address')
suppressions_file = Dir('util').File('lsan-suppressions')\
.get_abspath()
suppressions_opt = 'suppressions=%s' % suppressions_file
suppressions_opts = ':'.join([suppressions_opt,
'print_suppressions=0'])
env['ENV']['LSAN_OPTIONS'] = suppressions_opts
print()
warning('To suppress false positive leaks, set the LSAN_OPTIONS '
'environment variable to "%s" when running gem5' %
suppressions_opts)
warning('LSAN_OPTIONS=%s' % suppressions_opts)
print()
if sanitizers:
sanitizers = ','.join(sanitizers)
if env['GCC'] or env['CLANG']:

View File

@@ -7,3 +7,4 @@ USE_POWER_ISA=y
USE_RISCV_ISA=y
USE_SPARC_ISA=y
USE_X86_ISA=y
USE_TEST_OBJECTS=y

View File

@@ -1,6 +0,0 @@
RUBY=y
RUBY_PROTOCOL_GPU_VIPER=y
BUILD_ISA=y
USE_X86_ISA=y
GCN3_GPU_ISA=y
BUILD_GPU=y

View File

@@ -211,8 +211,7 @@ code.indent()
if sim_object == SimObject:
code(
"""
SimObjectParams() {}
virtual ~SimObjectParams() {}
virtual ~SimObjectParams() = default;
std::string name;
"""

View File

@@ -224,7 +224,7 @@ for cpu in system.cpu:
if ObjectList.is_kvm_cpu(CPUClass) or ObjectList.is_kvm_cpu(FutureClass):
if buildEnv["USE_X86_ISA"]:
system.kvm_vm = KvmVM()
system.m5ops_base = 0xFFFF0000
system.m5ops_base = max(0xFFFF0000, Addr(args.mem_size).getValue())
for process in multiprocesses:
process.useArchPT = True
process.kvmInSE = True

View File

@@ -335,6 +335,12 @@ parser.add_argument(
default="dynamic",
help="register allocation policy (simple/dynamic)",
)
parser.add_argument(
"--register-file-cache-size",
type=int,
default=0,
help="number of registers in cache",
)
parser.add_argument(
"--dgpu",
@@ -369,11 +375,33 @@ parser.add_argument(
parser.add_argument(
"--gfx-version",
type=str,
default="gfx801",
default="gfx902",
choices=GfxVersion.vals,
help="Gfx version for gpuNote: gfx902 is not fully supported by ROCm",
)
parser.add_argument(
"--tcp-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcp",
)
parser.add_argument(
"--tcc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcc",
)
# sqc rp both changes sqc rp and scalar cache rp
parser.add_argument(
"--sqc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for sqc",
)
Ruby.define_options(parser)
# add TLB options to the parser
@@ -428,6 +456,7 @@ print(
# shader is the GPU
shader = Shader(
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
clk_domain=SrcClockDomain(
clock=args.gpu_clock,
voltage_domain=VoltageDomain(voltage=args.gpu_voltage),
@@ -493,6 +522,7 @@ for i in range(n_cu):
vrfs = []
vrf_pool_mgrs = []
srfs = []
rfcs = []
srf_pool_mgrs = []
for j in range(args.simds_per_cu):
for k in range(shader.n_wf):
@@ -537,10 +567,16 @@ for i in range(n_cu):
simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size
)
)
rfcs.append(
RegisterFileCache(
simd_id=j, cache_size=args.register_file_cache_size
)
)
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
compute_units[-1].scalar_register_file = srfs
compute_units[-1].register_file_cache = rfcs
compute_units[-1].register_manager = RegisterManager(
policy=args.registerManagerPolicy,
vrf_pool_managers=vrf_pool_mgrs,
@@ -671,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}")
gpu_hsapp = HSAPacketProcessor(
pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues
)
dispatcher = GPUDispatcher()
dispatcher = GPUDispatcher(kernel_exit_events=True)
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
@@ -798,6 +834,8 @@ if fast_forward:
# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx)
system.exit_on_work_items = True
# create Ruby system
system.piobus = IOXBar(
width=32, response_latency=0, frontend_latency=0, forward_latency=0
@@ -938,19 +976,15 @@ root = Root(system=system, full_system=False)
# knows what type of GPU hardware we are simulating
if args.dgpu:
assert args.gfx_version in [
"gfx803",
"gfx900",
], "Incorrect gfx version for dGPU"
if args.gfx_version == "gfx803":
hsaTopology.createFijiTopology(args)
elif args.gfx_version == "gfx900":
if args.gfx_version == "gfx900":
hsaTopology.createVegaTopology(args)
else:
assert args.gfx_version in [
"gfx801",
"gfx902",
], "Incorrect gfx version for APU"
hsaTopology.createCarrizoTopology(args)
hsaTopology.createRavenTopology(args)
m5.ticks.setGlobalFrequency("1THz")
if args.abs_max_tick:
@@ -976,6 +1010,41 @@ if args.fast_forward:
exit_event = m5.simulate(maxtick)
while True:
if (
exit_event.getCause() == "m5_exit instruction encountered"
or exit_event.getCause() == "user interrupt received"
or exit_event.getCause() == "simulate() limit reached"
or "exiting with last active thread context" in exit_event.getCause()
):
print(f"breaking loop due to: {exit_event.getCause()}.")
break
elif "checkpoint" in exit_event.getCause():
assert args.checkpoint_dir is not None
m5.checkpoint(args.checkpoint_dir)
print("breaking loop with checkpoint")
break
elif "GPU Kernel Completed" in exit_event.getCause():
print("GPU Kernel Completed dump and reset")
m5.stats.dump()
m5.stats.reset()
elif "GPU Blit Kernel Completed" in exit_event.getCause():
print("GPU Blit Kernel Completed dump and reset")
m5.stats.dump()
m5.stats.reset()
elif "workbegin" in exit_event.getCause():
print("m5 work begin dump and reset")
m5.stats.dump()
m5.stats.reset()
elif "workend" in exit_event.getCause():
print("m5 work end dump and reset")
m5.stats.dump()
m5.stats.reset()
else:
print(f"Unknown exit event: {exit_event.getCause()}. Continuing...")
exit_event = m5.simulate(maxtick - m5.curTick())
if args.fast_forward:
if exit_event.getCause() == "a thread reached the max instruction count":
m5.switchCpus(system, switch_cpu_list)

View File

@@ -53,15 +53,24 @@ from common import (
MemConfig,
ObjectList,
)
from common.cores.arm import HPI
from common.cores.arm import (
HPI,
O3_ARM_v7a,
)
# Pre-defined CPU configurations. Each tuple must be ordered as : (cpu_class,
# l1_icache_class, l1_dcache_class, walk_cache_class, l2_Cache_class). Any of
# l1_icache_class, l1_dcache_class, l2_Cache_class). Any of
# the cache class may be 'None' if the particular cache is not present.
cpu_types = {
"atomic": (AtomicSimpleCPU, None, None, None),
"minor": (MinorCPU, devices.L1I, devices.L1D, devices.L2),
"hpi": (HPI.HPI, HPI.HPI_ICache, HPI.HPI_DCache, HPI.HPI_L2),
"o3": (
O3_ARM_v7a.O3_ARM_v7a_3,
O3_ARM_v7a.O3_ARM_v7a_ICache,
O3_ARM_v7a.O3_ARM_v7a_DCache,
O3_ARM_v7a.O3_ARM_v7aL2,
),
}

View File

@@ -0,0 +1,201 @@
# Copyright (c) 2024 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# This script showcases the functionality of cache partitioning policies,
# containing a simple system composed of a memory requestor (TrafficGen),
# a cache enforcing policies for requests and a SimpleMemory backing store.
#
# Using the Way policy, the cache should show the following statistics in the
# provided configuration:
#
# | Allocated Ways | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
# |----------------|---|-----|-----|-----|-----|-----|-----|------|
# | Cache Hits | 0 | 256 | 384 | 512 | 640 | 768 | 896 | 1024 |
#
# Using the MaxCapacity policy, expected results are the following:
#
# | Allocation % | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
# |--------------|----|-----|-----|-----|-----|-----|-----|-----|-----|------|
# | Cache Hits | 0 | 152 | 307 | 409 | 512 | 614 | 716 | 819 | 921 | 1024 |
import argparse
import m5
from m5.objects import *
def capacityAllocation(capacity_str):
    """Parse and validate a Max Capacity policy allocation from the CLI.

    Converts the raw argument string to a float and checks that it lies
    within the [0, 1] range required by the Max Capacity partitioning
    policy.

    :param capacity_str: raw command-line argument string
    :returns: the parsed capacity as a float
    :raises argparse.ArgumentTypeError: if the value is outside [0, 1]
    """
    fraction = float(capacity_str)
    out_of_range = fraction > 1 or fraction < 0
    if out_of_range:
        raise argparse.ArgumentTypeError(
            "Max Capacity Policy needs allocation in range [0, 1]"
        )
    return fraction
def wayAllocation(way_str):
    """Parse and validate a Way policy allocation from the CLI.

    Converts the raw argument string to an int and checks that it is a
    non-negative number of ways for the Way partitioning policy.

    :param way_str: raw command-line argument string
    :returns: the parsed number of ways as an int
    :raises argparse.ArgumentTypeError: if the value is negative
    """
    ways = int(way_str)
    if ways >= 0:
        return ways
    raise argparse.ArgumentTypeError(
        "Way Policy needs positive number of ways"
    )
def generatePartPolicy(args):
    """Build the cache partitioning policy object selected on the CLI.

    Only the "way" and "max_capacity" policies are supported; any other
    value trips the assertion. All allocations are made for the default
    partition id 0.

    :param args: parsed argparse namespace with ``policy``,
        ``way_allocation`` and ``capacity_allocation`` attributes
    :returns: a WayPartitioningPolicy or MaxCapacityPartitioningPolicy
    """
    assert args.policy in [
        "way",
        "max_capacity",
    ], "Only support generating way and max_capacity policies"
    if args.policy == "max_capacity":
        return MaxCapacityPartitioningPolicy(
            partition_ids=[0], capacities=[args.capacity_allocation]
        )
    # args.policy == "way": allocate the first N ways to partition 0.
    ways = list(range(args.way_allocation))
    return WayPartitioningPolicy(
        allocations=[WayPolicyAllocation(partition_id=0, ways=ways)]
    )
def configSystem():
    """
    Configure base system and memory.

    Builds a System whose memory bus is a 128-byte-wide IOXBar, clocked by
    a 10THz clock domain, with a 64KiB SimpleMemory (1GiB/s bandwidth,
    10ns latency) attached to the crossbar's memory-side ports.

    :returns: the configured System object (cache and traffic generator
        are attached by the caller)
    """
    system = System(membus=IOXBar(width=128))
    system.clk_domain = SrcClockDomain(
        clock="10THz",
        voltage_domain=VoltageDomain(),
    )
    # Memory configuration: simple fixed-latency, fixed-bandwidth store.
    system.mem_ctrl = SimpleMemory(bandwidth="1GiB/s", latency="10ns")
    # add memory: a 64KiB address range, hooked to the crossbar.
    system.mem_ctrl.range = AddrRange("64KiB")
    system.mem_ctrl.port = system.membus.mem_side_ports
    return system
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--policy",
default="way",
choices=["way", "max_capacity"],
help="This option defines which Cache Partitioning Policy to use for "
"the system cache",
)
parser.add_argument(
"--capacity-allocation",
type=capacityAllocation,
default=0.5,
help="The amount of the cache to partition to the default PartitionID "
"when using Max Capacity Cache Partitioning Policy in [0,1] range",
)
parser.add_argument(
"--way-allocation",
type=wayAllocation,
default=4,
help="The number of ways in the cache to partition to the default "
"PartitionID when using Way Cache Partitioning Policy",
)
args = parser.parse_args()
m5.ticks.setGlobalFrequency("10THz")
system = configSystem()
# create a cache to sit between the memory and traffic gen to enforce
# partitioning policies
part_manager = PartitionManager(
partitioning_policies=[generatePartPolicy(args)]
)
system.cache = NoncoherentCache(
size="64KiB",
assoc=8,
partitioning_manager=part_manager,
tag_latency=0,
data_latency=0,
response_latency=0,
mshrs=1,
tgts_per_mshr=8,
write_buffers=1,
replacement_policy=MRURP(),
)
system.cache.mem_side = system.membus.cpu_side_ports
# instantiate traffic gen and connect to crossbar
system.tgen = PyTrafficGen()
system.tgen.port = system.cache.cpu_side
# finalise config and run simulation
root = Root(full_system=False, system=system)
root.system.mem_mode = "timing"
m5.instantiate()
# configure traffic generator to do 2x 64KiB sequential reads from address 0
# to 65536; one to warm up the cache one to test cache partitioning
linear_tgen = system.tgen.createLinear(
1000000000, 0, 65536, 64, 1, 1, 100, 65536
)
exit_tgen = system.tgen.createExit(1)
system.tgen.start([linear_tgen, linear_tgen, exit_tgen])
# handle exit reporting
exit_event = m5.simulate(2000000000)
print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")

View File

@@ -84,7 +84,7 @@ board.set_se_binary_workload(
# Any resource specified in this file will be automatically retrieved.
# At the time of writing, this file is a WIP and does not contain all
# resources. Jira ticket: https://gem5.atlassian.net/browse/GEM5-1096
obtain_resource("arm-hello64-static")
obtain_resource("arm-hello64-static", resource_version="1.0.0")
)
# Lastly we run the simulation.

View File

@@ -0,0 +1,143 @@
# Copyright (c) 2022-23 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
This script further shows an example of booting an ARM based full system Ubuntu
disk image. This simulation boots the disk image using 2 TIMING CPU cores. The
simulation ends when the startup is completed successfully (i.e. when an
`m5_exit` instruction is reached on successful boot).
Usage
-----
```
scons build/ARM/gem5.opt -j<NUM_CPUS>
./build/ARM/gem5.opt configs/example/gem5_library/arm-ubuntu-run-with-kvm.py
```
"""
from m5.objects import (
ArmDefaultRelease,
VExpress_GEM5_V1,
)
from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.boards.arm_board import ArmBoard
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_switchable_processor import (
SimpleSwitchableProcessor,
)
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.exit_event import ExitEvent
from gem5.simulate.simulator import Simulator
from gem5.utils.requires import requires
# This runs a check to ensure the gem5 binary is compiled for ARM.
requires(isa_required=ISA.ARM)
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
)
# Here we setup the parameters of the l1 and l2 caches.
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
l1d_size="16kB", l1i_size="16kB", l2_size="256kB"
)
# Memory: Dual Channel DDR4 2400 DRAM device.
memory = DualChannelDDR4_2400(size="2GB")
# Here we setup the processor. This is a special switchable processor in which
# a starting core type and a switch core type must be specified. Once a
# configuration is instantiated a user may call `processor.switch()` to switch
# from the starting core types to the switch core types. In this simulation
# we start with KVM cores to simulate the OS boot, then switch to the Timing
# cores for the command we wish to run after boot.
processor = SimpleSwitchableProcessor(
starting_core_type=CPUTypes.KVM,
switch_core_type=CPUTypes.TIMING,
isa=ISA.ARM,
num_cores=2,
)
# The ArmBoard requires a `release` to be specified. This adds all the
# extensions or features to the system. We are setting this to for_kvm()
# to enable KVM simulation.
release = ArmDefaultRelease.for_kvm()
# The platform sets up the memory ranges of all the on-chip and off-chip
# devices present on the ARM system. ARM KVM only works with VExpress_GEM5_V1
# on the ArmBoard at the moment.
platform = VExpress_GEM5_V1()
# Here we setup the board. The ArmBoard allows for Full-System ARM simulations.
board = ArmBoard(
clk_freq="3GHz",
processor=processor,
memory=memory,
cache_hierarchy=cache_hierarchy,
release=release,
platform=platform,
)
# This is the command to run after the system has booted. The first `m5 exit`
# will stop the simulation so we can switch the CPU cores from KVM to timing
# and continue the simulation to run the echo command, sleep for a second,
# then, again, call `m5 exit` to terminate the simulation. After simulation
# has ended you may inspect `m5out/system.pc.com_1.device` to see the echo
# output.
command = (
"m5 --addr=0x10010000 exit;"
+ "echo 'This is running on Timing CPU cores.';"
+ "m5 exit;"
)
# Here we set a full system workload. The "arm64-ubuntu-20.04-boot" boots
# Ubuntu 20.04. We use arm64-bootloader (boot.arm64) as the bootloader to use
# ARM KVM.
board.set_kernel_disk_workload(
kernel=obtain_resource(
"arm64-linux-kernel-5.4.49", resource_version="1.0.0"
),
disk_image=obtain_resource(
"arm64-ubuntu-20.04-img", resource_version="1.0.0"
),
bootloader=obtain_resource("arm64-bootloader", resource_version="1.0.0"),
readfile_contents=command,
)
# We define the system with the aforementioned system defined.
simulator = Simulator(
board=board,
on_exit_event={ExitEvent.EXIT: (func() for func in [processor.switch])},
)
# Once the system successfully boots, it encounters an
# `m5_exit instruction encountered`. We stop the simulation then. When the
# simulation has ended you may inspect `m5out/board.terminal` to see
# the stdout.
simulator.run()

View File

@@ -102,7 +102,9 @@ board = ArmBoard(
# Here we set a full system workload. The "arm64-ubuntu-20.04-boot" boots
# Ubuntu 20.04.
board.set_workload(obtain_resource("arm64-ubuntu-20.04-boot"))
board.set_workload(
obtain_resource("arm64-ubuntu-20.04-boot", resource_version="2.0.0")
)
# We define the system with the aforementioned system defined.

View File

@@ -97,7 +97,9 @@ board = ArmBoard(
platform=platform,
)
board.set_workload(obtain_resource("arm64-ubuntu-20.04-boot"))
board.set_workload(
obtain_resource("arm64-ubuntu-20.04-boot", resource_version="2.0.0")
)
simulator = Simulator(board=board)
simulator.run()

View File

@@ -90,7 +90,9 @@ board = SimpleBoard(
board.set_se_binary_workload(
# the workload should be the same as the save-checkpoint script
obtain_resource("riscv-hello"),
checkpoint=obtain_resource("riscv-hello-example-checkpoint"),
checkpoint=obtain_resource(
"riscv-hello-example-checkpoint", resource_version="3.0.0"
),
)
simulator = Simulator(

View File

@@ -107,8 +107,8 @@ board.set_se_binary_workload(
# Lastly we run the simulation.
max_ticks = 10**6
simulator = Simulator(board=board, full_system=False)
simulator.run(max_ticks=max_ticks)
simulator = Simulator(board=board, full_system=False, max_ticks=max_ticks)
simulator.run()
print(
"Exiting @ tick {} because {}.".format(

View File

@@ -60,8 +60,8 @@ from m5.stats import (
)
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
@@ -80,7 +80,7 @@ requires(isa_required=ISA.X86)
# The cache hierarchy can be different from the cache hierarchy used in taking
# the checkpoints
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32kB",
l1i_size="32kB",
l2_size="256kB",
@@ -125,7 +125,9 @@ board.set_se_simpoint_workload(
weight_list=[0.1, 0.2, 0.4, 0.3],
warmup_interval=1000000,
),
checkpoint=obtain_resource("simpoints-se-checkpoints-v23-0-v1"),
checkpoint=obtain_resource(
"simpoints-se-checkpoints", resource_version="3.0.0"
),
)

View File

@@ -78,7 +78,7 @@ board.set_se_binary_workload(
# Any resource specified in this file will be automatically retrieved.
# At the time of writing, this file is a WIP and does not contain all
# resources. Jira ticket: https://gem5.atlassian.net/browse/GEM5-1096
obtain_resource("arm-hello64-static")
obtain_resource("arm-hello64-static", resource_version="1.0.0")
)
# Lastly we run the simulation.

View File

@@ -48,8 +48,8 @@ from m5.stats import (
)
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
@@ -90,7 +90,7 @@ args = parser.parse_args()
# The cache hierarchy can be different from the cache hierarchy used in taking
# the checkpoints
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32kB",
l1i_size="32kB",
l2_size="256kB",

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2024 The Regents of the University of California.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""An example of a single configuration script for defining multiple
simulations through the gem5 `multisim` module.
This script creates 6 full system simulations by iterating through a suite
of benchmarks and different core counts.
Usage
-----
1. To run all the simulations defined in this script::
```shell
<gem5-binary> -m gem5.utils.multisim \
configs/example/gem5_library/multisim/multisim-fs-x86-npb.py
```
2. To run a specific simulation defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-fs-x86-npb.py \
<process_id> # e.g. npb-bt-a_cores-1
```
3. To list all the IDs of the simulations defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-fs-x86-npb.py -l
```
"""
import m5
import gem5.utils.multisim as multisim
from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.boards.x86_board import X86Board
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_switchable_processor import (
SimpleSwitchableProcessor,
)
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.simulator import (
ExitEvent,
Simulator,
)
from gem5.utils.requires import requires
requires(
isa_required=ISA.X86,
coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL,
)
from gem5.components.cachehierarchies.ruby.mesi_two_level_cache_hierarchy import (
MESITwoLevelCacheHierarchy,
)
def handle_workbegin():
    """WORKBEGIN exit-event handler: reset stats and switch CPU cores.

    Resets the statistics at the start of the region of interest and
    switches the SimpleSwitchableProcessor from its starting (ATOMIC)
    cores to the TIMING cores. Yielding False tells the Simulator to
    continue the run after handling the event.
    """
    m5.stats.reset()
    processor.switch()
    yield False
def handle_workend():
    # Exit-event handler (generator) for ExitEvent.WORKEND: dump the
    # statistics gathered over the region of interest.
    m5.stats.dump()
    # Yielding True tells the simulator to stop the simulation.
    yield True
# Set the maximum number of concurrent processes to be 3.
multisim.set_num_processes(3)
# Here we imagine an experiment wanting to run each NPB benchmark on the same
# system twice: once with 1 core and once with 2 cores.
for benchmark in obtain_resource("npb-benchmark-suite"):
    for num_cores in [1, 2]:
        # A two-level MESI cache hierarchy (private 32kB L1s, banked 256kB
        # L2), matching the MESI_Two_Level protocol required above.
        cache_hierarchy = MESITwoLevelCacheHierarchy(
            l1d_size="32kB",
            l1i_size="32kB",
            l2_size="256kB",
            l1d_assoc=8,
            l1i_assoc=8,
            l2_assoc=16,
            num_l2_banks=2,
        )
        memory = DualChannelDDR4_2400(size="3GB")
        # Switchable processor: boot with fast ATOMIC cores, then switch to
        # detailed TIMING cores when the WORKBEGIN exit event fires.
        processor = SimpleSwitchableProcessor(
            starting_core_type=CPUTypes.ATOMIC,
            switch_core_type=CPUTypes.TIMING,
            isa=ISA.X86,
            num_cores=num_cores,
        )
        board = X86Board(
            clk_freq="3GHz",
            processor=processor,
            memory=memory,
            cache_hierarchy=cache_hierarchy,
        )
        board.set_workload(benchmark)
        # Wire the generator handlers defined above: reset stats and switch
        # cores at WORKBEGIN; dump stats and stop at WORKEND.
        simulator = Simulator(
            board=board,
            on_exit_event={
                ExitEvent.WORKBEGIN: handle_workbegin(),
                ExitEvent.WORKEND: handle_workend(),
            },
        )
        # Give each simulation a unique id (e.g. "npb-bt-a_cores-1") so it
        # can be run or listed individually via the multisim CLI.
        simulator.set_id(f"{benchmark.get_id()}_cores-{num_cores}")
        multisim.add_simulator(simulator)

View File

@@ -0,0 +1,87 @@
# Copyright (c) 2024 The Regents of the University of California
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""An example of a single configuration script for defining multiple
simulations through the gem5 `multisim` module.
This script is intentionally minimal: each simulation simply prints a
message containing its process id.
Usage
-----
1. To run all the simulations defined in this script::
```shell
<gem5-binary> -m gem5.utils.multisim \
configs/example/gem5_library/multisim/multisim-print-this.py
```
2. To run a specific simulation defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-print-this.py \
process_id_1
```
3. To list all the IDs of the simulations defined in this script:
```shell
<gem5-binary> configs/example/gem5_library/multisim/multisim-print-this.py -l
```
"""
import gem5.utils.multisim as multisim
from gem5.components.boards.simple_board import SimpleBoard
from gem5.components.cachehierarchies.classic.no_cache import NoCache
from gem5.components.memory import SingleChannelDDR3_1600
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.isas import ISA
from gem5.resources.resource import obtain_resource
from gem5.simulate.simulator import Simulator
# Set the maximum number of concurrent processes to be 2.
multisim.set_num_processes(2)
# Define five independent simulations, one per process id. Each iteration
# builds a fresh board/workload pair and registers it with multisim.
for process_id in range(5):
    # Minimal SE-mode system: no caches, a single TIMING x86 core, and a
    # small single-channel DDR3 memory.
    cache_hierarchy = NoCache()
    memory = SingleChannelDDR3_1600(size="32MB")
    processor = SimpleProcessor(
        cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=1
    )
    board = SimpleBoard(
        clk_freq="1GHz",
        processor=processor,
        memory=memory,
        cache_hierarchy=cache_hierarchy,
    )
    # Run the "x86-print-this" binary with a per-process message.
    # NOTE(review): the trailing `1` is presumably a repeat count accepted
    # by the binary — confirm against the resource's documentation.
    board.set_se_binary_workload(
        binary=obtain_resource("x86-print-this"),
        arguments=[f"Hello from process {process_id}", 1],
    )
    # The unique id identifies this simulation when running or listing
    # simulations via the multisim CLI.
    multisim.add_simulator(Simulator(board=board, id=f"process_{process_id}"))

View File

@@ -75,7 +75,9 @@ board = SimpleBoard(
cache_hierarchy=cache_hierarchy,
)
board.set_se_binary_workload(obtain_resource("power-hello"))
board.set_se_binary_workload(
obtain_resource("power-hello", resource_version="1.0.0")
)
# Lastly we run the simulation.
simulator = Simulator(board=board)

View File

@@ -40,8 +40,8 @@ Characteristics
"""
from gem5.components.boards.riscv_board import RiscvBoard
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
from gem5.components.memory import SingleChannelDDR3_1600
from gem5.components.processors.cpu_types import CPUTypes
@@ -57,7 +57,7 @@ requires(isa_required=ISA.RISCV)
# Setup the cache hierarchy.
# For classic, PrivateL1PrivateL2 and NoCache have been tested.
# For Ruby, MESI_Two_Level and MI_example have been tested.
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
)
@@ -79,8 +79,10 @@ board = RiscvBoard(
# Set the Full System workload.
board.set_kernel_disk_workload(
kernel=obtain_resource("riscv-bootloader-vmlinux-5.10"),
disk_image=obtain_resource("riscv-disk-img"),
kernel=obtain_resource(
"riscv-bootloader-vmlinux-5.10", resource_version="1.0.0"
),
disk_image=obtain_resource("riscv-disk-img", resource_version="1.0.0"),
)
simulator = Simulator(board=board)

View File

@@ -57,12 +57,12 @@ from gem5.utils.requires import requires
requires(isa_required=ISA.RISCV)
# With RISCV, we use simple caches.
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
# Here we setup the parameters of the l1 and l2 caches.
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="16kB", l1i_size="16kB", l2_size="256kB"
)
@@ -88,7 +88,9 @@ board = RiscvBoard(
# Ubuntu 20.04. Once the system successfully boots it encounters an `m5_exit`
# instruction which stops the simulation. When the simulation has ended you may
# inspect `m5out/system.pc.com_1.device` to see the stdout.
board.set_workload(obtain_resource("riscv-ubuntu-20.04-boot"))
board.set_workload(
obtain_resource("riscv-ubuntu-20.04-boot", resource_version="3.0.0")
)
simulator = Simulator(board=board)
simulator.run()

View File

@@ -76,7 +76,7 @@ board = RISCVMatchedBoard(
# In the case where the `-i` flag is passed, we add the kernel argument
# `init=/root/exit.sh`. This means the simulation will exit after the Linux
# Kernel has booted.
workload = obtain_resource("riscv-ubuntu-20.04-boot")
workload = obtain_resource("riscv-ubuntu-20.04-boot", resource_version="3.0.0")
kernel_args = board.get_default_kernel_args()
if args.to_init:
kernel_args.append("init=/root/exit.sh")

View File

@@ -49,7 +49,9 @@ requires(isa_required=ISA.RISCV)
board = RISCVMatchedBoard()
# set the hello world riscv binary as the board workload
board.set_se_binary_workload(obtain_resource("riscv-hello"))
board.set_se_binary_workload(
obtain_resource("riscv-hello", resource_version="1.0.0")
)
# run the simulation with the RISCV Matched board
simulator = Simulator(board=board, full_system=False)

View File

@@ -45,7 +45,9 @@ requires(isa_required=ISA.RISCV)
board = RISCVMatchedBoard()
# obtain the RISC-V Vertical Microbenchmarks
microbenchmarks = obtain_resource("riscv-vertical-microbenchmarks")
microbenchmarks = obtain_resource(
"riscv-vertical-microbenchmarks", resource_version="1.0.0"
)
# list all the microbenchmarks present in the suite
print("Microbenchmarks present in the suite:")

View File

@@ -0,0 +1,97 @@
# Copyright (c) 2024 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Script that runs a SpatterGen test with a specific trace file.
This script can be used as an example on how to use SpatterGenerator,
SpatterKernel, and its utilities to run a Spatter trace in gem5.
The script uses a spatter trace taken from the hpcgarage github repository.
Link to the original trace file:
https://github.com/hpcgarage/spatter/blob/main/standard-suite/app-traces/amg.json
It will create a system with `num_cores` SpatterGenerators and interleave the
trace by `intlv_size` elements in the `pattern` field from the trace.
Interleaving is done to assign part of the accesses to each core.
Usage:
------
```
scons build/NULL/gem5.opt
./build/NULL/gem5.opt configs/example/gem5_library/spatter_gen/spatter-gen-test.py
```
"""
import argparse
import json
from pathlib import Path
import m5
from m5.objects import Root
from gem5.components.boards.test_board import TestBoard
from gem5.components.cachehierarchies.classic.private_l1_cache_hierarchy import (
PrivateL1CacheHierarchy,
)
from gem5.components.memory import DualChannelDDR4_2400
from gem5.components.processors.spatter_gen import (
SpatterGenerator,
prepare_kernels,
)
from gem5.simulate.simulator import Simulator
# Number of SpatterGen cores, and the interleaving granularity (elements of
# the trace's `pattern` field assigned to each core chunk).
num_cores = 8
intlv_size = 128
memory = DualChannelDDR4_2400(size="8GiB")
generator = SpatterGenerator(
    processing_mode="synchronous", num_cores=num_cores
)
# Split the trace's kernels across the cores, interleaving every
# `intlv_size` elements. NOTE(review): the final two arguments appear to be
# an address offset (0) and a size bound (half of memory) — confirm against
# prepare_kernels' signature.
kernels = prepare_kernels(
    Path(__file__).parent / "traces/amg.json",
    num_cores,
    intlv_size,
    0,
    memory.get_size() // 2,
)
for kernel in kernels:
    generator.add_kernel(kernel)
# Drive the generator through private L1 caches into the DDR4 memory. No
# CPU/ISA is involved; this runs on the NULL build (see module docstring).
board = TestBoard(
    clk_freq="4GHz",
    generator=generator,
    cache_hierarchy=PrivateL1CacheHierarchy(
        l1d_size="32KiB", l1i_size="32KiB"
    ),
    memory=memory,
)
simulator = Simulator(board=board, full_system=False)
simulator.run()

View File

@@ -0,0 +1 @@
[{"delta": 1, "kernel": "Gather", "pattern": [1333, 0, 1, 2, 36, 37, 38, 72, 73, 74, 1296, 1297, 1298, 1332, 1334, 1368], "count": 1454647}, {"delta": 1, "kernel": "Gather", "pattern": [1333, 0, 1, 36, 37, 72, 73, 1296, 1297, 1332, 1368, 1369, 2592, 2593, 2628, 2629], "count": 1454647}]

View File

@@ -77,7 +77,9 @@ parser = argparse.ArgumentParser(
description="An example configuration script to run the gapbs benchmarks."
)
gapbs_suite = obtain_resource("gapbs-benchmark-suite")
gapbs_suite = obtain_resource(
"gapbs-benchmark-suite", resource_version="1.0.0"
)
# The only positional argument accepted is the benchmark name in this script.

View File

@@ -88,7 +88,7 @@ parser = argparse.ArgumentParser(
description="An example configuration script to run the npb benchmarks."
)
npb_suite = obtain_resource("npb-benchmark-suite")
npb_suite = obtain_resource("npb-benchmark-suite", resource_version="1.0.0")
# The only positional argument accepted is the benchmark name in this script.
parser.add_argument(

View File

@@ -185,10 +185,12 @@ board.set_kernel_disk_workload(
# The x86 linux kernel will be automatically downloaded to the
# `~/.cache/gem5` directory if not already present.
# PARSEC benchamarks were tested with kernel version 4.19.83
kernel=obtain_resource("x86-linux-kernel-4.19.83"),
kernel=obtain_resource(
"x86-linux-kernel-4.19.83", resource_version="1.0.0"
),
# The x86-parsec image will be automatically downloaded to the
# `~/.cache/gem5` directory if not already present.
disk_image=obtain_resource("x86-parsec"),
disk_image=obtain_resource("x86-parsec", resource_version="1.0.0"),
readfile_contents=command,
)

View File

@@ -121,7 +121,7 @@ command = (
+ "m5 exit;"
)
workload = obtain_resource("x86-ubuntu-18.04-boot")
workload = obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0")
workload.set_parameter("readfile_contents", command)
board.set_workload(workload)

View File

@@ -117,7 +117,7 @@ command = (
+ "m5 exit;"
)
workload = obtain_resource("x86-ubuntu-18.04-boot")
workload = obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0")
workload.set_parameter("readfile_contents", command)
board.set_workload(workload)

View File

@@ -55,7 +55,9 @@ board = X86DemoBoard()
# We then set the workload. Here we use the "x86-ubuntu-18.04-boot" workload.
# This boots Ubuntu 18.04 with Linux 5.4.49. If the required resources are not
# found locally, they will be downloaded.
board.set_workload(obtain_resource("x86-ubuntu-18.04-boot"))
board.set_workload(
obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0")
)
simulator = Simulator(board=board)
simulator.run()

View File

@@ -58,6 +58,8 @@ class Disjoint_VIPER(RubySystem):
self.network_cpu = DisjointSimple(self)
self.network_gpu = DisjointSimple(self)
self.block_size_bytes = options.cacheline_size
# Construct CPU controllers
cpu_dir_nodes = construct_dirs(options, system, self, self.network_cpu)
(cp_sequencers, cp_cntrl_nodes) = construct_corepairs(

View File

@@ -247,3 +247,9 @@ def addAmdGPUOptions(parser):
default="simple",
help="register allocation policy (simple/dynamic)",
)
parser.add_argument(
"--register-file-cache-size",
type=int,
default=0,
help="number of registers in cache",
)

View File

@@ -0,0 +1,159 @@
# Copyright (c) 2023 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import argparse
import base64
import os
import sys
import tempfile
import runfs
from amd import AmdGPUOptions
from common import (
GPUTLBOptions,
Options,
)
from ruby import Ruby
import m5
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx90a
free -m
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx90a
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
def addDemoOptions(parser):
    """Register the GPU demo options (-a/--app, -o/--opts) on ``parser``.

    :param parser: the argparse.ArgumentParser to extend.
    """
    demo_flags = [
        ("-a", "--app", None, "GPU application to run"),
        ("-o", "--opts", "", "GPU application arguments"),
    ]
    for short_flag, long_flag, default_value, help_text in demo_flags:
        parser.add_argument(
            short_flag, long_flag, default=default_value, help=help_text
        )
def runMI200GPUFS(cpu_type):
    """Configure and launch a full-system gem5 simulation with an MI200 dGPU.

    The host GPU application (``--app``) is base64-encoded into a generated
    runscript that recreates and runs the binary inside the simulated
    system. Several parsed options are then overridden with the values
    required for the MI200 (gfx90a) configuration before handing off to
    ``runfs.runGpuFSSystem``.

    :param cpu_type: the CPU model to simulate (e.g. "X86KvmCPU").
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Validate the required inputs up front so we fail fast with a clear
    # message rather than partway through system construction.
    if args.app is None:
        print(f"No application given. Use {sys.argv[0]} -a <app>")
        sys.exit(1)
    elif args.kernel is None:
        print(f"No kernel path given. Use {sys.argv[0]} --kernel <vmlinux>")
        sys.exit(1)
    elif args.disk_image is None:
        print(f"No disk path given. Use {sys.argv[0]} --disk-image <linux>")
        sys.exit(1)
    elif not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # Embed the application binary in the runscript as base64 so it can be
    # materialized inside the simulated system without a shared filesystem.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)

    if args.second_disk is None:
        args.second_disk = args.disk_image

    # Defaults for MI200
    args.ruby = True
    args.cpu_type = cpu_type
    args.mem_size = "8GB"  # CPU host memory
    args.dgpu = True
    args.dgpu_mem_size = "16GB"  # GPU device memory
    args.dgpu_start = "0GB"
    args.checkpoint_restore = 0
    args.disjoint = True
    args.timing_gpu = True
    args.script = tempRunscript
    args.dgpu_xor_low_bit = 0
    args.gpu_device = "MI200"

    # Run gem5
    runfs.runGpuFSSystem(args)


if __name__ == "__m5_main__":
    runMI200GPUFS("X86KvmCPU")

View File

@@ -0,0 +1,172 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
""" This file creates an X86 system with a KVM CPU and GPU device capable of
running the MI300 ISA (gfx942). Most of this file sets up a runscript which
will load in a binary, shell script, or python file from the host and run that
within gem5. Jump to line 146 for list of system parameters to configure.
"""
import argparse
import base64
import os
import sys
import tempfile
from typing import Optional
import runfs
from amd import AmdGPUOptions
from common import (
GPUTLBOptions,
Options,
)
from ruby import Ruby
import m5
from gem5.resources.resource import AbstractResource
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
cat /proc/cpuinfo
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
def addDemoOptions(parser):
    """Register the GPU demo options (-a/--app, -o/--opts) on ``parser``.

    :param parser: the argparse.ArgumentParser to extend.
    """
    demo_flags = [
        ("-a", "--app", None, "GPU application to run"),
        ("-o", "--opts", "", "GPU application arguments"),
    ]
    for short_flag, long_flag, default_value, help_text in demo_flags:
        parser.add_argument(
            short_flag, long_flag, default=default_value, help=help_text
        )
def runMI300GPUFS(
    cpu_type,
    disk: Optional[AbstractResource] = None,
    kernel: Optional[AbstractResource] = None,
    app: Optional[AbstractResource] = None,
):
    """Configure and launch a full-system gem5 simulation with an MI300X dGPU.

    The GPU application is base64-encoded into a generated runscript that
    recreates and runs the binary inside the simulated system. Parsed
    options are then overridden with the values required for the MI300X
    (gfx942) configuration before handing off to ``runfs.runGpuFSSystem``.

    :param cpu_type: the CPU model to simulate (e.g. "X86KvmCPU").
    :param disk: optional disk-image resource, overrides --disk-image.
    :param kernel: optional kernel resource, overrides --kernel.
    :param app: optional GPU-application resource, overrides --app.
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Resources passed in programmatically take precedence over the
    # corresponding command-line options.
    if disk is not None:
        args.disk_image = disk.get_local_path()
    if kernel is not None:
        args.kernel = kernel.get_local_path()
    if app is not None:
        args.app = app.get_local_path()

    # Create temp script to run application. Exit cleanly (instead of
    # raising) if no application was supplied via --app or `app`.
    if args.app is None or not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # Embed the application binary in the runscript as base64 so it can be
    # materialized inside the simulated system without a shared filesystem.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)
    args.script = tempRunscript

    # Defaults for CPU
    args.cpu_type = cpu_type
    args.mem_size = "8GB"

    # Defaults for MI300X
    args.gpu_device = "MI300X"
    args.dgpu_mem_size = "16GB"  # GPU memory size, must be 16GB currently.

    # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html
    # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded
    # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache.
    args.num_compute_units = 40
    args.gpu_topology = "Crossbar"

    # Run gem5
    runfs.runGpuFSSystem(args)


if __name__ == "__m5_main__":
    runMI300GPUFS("X86KvmCPU")

View File

@@ -134,23 +134,41 @@ def addRunFSOptions(parser):
parser.add_argument(
"--gpu-device",
default="Vega10",
choices=["Vega10", "MI100", "MI200"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
"MI200 (gfx90a)",
choices=["Vega10", "MI100", "MI200", "MI300X"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
"(gfx90a), or MI300X (gfx942).",
)
parser.add_argument(
"--debug-at-gpu-kernel",
"--debug-at-gpu-task",
type=int,
default=-1,
help="Turn on debug flags starting with this kernel",
help="Turn on debug flags starting with this task (counting both blit"
" and non-blit kernels)",
)
parser.add_argument(
"--exit-at-gpu-kernel",
"--exit-at-gpu-task",
type=int,
default=-1,
help="Exit simulation after running this many kernels",
help="Exit simulation after running this many tasks (counting both "
"blit and non-blit kernels)",
)
parser.add_argument(
"--exit-after-gpu-kernel",
type=int,
default=-1,
help="Exit simulation after completing this (non-blit) kernel",
)
parser.add_argument(
"--skip-until-gpu-kernel",
type=int,
default=0,
help="Skip (non-blit) kernels until reaching this kernel. Note that "
"this can impact correctness (the skipped kernels are completely "
"skipped, not fast forwarded)",
)
parser.add_argument(
@@ -177,6 +195,28 @@ def addRunFSOptions(parser):
help="Disable KVM perf counters (use this with LSF / ETX)",
)
parser.add_argument(
"--tcp-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcp",
)
parser.add_argument(
"--tcc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for tcc",
)
# sqc rp both changes sqc rp and scalar cache rp
parser.add_argument(
"--sqc-rp",
type=str,
default="TreePLRURP",
help="cache replacement policy" "policy for sqc",
)
def runGpuFSSystem(args):
"""
@@ -230,8 +270,9 @@ def runGpuFSSystem(args):
print("Running the simulation")
sim_ticks = args.abs_max_tick
kernels_launched = 0
if args.debug_at_gpu_kernel != -1:
kernels_completed = 0
tasks_completed = 0
if args.debug_at_gpu_task != -1:
m5.trace.disable()
exit_event = m5.simulate(sim_ticks)
@@ -249,16 +290,27 @@ def runGpuFSSystem(args):
m5.checkpoint(args.checkpoint_dir)
break
elif "GPU Kernel Completed" in exit_event.getCause():
kernels_launched += 1
if kernels_completed == args.exit_after_gpu_kernel:
print(f"Exiting after GPU kernel {kernels_completed}")
break
kernels_completed += 1
tasks_completed += 1
elif "GPU Blit Kernel Completed" in exit_event.getCause():
tasks_completed += 1
elif "Skipping GPU Kernel" in exit_event.getCause():
print(f"Skipping GPU kernel {kernels_completed}")
kernels_completed += 1
tasks_completed += 1
else:
print(
f"Unknown exit event: {exit_event.getCause()}. Continuing..."
)
if kernels_launched == args.debug_at_gpu_kernel:
if tasks_completed == args.debug_at_gpu_task:
print(f"Enabling debug flags @ GPU task {tasks_completed}")
m5.trace.enable()
if kernels_launched == args.exit_at_gpu_kernel:
print(f"Exiting @ GPU kernel {kernels_launched}")
if tasks_completed == args.exit_at_gpu_task:
print(f"Exiting @ GPU task {tasks_completed}")
break
exit_event = m5.simulate(sim_ticks - m5.curTick())

View File

@@ -33,7 +33,10 @@ from m5.objects import *
def createGPU(system, args):
shader = Shader(
n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
timing=True,
clk_domain=system.clk_domain,
)
# VIPER GPU protocol implements release consistency at GPU side. So,
@@ -84,6 +87,7 @@ def createGPU(system, args):
vrfs = []
vrf_pool_mgrs = []
srfs = []
rfcs = []
srf_pool_mgrs = []
for j in range(args.simds_per_cu):
for k in range(shader.n_wf):
@@ -133,10 +137,16 @@ def createGPU(system, args):
num_regs=args.sreg_file_size,
)
)
rfcs.append(
RegisterFileCache(
simd_id=j, cache_size=args.register_file_cache_size
)
)
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
compute_units[-1].scalar_register_file = srfs
compute_units[-1].register_file_cache = rfcs
compute_units[-1].register_manager = RegisterManager(
policy=args.registerManagerPolicy,
vrf_pool_managers=vrf_pool_mgrs,
@@ -181,10 +191,14 @@ def connectGPU(system, args):
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "MI300X":
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "Vega10":
system.pc.south_bridge.gpu.DeviceID = 0x6863
else:
panic(f"Unknown GPU device: {args.gpu_device}")
m5.util.panic(f"Unknown GPU device: {args.gpu_device}")
# Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is
# a PCI capabilities list to travse.

View File

@@ -108,18 +108,26 @@ def makeGpuFSSystem(args):
system.cpu.append(shader)
# This arbitrary address is something in the X86 I/O hole
hsapp_gpu_map_paddr = 0xE00000000
hsapp_gpu_map_paddr = 0xE0000000
hsapp_pt_walker = VegaPagetableWalker()
gpu_hsapp = HSAPacketProcessor(
pioAddr=hsapp_gpu_map_paddr,
numHWQueues=args.num_hw_queues,
walker=hsapp_pt_walker,
)
dispatcher_exit_events = True if args.exit_at_gpu_kernel > -1 else False
dispatcher_exit_events = False
if args.exit_at_gpu_task > -1:
dispatcher_exit_events = True
if args.exit_after_gpu_kernel > -1:
dispatcher_exit_events = True
dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events)
cp_pt_walker = VegaPagetableWalker()
target_kernel = args.skip_until_gpu_kernel
gpu_cmd_proc = GPUCommandProcessor(
hsapp=gpu_hsapp, dispatcher=dispatcher, walker=cp_pt_walker
hsapp=gpu_hsapp,
dispatcher=dispatcher,
walker=cp_pt_walker,
target_non_blit_kernel_id=target_kernel,
)
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc
@@ -153,7 +161,7 @@ def makeGpuFSSystem(args):
0x7D000,
]
sdma_sizes = [0x1000] * 8
elif args.gpu_device == "MI200":
elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
num_sdmas = 5
sdma_bases = [
0x4980,
@@ -180,9 +188,15 @@ def makeGpuFSSystem(args):
system.pc.south_bridge.gpu.sdmas = sdma_engines
# Setup PM4 packet processor
pm4_pkt_proc = PM4PacketProcessor()
system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc
# Setup PM4 packet processors
pm4_procs = []
pm4_procs.append(
PM4PacketProcessor(
ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
)
)
system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs
# GPU data path
gpu_mem_mgr = AMDGPUMemoryManager()
@@ -199,7 +213,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
system._dma_ports.append(sdma)
system._dma_ports.append(device_ih)
system._dma_ports.append(pm4_pkt_proc)
for pm4_proc in pm4_procs:
system._dma_ports.append(pm4_proc)
system._dma_ports.append(system_hub)
system._dma_ports.append(gpu_mem_mgr)
system._dma_ports.append(hsapp_pt_walker)
@@ -213,7 +228,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
sdma.pio = system.iobus.mem_side_ports
device_ih.pio = system.iobus.mem_side_ports
pm4_pkt_proc.pio = system.iobus.mem_side_ports
for pm4_proc in pm4_procs:
pm4_proc.pio = system.iobus.mem_side_ports
system_hub.pio = system.iobus.mem_side_ports
# Full system needs special TLBs for SQC, Scalar, and vector data ports
@@ -247,7 +263,7 @@ def makeGpuFSSystem(args):
0x00000340,
0x00000000,
0x00000340,
0x0000000F,
0x00000000,
0x00000340,
0x00000000,
0x00000000,
@@ -265,7 +281,7 @@ def makeGpuFSSystem(args):
# See: https://sandpile.org/x86/cpuid.htm#level_0000_0001h
# Enables AVX, OSXSAVE, XSAVE, POPCNT, SSE4.2, SSE4.1, CMPXCHG16B,
# and FMA.
avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C983209]
avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C803209]
for i, cpu in enumerate(system.cpu):
# Break once we reach the shader "CPU"

View File

@@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0
modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp

View File

@@ -243,7 +243,7 @@ def createVegaTopology(options):
file_append((node_dir, "properties"), node_prop)
# Fiji HBM reporting
# Vega HBM reporting
# TODO: Extract size, clk, and width from sim parameters
mem_dir = joinpath(node_dir, "mem_banks/0")
remake_dir(mem_dir)
@@ -260,196 +260,7 @@ def createVegaTopology(options):
file_append((mem_dir, "properties"), mem_prop)
# This fakes out a dGPU setup so the runtime operates correctly. The spoofed
# system has a single dGPU and a single socket CPU. Note that more complex
# topologies (multi-GPU, multi-socket CPUs) need to have a different setup
# here or the runtime won't be able to issue Memcpies from one node to another.
#
# TODO: There is way too much hardcoded here. It doesn't effect anything in
# our current ROCm stack (1.6), but it is highly possible that it will in the
# future. We might need to scrub through this and extract the appropriate
# fields from the simulator in the future.
def createFijiTopology(options):
topology_dir = joinpath(
m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology"
)
remake_dir(topology_dir)
amdgpu_dir = joinpath(m5.options.outdir, "fs/sys/module/amdgpu/parameters")
remake_dir(amdgpu_dir)
# Fiji reported VM size in GB. Used to reserve an allocation from CPU
# to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree)
file_append((amdgpu_dir, "vm_size"), 256)
# Ripped from real Fiji platform to appease KMT version checks
file_append((topology_dir, "generation_id"), 2)
# Set up system properties. Regiter as ast-rocm server
sys_prop = (
"platform_oem 35498446626881\n"
+ "platform_id 71791775140929\n"
+ "platform_rev 2\n"
)
file_append((topology_dir, "system_properties"), sys_prop)
# Populate the topology tree
# Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU
node_dir = joinpath(topology_dir, "nodes/0")
remake_dir(node_dir)
# Register as a CPU
file_append((node_dir, "gpu_id"), 0)
file_append((node_dir, "name"), "")
# CPU links. Only thing that matters is we tell the runtime that GPU is
# connected through PCIe to CPU socket 0.
io_links = 1
io_dir = joinpath(node_dir, "io_links/0")
remake_dir(io_dir)
io_prop = (
"type 2\n"
+ "version_major 0\n"
+ "version_minor 0\n"
+ "node_from 0\n"
+ "node_to 1\n"
+ "weight 20\n"
+ "min_latency 0\n"
+ "max_latency 0\n"
+ "min_bandwidth 0\n"
+ "max_bandwidth 0\n"
+ "recommended_transfer_size 0\n"
+ "flags 13\n"
)
file_append((io_dir, "properties"), io_prop)
# Populate CPU node properties
node_prop = (
f"cpu_cores_count {options.num_cpus}\n"
+ "simd_count 0\n"
+ "mem_banks_count 1\n"
+ "caches_count 0\n"
+ f"io_links_count {io_links}\n"
+ "cpu_core_id_base 0\n"
+ "simd_id_base 0\n"
+ "max_waves_per_simd 0\n"
+ "lds_size_in_kb 0\n"
+ "gds_size_in_kb 0\n"
+ "wave_front_size 64\n"
+ "array_count 0\n"
+ "simd_arrays_per_engine 0\n"
+ "cu_per_simd_array 0\n"
+ "simd_per_cu 0\n"
+ "max_slots_scratch_cu 0\n"
+ "vendor_id 0\n"
+ "device_id 0\n"
+ "location_id 0\n"
+ "drm_render_minor 0\n"
+ "max_engine_clk_ccompute 3400\n"
)
file_append((node_dir, "properties"), node_prop)
# CPU memory reporting
mem_dir = joinpath(node_dir, "mem_banks/0")
remake_dir(mem_dir)
# Heap type value taken from real system, heap type values:
# https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
mem_prop = (
"heap_type 0\n"
+ "size_in_bytes 33704329216\n"
+ "flags 0\n"
+ "width 72\n"
+ "mem_clk_max 2400\n"
)
file_append((mem_dir, "properties"), mem_prop)
# Build the GPU node
node_dir = joinpath(topology_dir, "nodes/1")
remake_dir(node_dir)
# Register as a Fiji
file_append((node_dir, "gpu_id"), 50156)
file_append((node_dir, "name"), "Fiji\n")
# Should be the same as the render driver filename (dri/renderD<drm_num>)
drm_num = 128
# Real Fiji shows 96, but building that topology is complex and doesn't
# appear to be required for anything.
caches = 0
# GPU links. Only thing that matters is we tell the runtime that GPU is
# connected through PCIe to CPU socket 0.
io_links = 1
io_dir = joinpath(node_dir, "io_links/0")
remake_dir(io_dir)
io_prop = (
"type 2\n"
+ "version_major 0\n"
+ "version_minor 0\n"
+ "node_from 1\n"
+ "node_to 0\n"
+ "weight 20\n"
+ "min_latency 0\n"
+ "max_latency 0\n"
+ "min_bandwidth 0\n"
+ "max_bandwidth 0\n"
+ "recommended_transfer_size 0\n"
+ "flags 1\n"
)
file_append((io_dir, "properties"), io_prop)
# Populate GPU node properties
node_prop = (
"cpu_cores_count 0\n"
+ f"simd_count {options.num_compute_units * options.simds_per_cu}\n"
+ "mem_banks_count 1\n"
+ f"caches_count {caches}\n"
+ f"io_links_count {io_links}\n"
+ "cpu_core_id_base 0\n"
+ "simd_id_base 2147487744\n"
+ f"max_waves_per_simd {options.wfs_per_simd}\n"
+ f"lds_size_in_kb {int(options.lds_size / 1024)}\n"
+ "gds_size_in_kb 0\n"
+ f"wave_front_size {options.wf_size}\n"
+ "array_count 4\n"
+ f"simd_arrays_per_engine {options.sa_per_complex}\n"
+ f"cu_per_simd_array {options.cu_per_sa}\n"
+ f"simd_per_cu {options.simds_per_cu}\n"
+ "max_slots_scratch_cu 32\n"
+ "vendor_id 4098\n"
+ "device_id 29440\n"
+ "location_id 512\n"
+ f"drm_render_minor {drm_num}\n"
+ f"max_engine_clk_fcompute {int(toFrequency(options.gpu_clock) / 1000000.0)}\n"
+ "local_mem_size 4294967296\n"
+ "fw_version 730\n"
+ "capability 4736\n"
+ f"max_engine_clk_ccompute {int(toFrequency(options.CPUClock) / 1000000.0)}\n"
)
file_append((node_dir, "properties"), node_prop)
# Fiji HBM reporting
# TODO: Extract size, clk, and width from sim paramters
mem_dir = joinpath(node_dir, "mem_banks/0")
remake_dir(mem_dir)
# Heap type value taken from real system, heap type values:
# https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317
mem_prop = (
"heap_type 1\n"
+ "size_in_bytes 4294967296\n"
+ "flags 0\n"
+ "width 4096\n"
+ "mem_clk_max 500\n"
)
file_append((mem_dir, "properties"), mem_prop)
def createCarrizoTopology(options):
def createRavenTopology(options):
topology_dir = joinpath(
m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology"
)
@@ -476,7 +287,6 @@ def createCarrizoTopology(options):
file_append((node_dir, "gpu_id"), 2765)
gfx_dict = {
"gfx801": {"name": "Carrizo\n", "id": 39028},
"gfx902": {"name": "Raven\n", "id": 5597},
}

View File

@@ -49,8 +49,8 @@ from gem5.utils.requires import requires
# Run a check to ensure the right version of gem5 is being used.
requires(isa_required=ISA.RISCV)
from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import (
PrivateL1PrivateL2CacheHierarchy,
from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
PrivateL1PrivateL2WalkCacheHierarchy,
)
parser = argparse.ArgumentParser(description="Runs Linux fs test with RISCV.")
@@ -72,7 +72,7 @@ parser.add_argument(
args = parser.parse_args()
cache_hierarchy = PrivateL1PrivateL2CacheHierarchy(
cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
)
@@ -98,8 +98,12 @@ board = LupvBoard(
# Set the Full System workload.
board.set_kernel_disk_workload(
kernel=obtain_resource("riscv-lupio-linux-kernel"),
disk_image=obtain_resource("riscv-lupio-busybox-img"),
kernel=obtain_resource(
"riscv-lupio-linux-kernel", resource_version="1.0.0"
),
disk_image=obtain_resource(
"riscv-lupio-busybox-img", resource_version="1.0.0"
),
)

View File

@@ -145,7 +145,17 @@ Options.addFSOptions(parser)
parser.add_argument(
"--virtio-rng", action="store_true", help="Enable VirtIORng device"
)
parser.add_argument(
"--semihosting",
action="store_true",
help="Enable the RISC-V semihosting interface",
)
parser.add_argument(
"--semihosting-root",
default="/some/invalid/root/directory",
type=str,
help="The root directory for files exposed to semihosting",
)
# ---------------------------- Parse Options --------------------------- #
args = parser.parse_args()
@@ -168,11 +178,17 @@ mdesc = SysConfig(
system.mem_mode = mem_mode
system.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())]
workload_args = dict()
if args.semihosting:
workload_args["semihosting"] = RiscvSemihosting(
files_root_dir=args.semihosting_root,
cmd_line=args.kernel,
)
if args.bare_metal:
system.workload = RiscvBareMetal()
system.workload = RiscvBareMetal(**workload_args)
system.workload.bootloader = args.kernel
else:
system.workload = RiscvLinux()
system.workload = RiscvLinux(**workload_args)
system.workload.object_file = args.kernel
system.iobus = IOXBar()

View File

@@ -59,7 +59,7 @@ nvm_generators = {"NVM": lambda x: x.createNvm}
# Use a single-channel DDR3-1600 x64 (8x8 topology) by default
parser.add_argument(
"--nvm-type",
"--mem-type",
default="NVM_2400_1x64",
choices=ObjectList.mem_list.get_names(),
help="type of memory to use",
@@ -212,7 +212,7 @@ def trace():
nbr_banks,
bank,
addr_map,
args.dram_ranks,
args.nvm_ranks,
)
yield system.tgen.createExit(0)

View File

@@ -143,7 +143,7 @@ MemConfig.config_mem(args, system)
# the following assumes that we are using the native controller
# with NVM and DRAM interfaces, check to be sure
if not isinstance(system.mem_ctrls[0], m5.objects.HeteroMemCtrl):
if not isinstance(system.mem_ctrls[0], m5.objects.MemCtrl):
fatal("This script assumes the controller is a HeteroMemCtrl subclass")
if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface):
fatal("This script assumes the first memory is a DRAMInterface subclass")

View File

@@ -149,7 +149,8 @@ class TCPCache(RubyCache):
self.size = MemorySize(options.tcp_size)
self.assoc = options.tcp_assoc
self.resourceStalls = options.no_tcc_resource_stalls
self.replacement_policy = TreePLRURP()
if hasattr(options, "tcp_rp"):
self.replacement_policy = RP_choose(options.tcp_rp)
class TCPCntrl(TCP_Controller, CntrlBase):
@@ -241,7 +242,8 @@ class SQCCache(RubyCache):
def create(self, options):
self.size = MemorySize(options.sqc_size)
self.assoc = options.sqc_assoc
self.replacement_policy = TreePLRURP()
if hasattr(options, "sqc_rp"):
self.replacement_policy = RP_choose(options.sqc_rp)
class SQCCntrl(SQC_Controller, CntrlBase):
@@ -303,7 +305,8 @@ class TCC(RubyCache):
self.start_index_bit = math.log(options.cacheline_size, 2) + math.log(
options.num_tccs, 2
)
self.replacement_policy = TreePLRURP()
if hasattr(options, "tcc_rp"):
self.replacement_policy = RP_choose(options.tcc_rp)
class TCCCntrl(TCC_Controller, CntrlBase):
@@ -497,13 +500,6 @@ def define_options(parser):
parser.add_argument(
"--noL1", action="store_true", default=False, help="bypassL1"
)
parser.add_argument(
"--scalar-buffer-size",
type=int,
default=128,
help="Size of the mandatory queue in the GPU scalar "
"cache controller",
)
parser.add_argument(
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
)
@@ -841,9 +837,7 @@ def construct_scalars(options, system, ruby_system, network):
scalar_cntrl.responseToSQC = MessageBuffer(ordered=True)
scalar_cntrl.responseToSQC.in_port = network.out_port
scalar_cntrl.mandatoryQueue = MessageBuffer(
buffer_size=options.scalar_buffer_size
)
scalar_cntrl.mandatoryQueue = MessageBuffer()
return (scalar_sequencers, scalar_cntrl_nodes)
@@ -1133,3 +1127,28 @@ def create_system(
ruby_system.network.number_of_virtual_networks = 11
return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
def RP_choose(test_name):
    """Return a new replacement-policy SimObject for the given policy name.

    :param test_name: Name of the replacement-policy class, e.g. "LRURP".
    :returns: A freshly constructed instance of the requested policy.
    :raises ValueError: If ``test_name`` is not a recognized policy name.
        (The previous if/elif ladder fell through and raised an opaque
        ``UnboundLocalError`` for unknown names.)
    """
    # Map policy names to their SimObject classes. Instantiation happens on
    # lookup so every call hands back a distinct object.
    policies = {
        "TreePLRURP": TreePLRURP,
        "LRURP": LRURP,
        "FIFORP": FIFORP,
        "LFURP": LFURP,
        "LIPRP": LIPRP,
        "MRURP": MRURP,
        "NRURP": NRURP,
        "RRIPRP": RRIPRP,
        "SecondChanceRP": SecondChanceRP,
        "SHiPMemRP": SHiPMemRP,
    }
    try:
        return policies[test_name]()
    except KeyError:
        raise ValueError(f"Unknown replacement policy: {test_name}")

View File

@@ -47,6 +47,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
| the types below may, if desired, be defined as aliases for the native types
| (typically 'float' and 'double', and possibly 'long double').
*----------------------------------------------------------------------------*/
typedef struct { uint8_t v; } float8_t;
typedef struct { uint16_t v; } float16_t;
typedef struct { uint32_t v; } float32_t;
typedef struct { uint64_t v; } float64_t;

View File

@@ -25,13 +25,16 @@
import os
from m5.util.terminal import get_termcap
import gem5_scons
import sys
Import('env')
systemc = env.Clone()
build_root = Dir('.').abspath
src_root = Dir('.').srcdir.abspath
gem5_root = Dir('#../..').srcnode().abspath
sys.path.append(os.path.join(gem5_root, 'site_scons'))
import gem5_scons
systemc.Prepend(CPPPATH=Dir('./src').srcnode())
systemc.Prepend(CPATH=Dir('./src'))

View File

@@ -245,7 +245,6 @@ def define_constants(constants):
constants.isa_tag_type = "isa"
constants.x86_tag = "X86"
constants.gcn3_x86_tag = "GCN3_X86"
constants.vega_x86_tag = "VEGA_X86"
constants.sparc_tag = "SPARC"
constants.riscv_tag = "RISCV"
@@ -274,7 +273,6 @@ def define_constants(constants):
constants.supported_tags = {
constants.isa_tag_type: (
constants.x86_tag,
constants.gcn3_x86_tag,
constants.vega_x86_tag,
constants.sparc_tag,
constants.riscv_tag,
@@ -305,7 +303,6 @@ def define_constants(constants):
constants.target_host = {
constants.arm_tag: (constants.host_arm_tag,),
constants.x86_tag: (constants.host_x86_64_tag,),
constants.gcn3_x86_tag: (constants.host_x86_64_tag,),
constants.vega_x86_tag: (constants.host_x86_64_tag,),
constants.sparc_tag: (constants.host_x86_64_tag,),
constants.riscv_tag: (constants.host_x86_64_tag,),

View File

@@ -1 +1 @@
tqdm==4.64.1
tqdm==4.66.4

View File

@@ -1,2 +1,2 @@
mypy==1.5.1
pre-commit==2.20.0
mypy==1.10.0
pre-commit==3.7.1

View File

@@ -59,13 +59,15 @@ def CheckCxxFlag(context, flag, autoadd=True):
return ret
def CheckLinkFlag(context, flag, autoadd=True, set_for_shared=True):
def CheckLinkFlag(context, flag, autoadd=True, set_for_shared=True, code=None):
context.Message(f"Checking for linker {flag} support... ")
last_linkflags = context.env["LINKFLAGS"]
context.env.Append(LINKFLAGS=[flag])
pre_werror = context.env["LINKFLAGS"]
context.env.Append(LINKFLAGS=["-Werror"])
ret = context.TryLink("int main(int, char *[]) { return 0; }", ".cc")
if not code:
code = "int main(int, char *[]) { return 0; }"
ret = context.TryLink(code, ".cc")
context.env["LINKFLAGS"] = pre_werror
if not (ret and autoadd):
context.env["LINKFLAGS"] = last_linkflags

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = gem5
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = v23.1.0.0
PROJECT_NUMBER = v24.0.0.0
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

View File

@@ -51,3 +51,4 @@ rsource "arch/Kconfig"
rsource "cpu/Kconfig"
rsource "systemc/Kconfig"
rsource "gpu-compute/Kconfig"
rsource "test_objects/Kconfig"

View File

@@ -68,7 +68,7 @@ if env['CONF']['BUILD_ISA']:
error("At least one ISA need to be set")
amdgpu_isa = ['gcn3', 'vega']
amdgpu_isa = ['vega']
if env['CONF']['BUILD_GPU']:
env.SwitchingHeaders(

View File

@@ -29,5 +29,4 @@ prompt "GPU ISA"
endchoice
endif
rsource "gcn3/Kconfig"
rsource "vega/Kconfig"

View File

@@ -34,7 +34,7 @@ Import('*')
if not env['CONF']['BUILD_GPU']:
Return()
if env['CONF']['TARGET_GPU_ISA'] in ('gcn3', 'vega'):
if env['CONF']['TARGET_GPU_ISA'] in ('vega'):
SimObject('X86GPUTLB.py', sim_objects=['X86GPUTLB', 'TLBCoalescer'])
Source('tlb.cc')

View File

@@ -0,0 +1,21 @@
# Microscaling Formats
This directory defines [microscaling formats](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) which are reduced precision floating point formats.
The class makes some assumptions to simplify things and is not completely generic.
For example:
- Types must be smaller than 32-bits.
- Type conversions currently assume that either:
- The destination format exponent and mantissa bits are both greater or equal to the source format.
- OR the destination format exponent and mantissa are both less than or equal to the source format.
    - In other words, one type cannot have a larger exponent and a smaller mantissa, and vice versa.
- Basic MX operations are implementation defined, meaning MX types can be converted to FP32 for arithmetic
- This means that arithmetic operators need not be defined for MX types.
- A value whose exponent and mantissa are both zero is zero. There is no special case for the sign (i.e., -0 is not special).
- The spec does not differentiate between signaling and quiet NaN, therefore quiet NaN is used.
- New types must template specialize the following standard library methods:
- isinf(T)
- isnan(T)
- isnormal(T)
- New types must template specialize the following std::numeric_limits<T> members / methods:
- has_infinity / infinity()
- has_quiet_NaN / quiet_NaN()

View File

@@ -1,6 +1,4 @@
# -*- mode:python -*-
# Copyright (c) 2015, 2017 Advanced Micro Devices, Inc.
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,4 +29,4 @@
Import('*')
main.Append(ALL_GPU_ISAS=['gcn3'])
GTest('mxfp.test', 'mxfp.test.cc')

View File

@@ -0,0 +1,113 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__
namespace gem5
{
namespace AMDGPU
{
// Same as IEEE 754 binary 32 - Microscaling types are converted to/from
// this format by default. For now as there do not seem to be any MI300
// instructions operating directly on the types (i.e., they all cast to FP32
// first and then perform arithmetic operations).
typedef union binary32_u
{
    // Field widths, bias, and canonical bit patterns for IEEE 754 binary32.
    enum bitSizes
    {
        ebits = 8,          // exponent width in bits
        mbits = 23,         // mantissa (fraction) width in bits
        sbits = 1,          // sign width in bits
        bias = 127,         // exponent bias
        inf = 0x7f800000,   // +infinity: exponent all ones, mantissa zero
        // A NaN pattern: exponent all ones, mantissa nonzero.
        // NOTE(review): the mantissa MSB is clear, so on IEEE hardware this
        // encodes a *signaling* NaN, while numeric_limits below hands out
        // float's quiet NaN — confirm the enum value is intentional.
        nan = 0x7f800100,
        max = 0x7f7fffff    // largest finite value (FLT_MAX)
    };
    // All views alias the same 32 bits: raw word, native float, and the
    // sign/exponent/mantissa fields (declared LSB-first, which matches the
    // ABIs gem5 targets).
    uint32_t storage;
    float fp32;
    struct
    {
        unsigned mant : 23;
        unsigned exp : 8;
        unsigned sign : 1;
    };
    // To help with stdlib functions with T = float.
    operator float() const
    {
        return fp32;
    }
} binary32;
static_assert(sizeof(binary32) == 4);
} // namespace AMDGPU
} // namespace gem5
// Specialize numeric_limits so generic MX conversion code can query
// quiet_NaN/infinity/max uniformly for this type; values are forwarded from
// numeric_limits<float> rather than the enum patterns above.
namespace std
{
template<>
class numeric_limits<gem5::AMDGPU::binary32>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Return float's quiet NaN wrapped in a binary32.
    static gem5::AMDGPU::binary32 quiet_NaN()
    {
        gem5::AMDGPU::binary32 tmp;
        tmp.fp32 = std::numeric_limits<float>::quiet_NaN();
        return tmp;
    }
    static constexpr bool has_infinity = true;
    // Return float's +infinity wrapped in a binary32.
    static gem5::AMDGPU::binary32 infinity()
    {
        gem5::AMDGPU::binary32 tmp;
        tmp.fp32 = std::numeric_limits<float>::infinity();
        return tmp;
    }
    // Return FLT_MAX wrapped in a binary32.
    static gem5::AMDGPU::binary32 max()
    {
        gem5::AMDGPU::binary32 tmp;
        tmp.fp32 = std::numeric_limits<float>::max();
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the FP16 (IEEE half precision, E5M10)
// microscaling format. The 16-bit value sits in the upper half of the
// 32-bit storage word; the low 16 'zero' bits are padding so the canonical
// patterns below can be written as 32-bit constants. The bit-fields are
// declared LSB-first, which matches the ABIs gem5 targets.
typedef union
{
    enum bitSizes
    {
        ebits = 5,          // exponent width in bits
        mbits = 10,         // mantissa width in bits
        sbits = 1,          // sign width in bits
        zbits = 16,         // low-order padding bits in 'storage'
        bias = 15,          // exponent bias
        inf = 0x7c000000,   // +infinity: exponent all ones, mantissa zero
        nan = 0x7c100000,   // a NaN: exponent all ones, mantissa nonzero
        max = 0x7bff0000    // largest finite value (65504.0)
    };
    uint32_t storage;
    struct
    {
        unsigned zero : zbits;
        unsigned mant : mbits;
        unsigned exp : ebits;
        unsigned sign : sbits;
    };
} fp16_e5m10_info;
static_assert(sizeof(fp16_e5m10_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// Infinity: exponent all ones (0x1F) with a zero mantissa.
constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a)
{
    return a.exp == 0x1F && a.mant == 0;
}
// NaN: exponent all ones (0x1F) with a nonzero mantissa.
constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a)
{
    return a.exp == 0x1F && a.mant != 0;
}
// Rejects only subnormals (exp == 0, mant != 0). NOTE: unlike
// std::isnormal(float), this returns true for zero, infinity, and NaN.
constexpr bool isnormal(gem5::AMDGPU::fp16_e5m10_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// numeric_limits specialization so generic MX conversion code can query
// the canonical quiet_NaN/infinity/max bit patterns for this type.
template<>
class numeric_limits<gem5::AMDGPU::fp16_e5m10_info>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Canonical quiet NaN bit pattern (the spec does not distinguish
    // quiet from signaling NaN).
    static gem5::AMDGPU::fp16_e5m10_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp16_e5m10_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e5m10_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = true;
    // Canonical +infinity bit pattern.
    static gem5::AMDGPU::fp16_e5m10_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp16_e5m10_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e5m10_info::inf;
        return tmp;
    }
    // Largest finite value (65504.0).
    static gem5::AMDGPU::fp16_e5m10_info max()
    {
        gem5::AMDGPU::fp16_e5m10_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e5m10_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the BF16 (E8M7) microscaling format: the same
// exponent range as binary32 with a truncated 7-bit mantissa. The 16-bit
// value sits in the upper half of the 32-bit storage word; the low 16
// 'zero' bits are padding so the canonical patterns below can be written
// as 32-bit constants. The bit-fields are declared LSB-first, which
// matches the ABIs gem5 targets.
typedef union
{
    enum bitSizes
    {
        ebits = 8,          // exponent width in bits
        mbits = 7,          // mantissa width in bits
        sbits = 1,          // sign width in bits
        zbits = 16,         // low-order padding bits in 'storage'
        bias = 127,         // exponent bias (same as binary32)
        inf = 0x7f800000,   // +infinity: exponent all ones, mantissa zero
        nan = 0x7f810000,   // a NaN: exponent all ones, mantissa nonzero
        max = 0x7f7f0000    // largest finite value (~3.39e38)
    };
    uint32_t storage;
    struct
    {
        unsigned zero : zbits;
        unsigned mant : mbits;
        unsigned exp : ebits;
        unsigned sign : sbits;
    };
} fp16_e8m7_info;
static_assert(sizeof(fp16_e8m7_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// Infinity: exponent all ones (0xFF) with a zero mantissa.
constexpr bool isinf(gem5::AMDGPU::fp16_e8m7_info a)
{
    return a.exp == 0xFF && a.mant == 0;
}
// NaN: exponent all ones (0xFF) with a nonzero mantissa.
constexpr bool isnan(gem5::AMDGPU::fp16_e8m7_info a)
{
    return a.exp == 0xFF && a.mant != 0;
}
// Rejects only subnormals (exp == 0, mant != 0). NOTE: unlike
// std::isnormal(float), this returns true for zero, infinity, and NaN.
constexpr bool isnormal(gem5::AMDGPU::fp16_e8m7_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// numeric_limits specialization so generic MX conversion code can query
// the canonical quiet_NaN/infinity/max bit patterns for this type.
template<>
class numeric_limits<gem5::AMDGPU::fp16_e8m7_info>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Canonical quiet NaN bit pattern (the spec does not distinguish
    // quiet from signaling NaN).
    static gem5::AMDGPU::fp16_e8m7_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp16_e8m7_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e8m7_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = true;
    // Canonical +infinity bit pattern.
    static gem5::AMDGPU::fp16_e8m7_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp16_e8m7_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e8m7_info::inf;
        return tmp;
    }
    // Largest finite BF16 value.
    static gem5::AMDGPU::fp16_e8m7_info max()
    {
        gem5::AMDGPU::fp16_e8m7_info tmp;
        tmp.storage = gem5::AMDGPU::fp16_e8m7_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__

View File

@@ -0,0 +1,124 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the FP8 E4M3 microscaling format (OCP MX spec).
// The 8-bit value sits in the top byte of the 32-bit storage word; the low
// 24 'zero' bits are padding so the canonical patterns below can be written
// as 32-bit constants. E4M3 has no infinity; the only NaN encoding is
// exponent all ones with mantissa all ones. The bit-fields are declared
// LSB-first, which matches the ABIs gem5 targets.
typedef union
{
    enum bitSizes
    {
        ebits = 4,              // exponent width in bits
        mbits = 3,              // mantissa width in bits
        sbits = 1,              // sign width in bits
        zbits = 24,             // low-order padding bits in 'storage'
        bias = 7,               // exponent bias
        inf = (0x7f << zbits),  // E4M3 has no infinity; kept only for
                                // interface symmetry with the other dtypes
        nan = (0xff << zbits),  // NaN: exp == 0xF, mant == 0x7 (sign set)
        // Largest finite value (448.0): exp == 0xF, mant == 0x6.
        // BUGFIX: this was (0x7f << zbits), which is the NaN encoding, so
        // numeric_limits::max() satisfied isnan(). Per the OCP MX spec the
        // E4M3 maximum is S.1111.110 = 0x7e.
        max = (0x7e << zbits)
    };
    uint32_t storage;
    struct
    {
        unsigned zero : zbits;
        unsigned mant : mbits;
        unsigned exp : ebits;
        unsigned sign : sbits;
    };
} fp8_e4m3_info;
static_assert(sizeof(fp8_e4m3_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// Inf not defined for E4M3 -- always false.
constexpr bool isinf(gem5::AMDGPU::fp8_e4m3_info a) { return false; }
// NaN: exponent and mantissa all ones (the sole NaN encoding in E4M3).
constexpr bool isnan(gem5::AMDGPU::fp8_e4m3_info a)
{
    return a.exp == 0xF && a.mant == 0x7;
}
// Rejects only subnormals (exp == 0, mant != 0). NOTE: unlike
// std::isnormal(float), this returns true for zero and NaN.
constexpr bool isnormal(gem5::AMDGPU::fp8_e4m3_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// numeric_limits specialization so generic MX conversion code can query
// the canonical quiet_NaN/max bit patterns for this type.
template<>
class numeric_limits<gem5::AMDGPU::fp8_e4m3_info>
{
  public:
    static constexpr bool has_quiet_NaN = true;
    // Canonical quiet NaN bit pattern (the spec does not distinguish
    // quiet from signaling NaN).
    static gem5::AMDGPU::fp8_e4m3_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp8_e4m3_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e4m3_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = false;
    // Must not be called: E4M3 has no infinity (asserts in debug builds).
    static gem5::AMDGPU::fp8_e4m3_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp8_e4m3_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e4m3_info::inf;
        return tmp;
    }
    // Largest finite value (448.0).
    static gem5::AMDGPU::fp8_e4m3_info max()
    {
        gem5::AMDGPU::fp8_e4m3_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e4m3_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__
#include <cassert>
namespace gem5
{
namespace AMDGPU
{
// Bit-level description of the 8-bit E5M2 floating-point format
// (1 sign, 5 exponent, 2 mantissa bits). The value is stored
// left-aligned in a 32-bit word: bits 31:24 hold the FP8 value and
// bits 23:0 are zero padding.
typedef union
{
    enum bitSizes
    {
        ebits = 5,   // exponent field width
        mbits = 2,   // mantissa field width
        sbits = 1,   // sign field width
        zbits = 24,  // low-order zero padding width
        bias = 15,   // exponent bias
        // Key encodings, expressed as full left-aligned storage
        // patterns. Unsigned literals are required: 0xff << 24 does not
        // fit in a signed int.
        inf = (0x7cU << zbits),  // s=0 exp=11111 mant=00
        nan = (0xffU << zbits),  // s=1 exp=11111 mant=11 (a NaN)
        // Largest finite value: s=0 exp=11110 mant=11 (= 57344.0).
        // Exponent all-ones patterns such as 0x7f are Inf/NaN encodings
        // and must not be used as max.
        max = (0x7bU << zbits)
    };
    uint32_t storage;  // raw 32-bit view
    struct
    {
        unsigned zero : zbits;  // always-zero padding
        unsigned mant : mbits;  // mantissa
        unsigned exp : ebits;   // biased exponent
        unsigned sign : sbits;  // sign
    };
} fp8_e5m2_info;
static_assert(sizeof(fp8_e5m2_info) == 4);
} // namespace AMDGPU
} // namespace gem5
// std library cmath definitions
namespace std
{
// std::isinf overload for fp8_e5m2_info: exponent all ones with a zero
// mantissa encodes +/- infinity.
constexpr bool isinf(gem5::AMDGPU::fp8_e5m2_info a)
{
    return a.exp == 0x1F && a.mant == 0x0;
}
// std::isnan overload: exponent all ones with a non-zero mantissa.
constexpr bool isnan(gem5::AMDGPU::fp8_e5m2_info a)
{
    return a.exp == 0x1F && a.mant != 0x0;
}
// "Normal" here only excludes subnormals (exp == 0 with non-zero
// mantissa). NOTE(review): unlike std::isnormal this returns true for
// zero, Inf and NaN -- confirm callers rely only on the subnormal test.
constexpr bool isnormal(gem5::AMDGPU::fp8_e5m2_info a)
{
    return !(a.exp == 0 && a.mant != 0);
}
// std::numeric_limits specialization for fp8_e5m2_info. Only the
// members queried by the MXFP conversion code are provided; unlike the
// primary template, the value accessors are not constexpr -- they
// assemble the bit pattern in a union at run time.
template<>
class numeric_limits<gem5::AMDGPU::fp8_e5m2_info>
{
  public:
    // E5M2 reserves exponent all-ones encodings for Inf/NaN.
    static constexpr bool has_quiet_NaN = true;
    static gem5::AMDGPU::fp8_e5m2_info quiet_NaN()
    {
        assert(has_quiet_NaN);
        gem5::AMDGPU::fp8_e5m2_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e5m2_info::nan;
        return tmp;
    }
    static constexpr bool has_infinity = true;
    static gem5::AMDGPU::fp8_e5m2_info infinity()
    {
        assert(has_infinity);
        gem5::AMDGPU::fp8_e5m2_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e5m2_info::inf;
        return tmp;
    }
    // Returns the fp8_e5m2_info::max bit pattern; used by convertMXFP
    // as the saturation value when the destination lacks Inf/NaN.
    static gem5::AMDGPU::fp8_e5m2_info max()
    {
        gem5::AMDGPU::fp8_e5m2_info tmp;
        tmp.storage = gem5::AMDGPU::fp8_e5m2_info::max;
        return tmp;
    }
};
} // namespace std
#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__

View File

@@ -0,0 +1,329 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__
#include <cmath>
#include <cstdint>
#include <iostream>
#include "arch/amdgpu/common/dtype/mxfp_convert.hh"
namespace gem5
{
namespace AMDGPU
{
// Base class for all microscaling types. The sizes of everything are
// determined by the enum fields in the FMT struct. All of these share the
// same operator overloads which convert to float before arithmetic and
// convert back if assigned to a microscaling type.
template<typename FMT>
class mxfp
{
  public:
    mxfp() = default;

    // Construct from a host float using round-to-nearest-even.
    mxfp(float f) : mode(roundTiesToEven)
    {
        data = float_to_mxfp(f);
    }

    // Set raw bits, used by gem5 to set a raw value read from VGPRs.
    mxfp(const uint32_t& raw)
    {
        // The info unions end up being "left" aligned. For example, in FP4
        // only the bits 31:28 are used. Shift the input by the storage size
        // of 32 by the type size (sign + exponent + mantissa bits).
        data = raw;
        data <<= (32 - int(FMT::sbits) - int(FMT::ebits) - int(FMT::mbits));
    }

    mxfp(const mxfp& f)
    {
        FMT conv_out;
        conv_out = convertMXFP<FMT, decltype(f.getFmt())>(f.getFmt());
        data = conv_out.storage;
    }

    mxfp&
    operator=(const float& f)
    {
        data = float_to_mxfp(f);
        return *this;
    }

    mxfp&
    operator=(const mxfp& f)
    {
        FMT conv_out;
        conv_out = convertMXFP<FMT, decltype(f.getFmt())>(f.getFmt());
        data = conv_out.storage;
        return *this;
    }

    // Widen to an IEEE-754 binary32 using this value's rounding mode.
    operator float() const
    {
        binary32 out;
        FMT in;
        in.storage = data;
        out = convertMXFP<binary32, FMT>(in, mode);
        return out.fp32;
    }

    // Number of meaningful bits in this format (sign + exp + mantissa).
    constexpr static int
    size()
    {
        return int(FMT::mbits) + int(FMT::ebits) + int(FMT::sbits);
    }

    // Intentionally use storage > size() so that a storage type is not needed
    // as a template parameter.
    uint32_t data = 0;

    FMT
    getFmt() const
    {
        FMT out;
        out.storage = data;
        return out;
    }

    void
    setFmt(FMT in)
    {
        data = in.storage;
    }

    // Apply a block scale factor f: add f's unbiased exponent to this
    // value's biased exponent, clamping to the format's exponent range.
    void
    scale(const float& f)
    {
        binary32 bfp;
        bfp.fp32 = f;
        int scale_val = bfp.exp - bfp.bias;

        // Scale value of 0xFF is NaN. Scaling by NaN returns NaN.
        // In this implementation, types without NaN define it as zero.
        // The raw exponent field (bfp.exp) must be tested here: the
        // previous code compared scale_val -- the *unbiased* exponent,
        // which is at most 128 -- against 0xFF, so the NaN case could
        // never trigger and NaN scales were silently clamped below.
        if (bfp.exp == 0xFF) {
            data = FMT::nan;
            return;
        }

        FMT in = getFmt();
        int exp = in.exp;

        if (exp + scale_val > max_exp<FMT>()) {
            in.exp = max_exp<FMT>();
        } else if (exp + scale_val < min_exp<FMT>()) {
            in.exp = min_exp<FMT>();
        } else {
            in.exp = exp + scale_val;
        }

        data = in.storage;
    }

  private:
    // Rounding mode used when narrowing floats into this format.
    mxfpRoundingMode mode = roundTiesToEven;

    // Narrow a float, routing Inf/NaN to the format's encodings first.
    uint32_t
    float_to_mxfp(float f)
    {
        if (std::isinf(f)) {
            assert(std::numeric_limits<FMT>::has_infinity);
            return FMT::inf;
        }
        if (std::isnan(f)) {
            assert(std::numeric_limits<FMT>::has_quiet_NaN);
            return FMT::nan;
        }

        return float_to_mxfp_nocheck(f);
    }

    // Narrow a finite float via convertMXFP.
    uint32_t
    float_to_mxfp_nocheck(float f)
    {
        binary32 in;
        in.fp32 = f;

        FMT out;
        out.storage = 0;
        out = convertMXFP<FMT, binary32>(in, mode);

        return out.storage;
    }
};
// Unary operators
// Unary plus: returns the value unchanged.
template<typename T>
inline T operator+(T a)
{
    return a;
}

// Unary minus: flip the sign bit, which is bit 31 in the left-aligned
// MXFP storage layout.
template<typename T>
inline T operator-(T a)
{
    a.data ^= 0x80000000;
    return a;
}

// Pre-increment. The operand must be taken by reference: the previous
// pass-by-value version incremented only a local copy, so ++x left x
// unchanged.
template<typename T>
inline T& operator++(T& a)
{
    a = a + T(1.0f);
    return a;
}

// Pre-decrement (by reference, see operator++ above).
template<typename T>
inline T& operator--(T& a)
{
    a = a - T(1.0f);
    return a;
}

// Post-increment: increments the operand, returns the prior value.
template<typename T>
inline T operator++(T& a, int)
{
    T original = a;
    ++a;
    return original;
}

// Post-decrement: decrements the operand, returns the prior value.
template<typename T>
inline T operator--(T& a, int)
{
    T original = a;
    --a;
    return original;
}
// Math operators
// All arithmetic is performed by widening both operands to float,
// computing in float, and narrowing the result back through T's float
// constructor.
template<typename T>
inline T operator+(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs + rhs);
}

template<typename T>
inline T operator-(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs - rhs);
}

template<typename T>
inline T operator*(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs * rhs);
}

template<typename T>
inline T operator/(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return T(lhs / rhs);
}

// Compound assignment: delegates to the binary operator above and
// returns a copy of the updated left-hand side.
template<typename T>
inline T operator+=(T &a, T b)
{
    return a = a + b;
}

template<typename T>
inline T operator-=(T &a, T b)
{
    return a = a - b;
}

template<typename T>
inline T operator*=(T &a, T b)
{
    return a = a * b;
}

template<typename T>
inline T operator/=(T &a, T b)
{
    return a = a / b;
}
// Comparison operators
// All comparisons are made on the float representations of the two
// operands, so they follow IEEE-754 float comparison semantics.
template<typename T>
inline bool operator<(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs < rhs;
}

template<typename T>
inline bool operator>(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs > rhs;
}

template<typename T>
inline bool operator<=(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs <= rhs;
}

template<typename T>
inline bool operator>=(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs >= rhs;
}

template<typename T>
inline bool operator==(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs == rhs;
}

template<typename T>
inline bool operator!=(T a, T b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs != rhs;
}
} // namespace AMDGPU
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__

View File

@@ -0,0 +1,104 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <gtest/gtest.h>
#include "arch/amdgpu/common/dtype/mxfp_types.hh"
// Round-trip check for a single raw MXFP bit pattern: convert to float
// and back, then verify the bit pattern is unchanged. Inf/NaN values
// are exempt, as some MXFP types have multiple encodings for them.
//
// @param raw_mxfp  MXFP value with the raw pattern already installed.
// @param bits      total bit width of the type. Unused here, but kept
//                  (and marked) for signature compatibility with
//                  callers such as test_type.
template<typename T>
bool test_raw_mxfp(T raw_mxfp, [[maybe_unused]] int bits)
{
    float tmp = float(raw_mxfp);
    T from_float(tmp);

    // Pass if the bits round-trip exactly, or if the value is Inf/NaN
    // (exempt from the exact-match requirement).
    return raw_mxfp.data == from_float.data
        || std::isnan(tmp) || std::isinf(tmp);
}
// Exhaustively exercise every raw encoding of MXFP type T (2^bits
// patterns) through test_raw_mxfp and return the number of encodings
// that fail the round-trip check.
template<typename T>
int test_type(int bits)
{
    const int num_encodings = 1 << bits;
    int failures = 0;
    T value;

    for (int enc = 0; enc < num_encodings; ++enc) {
        // Raw data is aligned to the MSb in MXFP types; shift into place.
        value.data = enc << (32 - bits);
        if (!test_raw_mxfp(value, bits)) {
            ++failures;
        }
    }

    return failures;
}
// Exhaustive round-trip tests: every raw encoding of each MXFP type
// must survive a convert-to-float-and-back cycle (see test_type above).
// bfloat16 variant (mxbfloat16 = mxfp<fp16_e8m7_info>).
TEST(MxfpTest, MxBf16Test)
{
    using T = gem5::AMDGPU::mxbfloat16;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}
// float16 variant (mxfloat16 = mxfp<fp16_e5m10_info>).
TEST(MxfpTest, MxFp16Test)
{
    using T = gem5::AMDGPU::mxfloat16;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}
// bfloat8 variant (mxbfloat8 = mxfp<fp8_e5m2_info>).
TEST(MxfpTest, MxBf8Test)
{
    using T = gem5::AMDGPU::mxbfloat8;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}
// float8 variant (mxfloat8 = mxfp<fp8_e4m3_info>).
TEST(MxfpTest, MxFp8Test)
{
    using T = gem5::AMDGPU::mxfloat8;
    int errors = test_type<T>(T::size());
    EXPECT_EQ(errors, 0);
}

View File

@@ -0,0 +1,309 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__
#include <cassert>
#include "arch/amdgpu/common/dtype/mxfp_type_info.hh"
#include "base/bitfield.hh"
namespace gem5
{
namespace AMDGPU
{
// The various rounding modes for microscaling formats. roundTiesToEven must
// be supported. Other rounding modes may be supported.
enum mxfpRoundingMode
{
    roundTiesToEven,  // round to nearest, ties to even (required mode)
    roundStochastic   // probabilistic rounding driven by a seed value
};
// Conversion functions - For instructions that convert from one microscaling
// format to another. We only need the conversion functions as there do not
// appear to be any instructions yet which operate directly on the MX formats.
//
// in - An MXFP info struct type
// mode - rounding mode
// seed - input value for stochastic rounding function
/**
 * Convert a value between two microscaling (MXFP) info formats.
 *
 * @tparam dFMT destination info union type
 * @tparam sFMT source info union type
 * @param in    source value
 * @param mode  rounding mode (roundTiesToEven is the required mode)
 * @param seed  input value for the stochastic rounding function
 * @return the converted value in the destination format
 */
template<typename dFMT, typename sFMT>
dFMT convertMXFP(sFMT in, mxfpRoundingMode mode = roundTiesToEven,
                 uint32_t seed = 0)
{
    // We assume that *both* exponent and mantissa bits are both >= or <=
    // the target type. Checkable at compile time.
    //
    // This is not necessarily a limitation, others just are not implemented.
    // Figuring this out would be interesting for converting FP8 <-> BF8 for
    // example. So far all GPU conversion instructions convert explicitly to
    // a larger type from a smaller type or smaller to larger.
    static_assert(((int(sFMT::mbits) >= int(dFMT::mbits)) &&
                   (int(sFMT::ebits) >= int(dFMT::ebits)))
               || ((int(sFMT::mbits) <= int(dFMT::mbits)) &&
                   (int(sFMT::ebits) <= int(dFMT::ebits))));

    dFMT out;
    out.storage = 0;

    if (int(sFMT::mbits) >= int(dFMT::mbits) &&
        int(sFMT::ebits) >= int(dFMT::ebits)) {
        // Input format is larger, truncate and round mantissa. MX formats
        // are subnormal if exp == 0. Zero out exp in that case.
        if (std::isnan(in)) {
            // For types with no NaN return max value.
            if (std::numeric_limits<dFMT>::has_quiet_NaN) {
                out = std::numeric_limits<dFMT>::quiet_NaN();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (std::isinf(in)) {
            // For types with no Inf return max value.
            if (std::numeric_limits<dFMT>::has_infinity) {
                out = std::numeric_limits<dFMT>::infinity();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (in.mant == 0 && in.exp == 0) {
            // All MX formats FP32, and FP64 encode 0 as all zeros. Keep sign.
            out.mant = 0;
            out.exp = 0;
            out.sign = in.sign;
        } else {
            // Extra bits are needed for the mantissa conversion.
            uint32_t mant = in.mant & mask(sFMT::mbits);
            int32_t exp = in.exp - sFMT::bias + dFMT::bias;
            out.sign = in.sign;

            // Input is not subnormal, add the implicit 1 bit.
            if (in.exp) {
                mant |= (1 << sFMT::mbits);
            }
            mant >>= (sFMT::mbits - dFMT::mbits);

            // Output became subnormal
            if (exp < 1) {
                int shift = 1 - exp;
                mant >>= shift;
                out.exp = 0;
            } else {
                out.exp = exp;
            }
            mant &= mask(dFMT::mbits);
            out.mant = mant;

            // roundTiesToEven is the only required rounding mode for MXFP
            // types. Here we take the original mantissa and check the final
            // bit which is shifted out when converting the mantissa. If that
            // value is one, then we should round up to the next representable
            // number. If the value is one and all other discarded mantissa
            // bits are zero, round towards the number which has an even (0)
            // bit value in the least significant mantissa bit.
            //
            // For denormals, the process is similar however we check the nth
            // bit of the converted mantissa, where n is the absolute value of
            // the converted exponent. If the value of |exp| is larger than
            // the max exponent, round to zero. If it is exactly equal, always
            // round up.
            //
            // If the number of destination and source format mantissa bits are
            // the same, the mantissa is unchanged.
            if (int(sFMT::mbits) > int(dFMT::mbits)
                    && mode == roundTiesToEven) {
                bool round_up = false;
                int check_shift = sFMT::mbits - dFMT::mbits - 1;
                uint32_t check_mant = in.mant & mask(sFMT::mbits);
                check_mant >>= check_shift;

                // out.exp == 0 means subnormal
                if (out.exp == 0) {
                    check_mant = in.mant >> (sFMT::mbits - dFMT::mbits);
                    uint32_t max_exp = mask(dFMT::ebits);
                    if (-exp > max_exp) {
                        // if exp < -(1 << dFMT::ebits), result should be 0
                        round_up = false;
                    } else if (-exp == max_exp) {
                        // if exp == -(1 << dFMT::ebits), round up
                        round_up = true;
                    } else {
                        // Use the |exp|'th bit to determine rounding
                        int check_bit = 1 << -exp;
                        round_up = (check_mant & check_bit);
                    }
                } else {
                    round_up = (check_mant & 0x1);
                }

                // For roundTiesToEven, if we are exactly between two
                // representable numbers, pick the one with an even least
                // significant mantissa bit. We are exactly between when
                // all discarded bits *below* the guard bit are 0 (i.e.,
                // !sticky). The mask must exclude the guard bit itself:
                // the previous mask(sFMT::mbits - dFMT::mbits) included
                // it, so whenever round_up was set sticky was non-zero
                // and the tie-to-even demotion below was unreachable,
                // yielding round-half-up behavior.
                // NOTE(review): on the subnormal path the effective
                // guard bit sits above this mask, so the tie test there
                // remains approximate -- confirm against the spec.
                int sticky = in.mant & mask(sFMT::mbits - dFMT::mbits - 1);
                if (round_up && !sticky) {
                    if (!(out.mant & 1)) {
                        round_up = false;
                    }
                }

                if (round_up) {
                    if (out.mant == mask(dFMT::mbits)) {
                        // mantissa at max value, increment exponent if not inf
                        if (out.exp != mask(dFMT::ebits)) {
                            out.exp++;
                        }
                        out.mant = 0;
                    } else {
                        out.mant++;
                    }
                }
            } else if (int(sFMT::mbits) > int(dFMT::mbits)
                    && mode == roundStochastic) {
                // Use the discarded mantissa divided by the max mantissa of
                // the source format to determine the probability of rounding
                // up. An alternate implementation of this would be to get a
                // random number and add that to the input mantissa. Then
                // follow the normal rounding path above.
                uint32_t discarded = in.mant & mask(sFMT::mbits - dFMT::mbits);
                uint32_t max_mant = mask(sFMT::mbits);
                float round_prob = float(discarded) / float(max_mant);

                // Use a stochastic rounding function with the seed value to
                // determine compare probability. This is implemented as a
                // "Galois LFSR."
                auto srFunc = [](uint32_t in) {
                    uint32_t bit = (in ^ (in >> 1) ^ (in >> 3) ^ (in >> 12));
                    return (in >> 1) | (bit << 15);
                };

                // Assume stochastic rounding returns up to max uint32_t.
                // This will return an FP value between 0.0f and 1.0f.
                float draw_prob = float(srFunc(seed))
                                / float(std::numeric_limits<uint32_t>::max());

                // Round up if the number we drew is less than the rounding
                // probability. E.g., if round_prob is 90% (0.9) we choose
                // values 0.0f - 0.90f to round up.
                if (round_prob >= draw_prob) {
                    if (out.mant == mask(dFMT::mbits)) {
                        // mantissa at max value, increment exponent if not inf
                        if (out.exp != mask(dFMT::ebits)) {
                            out.exp++;
                        }
                        out.mant = 0;
                    } else {
                        out.mant++;
                    }
                }
            }
        }
    } else if (int(sFMT::mbits) <= int(dFMT::mbits) &&
               int(sFMT::ebits) <= int(dFMT::ebits)) {
        // Input format is smaller. Extend mantissa / exponent and pad with 0.
        // Should be the same for all non-stochastic rounding modes.
        if (std::isnan(in)) {
            // For types with no NaN return max value.
            if (std::numeric_limits<dFMT>::has_quiet_NaN) {
                out = std::numeric_limits<dFMT>::quiet_NaN();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (std::isinf(in)) {
            // For types with no Inf return max value.
            if (std::numeric_limits<dFMT>::has_infinity) {
                out = std::numeric_limits<dFMT>::infinity();
            } else {
                out = std::numeric_limits<dFMT>::max();
            }
        } else if (in.mant == 0 && in.exp == 0) {
            // All MX formats FP32, and FP64 encode 0 as all zeros. Keep sign.
            out.mant = 0;
            out.exp = 0;
            out.sign = in.sign;
        } else {
            out.mant = in.mant << (dFMT::mbits - sFMT::mbits);
            out.exp = in.exp + dFMT::bias - sFMT::bias;
            out.sign = in.sign;

            // Normalize input denormals
            if (!in.exp && int(sFMT::ebits) != int(dFMT::ebits)) {
                uint32_t m = out.mant;
                if (m != 0) {
                    out.exp++;
                    while (!(m >> dFMT::mbits)) {
                        m <<= 1;
                        out.exp--;
                    }
                    out.mant = m & mask(dFMT::mbits);
                }
            } else if (!in.exp) {
                // Exponent is the same, but output is not denorm, so add
                // implicit 1. This is specific mainly to bf16 -> f32.
                uint32_t m = out.mant;
                m <<= 1;
                out.mant = m & mask(dFMT::mbits);
            }
        }
    } else {
        assert(false);
    }

    return out;
}
// Minimum biased exponent of a normal value in format FMT. Biased
// exponent 0 is reserved for zero/subnormal encodings, so the minimum
// normal exponent is always 1.
template<typename FMT>
int min_exp()
{
    return 1;
}
// Maximum value representable in FMT's biased exponent field, i.e.,
// all exponent bits set.
template<typename FMT>
int max_exp()
{
    const int ebits = int(FMT::ebits);
    return (1 << ebits) - 1;
}
} // namespace AMDGPU
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__

View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__
#include "arch/amdgpu/common/dtype/binary32.hh"
#include "arch/amdgpu/common/dtype/fp16_e5m10.hh"
#include "arch/amdgpu/common/dtype/fp16_e8m7.hh"
#include "arch/amdgpu/common/dtype/fp8_e4m3.hh"
#include "arch/amdgpu/common/dtype/fp8_e5m2.hh"
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__

View File

@@ -0,0 +1,53 @@
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__
#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__
#include "arch/amdgpu/common/dtype/mxfp.hh"
namespace gem5
{
namespace AMDGPU
{
// Convenience aliases mapping each MXFP info union onto the generic
// mxfp wrapper class.
using mxbfloat8 = mxfp<fp8_e5m2_info>;    // BF8: 1 sign, 5 exp, 2 mant
using mxfloat8 = mxfp<fp8_e4m3_info>;     // FP8: 1 sign, 4 exp, 3 mant
using mxbfloat16 = mxfp<fp16_e8m7_info>;  // BF16: 1 sign, 8 exp, 7 mant
using mxfloat16 = mxfp<fp16_e5m10_info>;  // FP16: 1 sign, 5 exp, 10 mant
using mxfloat32 = mxfp<binary32>;         // IEEE-754 single precision
} // namespace AMDGPU
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,106 +0,0 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_GPU_ISA_HH__
#define __ARCH_GCN3_GPU_ISA_HH__
#include <array>
#include <type_traits>
#include "arch/amdgpu/common/tlb.hh"
#include "arch/amdgpu/gcn3/gpu_registers.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/misc.hh"
namespace gem5
{
class Wavefront;
namespace Gcn3ISA
{
/**
 * Per-wavefront GCN3 ISA state: inline integer constant registers,
 * misc register access, and PC advancement.
 */
class GPUISA
{
  public:
    GPUISA(Wavefront &wf);

    /**
     * Read an inline integer constant operand. Returns the positive or
     * negative inline-constant value selected by opIdx, or 0 if opIdx
     * falls in neither range.
     */
    template<typename T> T
    readConstVal(int opIdx) const
    {
        // The integral-only contract depends only on T, a compile-time
        // property, so enforce it with static_assert rather than the
        // original run-time panic_if on a constant condition.
        static_assert(std::is_integral_v<T>,
                      "Constant values must be an integer.");

        T val(0);

        if (isPosConstVal(opIdx)) {
            val = (T)readPosConstReg(opIdx);
        }

        if (isNegConstVal(opIdx)) {
            val = (T)readNegConstReg(opIdx);
        }

        return val;
    }

    ScalarRegU32 readMiscReg(int opIdx) const;
    void writeMiscReg(int opIdx, ScalarRegU32 operandVal);
    bool hasScalarUnit() const { return true; }
    void advancePC(GPUDynInstPtr gpuDynInst);

  private:
    // Look up a positive inline constant; opIdx is offset by the base
    // of the positive constant register range.
    ScalarRegU32 readPosConstReg(int opIdx) const
    {
        return posConstRegs[opIdx - REG_INT_CONST_POS_MIN];
    }

    // Look up a negative inline constant; opIdx is offset by the base
    // of the negative constant register range.
    ScalarRegI32 readNegConstReg(int opIdx) const
    {
        return negConstRegs[opIdx - REG_INT_CONST_NEG_MIN];
    }

    static const std::array<const ScalarRegU32, NumPosConstRegs>
        posConstRegs;
    static const std::array<const ScalarRegI32, NumNegConstRegs>
        negConstRegs;

    // parent wavefront
    Wavefront &wavefront;

    // shader status bits
    StatusReg statusReg;

    // memory descriptor reg
    ScalarRegU32 m0;
};
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_GPU_ISA_HH__

View File

@@ -1,190 +0,0 @@
/*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/insts/op_encodings.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
namespace gem5
{
/**
* Helper function for instructions declared in op_encodings. This function
* takes in all of the arguments for a given memory request we are trying to
* initialize, then submits the request or requests depending on if the
* original request is aligned or unaligned.
*/
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
                 bool is_atomic=false)
{
    // local variables
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = 0, split_addr = 0;
    bool misaligned_acc = false;
    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;

    // One request (or request pair) per active lane of the wavefront.
    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vaddr = gpuDynInst->addr[lane];

            /**
             * the base address of the cache line where the last
             * byte of the request will be stored.
             */
            split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is
             * greater than the address of the first byte then we have
             * a misaligned access.
             */
            misaligned_acc = split_addr > vaddr;

            if (is_atomic) {
                // make sure request is word aligned
                assert((vaddr & 0x3) == 0);

                // a given lane's atomic can't cross cache lines
                assert(!misaligned_acc);

                // Atomic: request carries an atomic op functor built
                // from this lane's slots of a_data/x_data.
                req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId,
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
            } else {
                req = std::make_shared<Request>(vaddr, req_size, 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
            }

            if (misaligned_acc) {
                // Two outstanding packets for this lane: one per cache
                // line touched, split at the line boundary.
                gpuDynInst->setStatusVector(lane, 2);
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                pkt1 = new Packet(req1, mem_req_type);
                pkt2 = new Packet(req2, mem_req_type);
                // The second packet's data starts where the first
                // request's bytes end within this lane's d_data slots.
                pkt1->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                pkt2->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N +
                                         req1->getSize()/sizeof(T)]);
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                        "request for %#x\n", gpuDynInst->cu_id,
                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                        split_addr);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
            } else {
                // Aligned access: a single packet for this lane.
                gpuDynInst->setStatusVector(lane, 1);
                gpuDynInst->setRequestFlags(req);
                pkt = new Packet(req, mem_req_type);
                pkt->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
            }
        } else { // if lane is not active, then no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
/**
 * Helper function for scalar instructions declared in op_encodings. This
 * function takes in all of the arguments for a given memory request we are
 * trying to initialize, then submits the request or requests depending on if
 * the original request is aligned or unaligned.
 */
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    const int total_size = N * sizeof(T);
    const int line_size = gpuDynInst->computeUnit()->cacheLineSize();
    const Addr vaddr = gpuDynInst->scalarAddr;

    /**
     * Base address of the cache line that holds the last byte of the
     * request. If it lies above the address of the first byte, the
     * access straddles a cache-line boundary and must be split.
     */
    const Addr split_addr = roundDown(vaddr + total_size - 1, line_size);
    assert(split_addr <= vaddr || split_addr - vaddr < line_size);

    RequestPtr req = std::make_shared<Request>(vaddr, total_size, 0,
        gpuDynInst->computeUnit()->requestorId(), 0,
        gpuDynInst->wfDynId);

    if (split_addr <= vaddr) {
        // Aligned access: one request/packet covers the whole transfer.
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(req);
        PacketPtr pkt = new Packet(req, mem_req_type);
        pkt->dataStatic(gpuDynInst->scalar_data);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
    } else {
        // Misaligned access: split at the line boundary and issue two
        // requests, the second one picking up where the first ends.
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        gpuDynInst->numScalarReqs = 2;
        gpuDynInst->setRequestFlags(req1);
        gpuDynInst->setRequestFlags(req2);
        PacketPtr pkt1 = new Packet(req1, mem_req_type);
        PacketPtr pkt2 = new Packet(req2, mem_req_type);
        pkt1->dataStatic(gpuDynInst->scalar_data);
        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, split_addr);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
    }
}
} // namespace gem5
#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__

View File

@@ -1,233 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_REGISTERS_HH__
#define __ARCH_GCN3_REGISTERS_HH__
#include <array>
#include <cstdint>
#include <string>
#include "arch/generic/vec_reg.hh"
#include "base/intmath.hh"
#include "base/logging.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * Operand selector values used by the GCN3 instruction encodings. A
 * selector either names an architectural register (SGPR/VGPR ranges,
 * VCC, EXEC, trap temporaries, ...) or an inline constant value.
 */
enum OpSelector : int
{
    // scalar general-purpose registers
    REG_SGPR_MIN = 0,
    REG_SGPR_MAX = 101,
    // special scalar registers
    REG_FLAT_SCRATCH_LO = 102,
    REG_FLAT_SCRATCH_HI = 103,
    REG_XNACK_MASK_LO = 104,
    REG_XNACK_MASK_HI = 105,
    REG_VCC_LO = 106,
    REG_VCC_HI = 107,
    REG_TBA_LO = 108,
    REG_TBA_HI = 109,
    REG_TMA_LO = 110,
    REG_TMA_HI = 111,
    // trap-handler temporary registers
    REG_TTMP_0 = 112,
    REG_TTMP_1 = 113,
    REG_TTMP_2 = 114,
    REG_TTMP_3 = 115,
    REG_TTMP_4 = 116,
    REG_TTMP_5 = 117,
    REG_TTMP_6 = 118,
    REG_TTMP_7 = 119,
    REG_TTMP_8 = 120,
    REG_TTMP_9 = 121,
    REG_TTMP_10 = 122,
    REG_TTMP_11 = 123,
    REG_M0 = 124,
    REG_RESERVED_1 = 125,
    REG_EXEC_LO = 126,
    REG_EXEC_HI = 127,
    // inline integer constants: 128 is zero, 129-192 map to 1..64,
    // 193-208 map to -1..-16 (see NumPosConstRegs/NumNegConstRegs)
    REG_ZERO = 128,
    REG_INT_CONST_POS_MIN = 129,
    REG_INT_CONST_POS_MAX = 192,
    REG_INT_CONST_NEG_MIN = 193,
    REG_INT_CONST_NEG_MAX = 208,
    REG_RESERVED_2 = 209,
    REG_RESERVED_3 = 210,
    REG_RESERVED_4 = 211,
    REG_RESERVED_5 = 212,
    REG_RESERVED_6 = 213,
    REG_RESERVED_7 = 214,
    REG_RESERVED_8 = 215,
    REG_RESERVED_9 = 216,
    REG_RESERVED_10 = 217,
    REG_RESERVED_11 = 218,
    REG_RESERVED_12 = 219,
    REG_RESERVED_13 = 220,
    REG_RESERVED_14 = 221,
    REG_RESERVED_15 = 222,
    REG_RESERVED_16 = 223,
    REG_RESERVED_17 = 224,
    REG_RESERVED_18 = 225,
    REG_RESERVED_19 = 226,
    REG_RESERVED_20 = 227,
    REG_RESERVED_21 = 228,
    REG_RESERVED_22 = 229,
    REG_RESERVED_23 = 230,
    REG_RESERVED_24 = 231,
    REG_RESERVED_25 = 232,
    REG_RESERVED_26 = 233,
    REG_RESERVED_27 = 234,
    REG_RESERVED_28 = 235,
    REG_RESERVED_29 = 236,
    REG_RESERVED_30 = 237,
    REG_RESERVED_31 = 238,
    REG_RESERVED_32 = 239,
    // inline floating-point constants
    REG_POS_HALF = 240,
    REG_NEG_HALF = 241,
    REG_POS_ONE = 242,
    REG_NEG_ONE = 243,
    REG_POS_TWO = 244,
    REG_NEG_TWO = 245,
    REG_POS_FOUR = 246,
    REG_NEG_FOUR = 247,
    REG_PI = 248,
    /* NOTE: SDWA and SWDA both refer to sub d-word addressing */
    REG_SRC_SWDA = 249,
    REG_SRC_DPP = 250,
    REG_VCCZ = 251,
    REG_EXECZ = 252,
    REG_SCC = 253,
    REG_LDS_DIRECT = 254,
    // a 32-bit literal follows the instruction in the stream
    REG_SRC_LITERAL = 255,
    // vector general-purpose registers
    REG_VGPR_MIN = 256,
    REG_VGPR_MAX = 511
};
// Upper bound on the size of a single operand, in DWORDs.
constexpr size_t MaxOperandDwords(16);

// Number of lanes (work-items) per vector register / wavefront.
const int NumVecElemPerVecReg(64);

// op selector values 129 - 192 correspond to const values 1 - 64
const int NumPosConstRegs = REG_INT_CONST_POS_MAX
                          - REG_INT_CONST_POS_MIN + 1;

// op selector values 193 - 208 correspond to const values -1 - 16
const int NumNegConstRegs = REG_INT_CONST_NEG_MAX
                          - REG_INT_CONST_NEG_MIN + 1;

// bit-width constants used by the SDWA byte/word selection helpers
const int BITS_PER_BYTE = 8;
const int BITS_PER_WORD = 16;
const int MSB_PER_BYTE = (BITS_PER_BYTE - 1);
const int MSB_PER_WORD = (BITS_PER_WORD - 1);

// typedefs for the various sizes/types of scalar regs
typedef uint8_t ScalarRegU8;
typedef int8_t ScalarRegI8;
typedef uint16_t ScalarRegU16;
typedef int16_t ScalarRegI16;
typedef uint32_t ScalarRegU32;
typedef int32_t ScalarRegI32;
typedef float ScalarRegF32;
typedef uint64_t ScalarRegU64;
typedef int64_t ScalarRegI64;
typedef double ScalarRegF64;

// typedefs for the various sizes/types of vector reg elements
typedef uint8_t VecElemU8;
typedef int8_t VecElemI8;
typedef uint16_t VecElemU16;
typedef int16_t VecElemI16;
typedef uint32_t VecElemU32;
typedef int32_t VecElemI32;
typedef float VecElemF32;
typedef uint64_t VecElemU64;
typedef int64_t VecElemI64;
typedef double VecElemF64;

// one DWORD is the 4-byte unit all register sizes are expressed in
const int DWordSize = sizeof(VecElemU32);

/**
 * Size of a single-precision register in DWords.
 */
const int RegSizeDWords = sizeof(VecElemU32) / DWordSize;

// backing storage for one full vector register (64 x 32-bit lanes)
using VecRegContainerU32 =
    VecRegContainer<sizeof(VecElemU32) * NumVecElemPerVecReg>;
/**
 * Bitfield layout of the wavefront STATUS register; the declared field
 * widths sum to 32 bits. Field meanings follow the hardware STATUS
 * register definition -- confirm individual semantics against the GCN3
 * ISA manual before relying on them.
 */
struct StatusReg
{
    // all fields start cleared
    StatusReg() : SCC(0), SPI_PRIO(0), USER_PRIO(0), PRIV(0), TRAP_EN(0),
        TTRACE_EN(0), EXPORT_RDY(0), EXECZ(0), VCCZ(0), IN_TG(0),
        IN_BARRIER(0), HALT(0), TRAP(0), TTRACE_CU_EN(0), VALID(0),
        ECC_ERR(0), SKIP_EXPORT(0), PERF_EN(0), COND_DBG_USER(0),
        COND_DBG_SYS(0), ALLOW_REPLAY(0), INSTRUCTION_ATC(0), RESERVED(0),
        MUST_EXPORT(0), RESERVED_1(0)
    {
    }

    uint32_t SCC : 1;
    uint32_t SPI_PRIO : 2;
    uint32_t USER_PRIO : 2;
    uint32_t PRIV : 1;
    uint32_t TRAP_EN : 1;
    uint32_t TTRACE_EN : 1;
    uint32_t EXPORT_RDY : 1;
    uint32_t EXECZ : 1;
    uint32_t VCCZ : 1;
    uint32_t IN_TG : 1;
    uint32_t IN_BARRIER : 1;
    uint32_t HALT : 1;
    uint32_t TRAP : 1;
    uint32_t TTRACE_CU_EN : 1;
    uint32_t VALID : 1;
    uint32_t ECC_ERR : 1;
    uint32_t SKIP_EXPORT : 1;
    uint32_t PERF_EN : 1;
    uint32_t COND_DBG_USER : 1;
    uint32_t COND_DBG_SYS : 1;
    uint32_t ALLOW_REPLAY : 1;
    uint32_t INSTRUCTION_ATC : 1;
    uint32_t RESERVED : 3;
    uint32_t MUST_EXPORT : 1;
    uint32_t RESERVED_1 : 4;
};
// Translate an operand selector value to its assembly symbol.
std::string opSelectorToRegSym(int opIdx, int numRegs=0);

// Translate an operand selector value to a physical register index.
int opSelectorToRegIdx(int opIdx, int numScalarRegs);

// Predicates classifying an operand selector value; see the OpSelector
// enum above for the ranges each of these tests against.
bool isPosConstVal(int opIdx);
bool isNegConstVal(int opIdx);
bool isConstVal(int opIdx);
bool isLiteral(int opIdx);
bool isScalarReg(int opIdx);
bool isVectorReg(int opIdx);
bool isFlatScratchReg(int opIdx);
bool isExecMask(int opIdx);
bool isVccReg(int opIdx);
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_REGISTERS_HH__

View File

@@ -1,94 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__
#define __ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__
#include "arch/amdgpu/gcn3/gpu_registers.hh"
#include "arch/amdgpu/gcn3/operand.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * Base class for all GCN3 static (decoded) instructions; provides the
 * GCN3-specific pieces of the generic GPUStaticInst interface.
 */
class GCN3GPUStaticInst : public GPUStaticInst
{
public:
    GCN3GPUStaticInst(const std::string &opcode);
    ~GCN3GPUStaticInst();

    // default disassembly is just the opcode mnemonic
    void generateDisassembly() override { disassembly = _opcode; }

    // true if opIdx selects one of the FLAT_SCRATCH registers
    bool
    isFlatScratchRegister(int opIdx) override
    {
        return isFlatScratchReg(opIdx);
    }

    // true if opIdx selects EXEC_LO/EXEC_HI
    bool
    isExecMaskRegister(int opIdx) override
    {
        return isExecMask(opIdx);
    }

    // no per-instruction operand info is populated for GCN3
    void initOperandInfo() override { return; }

    // NOTE(review): always reports 0 -- confirm no caller depends on a
    // real operand size here.
    int getOperandSize(int opIdx) override { return 0; }

    /**
     * Return the number of tokens needed by the coalescer. In GCN3 there
     * is generally one packet per memory request per lane generated. In
     * HSAIL, the number of dest operands is used for loads and src
     * operands for stores. This method should be overriden on a per-inst
     * basis when this value differs.
     */
    int coalescerTokenCount() const override { return 1; }

    ScalarRegU32 srcLiteral() const override { return _srcLiteral; }

protected:
    void panicUnimplemented() const;

    /**
     * if the instruction has a src literal - an immediate
     * value that is part of the instruction stream - we
     * store that here
     */
    ScalarRegU32 _srcLiteral;
}; // class GCN3GPUStaticInst
} // namespace Gcn3ISA
} // namespace gem5
#endif //__ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__

View File

@@ -1,896 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
#define __ARCH_GCN3_INSTS_INST_UTIL_HH__
#include <cmath>
#include "arch/amdgpu/gcn3/gpu_registers.hh"
namespace gem5
{
// values for SDWA select operations -- which byte/word of a 32-bit
// lane an SDWA instruction reads or writes
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, /* select data[7:0] */
    SDWA_BYTE_1 = 1, /* select data[15:8] */
    SDWA_BYTE_2 = 2, /* select data[23:16] */
    SDWA_BYTE_3 = 3, /* select data[31:24] */
    SDWA_WORD_0 = 4, /* select data[15:0] */
    SDWA_WORD_1 = 5, /* select data[31:16] */
    SDWA_DWORD = 6 /* select data[31:0] */
};

// values for format of destination bits for SDWA operations -- how the
// non-selected bits of the destination are filled
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0, /* Pad all unused bits with 0 */
    SDWA_UNUSED_SEXT = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */
    SDWA_UNUSED_PRESERVE = 2 /* select data[31:0] */
};

// values for DPP operations (DPP_CTRL field); see dppInstImpl below for
// the meaning of each range
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,
    SQ_DPP_RESERVED = 0x100,
    SQ_DPP_ROW_SL1 = 0x101,
    SQ_DPP_ROW_SL15 = 0x10F,
    SQ_DPP_ROW_SR1 = 0x111,
    SQ_DPP_ROW_SR15 = 0x11F,
    SQ_DPP_ROW_RR1 = 0x121,
    SQ_DPP_ROW_RR15 = 0x12F,
    SQ_DPP_WF_SL1 = 0x130,
    SQ_DPP_WF_RL1 = 0x134,
    SQ_DPP_WF_SR1 = 0x138,
    SQ_DPP_WF_RR1 = 0x13C,
    SQ_DPP_ROW_MIRROR = 0x140,
    SQ_DPP_ROW_HALF_MIRROR = 0x141,
    SQ_DPP_ROW_BCAST15 = 0x142,
    SQ_DPP_ROW_BCAST31 = 0x143
};

static const int ROW_SIZE = 16; /* 16 registers per row */
static const int NUM_BANKS = 4; /* 64 registers, 16/bank */
namespace Gcn3ISA
{
/**
 * Expand a mask so that any set bit within a 4-bit quad sets the entire
 * quad in the result (whole-quad-mode execution mask).
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T result = 0;

    // Walk one nibble at a time; copy the whole nibble into the result
    // whenever the input has at least one bit set inside it.
    for (T quad = T(0xF); quad != 0; quad <<= 4) {
        if (val & quad) {
            result |= quad;
        }
    }

    return result;
}
/**
 * Compress a lane mask quad-wise: bit i of the result is set iff the
 * i-th 4-bit group of val contains at least one set bit.
 */
template<typename T>
inline T
quadMask(T val)
{
    T result = 0;
    int quad = 0;

    for (T group = T(0xF); group != 0; group <<= 4, ++quad) {
        if (val & group) {
            result |= T(1) << quad;
        }
    }

    return result;
}
/**
 * Count the zero bits in val: the type's bit width minus its popcount.
 */
template<typename T>
inline ScalarRegI32
countZeroBits(T val)
{
    return std::numeric_limits<T>::digits - popCount(val);
}
/**
 * Position of the least-significant clear bit of val, or -1 when every
 * bit is set.
 */
template<typename T>
inline ScalarRegI32
findFirstZero(T val)
{
    // The lowest clear bit of val is the lowest set bit of its
    // complement; an all-zero complement means there is none.
    const T inverted = static_cast<T>(~val);
    return inverted ? findLsbSet(inverted) : -1;
}
/**
 * Position of the least-significant set bit of val, or -1 for zero.
 */
template<typename T>
inline ScalarRegI32
findFirstOne(T val)
{
    return val ? findLsbSet(val) : -1;
}
/**
 * Position of the most-significant set bit of val, or -1 for zero.
 */
template<typename T>
inline ScalarRegI32
findFirstOneMsb(T val)
{
    return val ? findMsbSet(val) : -1;
}
/**
 * Count the leading zero bits above the most-significant set bit; an
 * all-zero input has no set bit and is reported as -1.
 */
template<typename T>
inline ScalarRegI32
countZeroBitsMsb(T val)
{
    if (!val) {
        return -1;
    }

    return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
}
/**
 * Scanning from the MSB downward, return the position of the first bit
 * that differs from the sign bit; -1 when no such bit exists (i.e., the
 * value is all zeros or all ones).
 */
inline ScalarRegI32
firstOppositeSignBit(ScalarRegI32 val)
{
    // All-zeros and all-ones patterns contain no opposite-sign bit.
    if (!val || val == -1) {
        return -1;
    }

    const bool sign_bit = (val & 0x80000000) != 0;

    for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
        const bool bit_set = (val & (0x80000000 >> i)) != 0;
        if (bit_set != sign_bit) {
            return i;
        }
    }

    return -1;
}
/**
 * 64-bit overload: scanning from the MSB downward, return the position
 * of the first bit that differs from the sign bit; -1 when the value is
 * all zeros or all ones.
 */
inline ScalarRegI32
firstOppositeSignBit(ScalarRegI64 val)
{
    // All-zeros and all-ones patterns contain no opposite-sign bit.
    if (!val || val == -1) {
        return -1;
    }

    const bool sign_bit = (val & 0x8000000000000000ULL) != 0;

    for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
        const bool bit_set = (val & (0x8000000000000000ULL >> i)) != 0;
        if (bit_set != sign_bit) {
            return i;
        }
    }

    return -1;
}
/**
 * Median of three values.
 *
 * Floating-point types use the fmin/fmax family (NaN-aware per IEEE
 * semantics); integral types use std::min/std::max. `if constexpr`
 * discards the branch that does not apply to T at compile time, so the
 * floating-point path is never instantiated for integer operands (the
 * original runtime `if` instantiated both branches for every T).
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if constexpr (std::is_floating_point_v<T>) {
        return std::fmax(std::fmin(val_0, val_1),
                         std::fmin(std::fmax(val_0, val_1), val_2));
    } else {
        return std::max(std::min(val_0, val_1),
                        std::min(std::max(val_0, val_1), val_2));
    }
}
/**
 * Round val to the nearest integer, with ties (exact .5 fractions)
 * rounded to the even neighbor (banker's rounding).
 */
template <typename T>
inline T roundNearestEven(T val)
{
    T int_part = 0;
    T rounded = std::floor(val + 0.5);

    // Exactly halfway between two integers with an even lower neighbor:
    // floor(val + 0.5) picked the odd upper value, so step back down to
    // the even one.
    if (std::modf(std::abs(val), &int_part) == 0.5
        && (int)std::floor(val) % 2 == 0) {
        rounded = rounded - 1;
    }

    return rounded;
}
/**
 * dst = val_0 * val_1 + val_2 computed in 128 bits so the 32x32
 * multiply plus the 64-bit addend cannot overflow; returns 1 if the
 * result carried out of bit 63, else 0.
 */
inline VecElemU32
muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1,
       VecElemU64 val_2)
{
    __uint128_t product = (__uint128_t)val_0 * val_1 + val_2;
    dst = (VecElemU64)product;
    return (product >> 64) ? 1 : 0;
}
/**
 * Signed variant: dst = val_0 * val_1 + val_2 computed in 128 bits;
 * returns 1 if any bits above bit 63 are set in the wide result
 * (overflow out of the 64-bit destination), else 0.
 */
inline VecElemU32
muladd(VecElemI64 &dst, VecElemI32 val_0, VecElemI32 val_1,
       VecElemI64 val_2)
{
    __int128_t product = (__int128_t)val_0 * val_1 + val_2;
    dst = (VecElemI64)product;
    return (product >> 64) ? 1 : 0;
}
/**
 * dppInstImpl is a helper function that performs the inputted operation
 * on the inputted vector register lane. The returned output lane
 * represents the input lane given the destination lane and DPP_CTRL word.
 *
 * Currently the values are:
 * 0x0 - 0xFF: full permute of four threads
 * 0x100: reserved
 * 0x101 - 0x10F: row shift left by 1-15 threads
 * 0x111 - 0x11F: row shift right by 1-15 threads
 * 0x121 - 0x12F: row rotate right by 1-15 threads
 * 0x130: wavefront left shift by 1 thread
 * 0x134: wavefront left rotate by 1 thread
 * 0x138: wavefront right shift by 1 thread
 * 0x13C: wavefront right rotate by 1 thread
 * 0x140: mirror threads within row
 * 0x141: mirror threads within 1/2 row (8 threads)
 * 0x142: broadcast 15th thread of each row to next row
 * 0x143: broadcast thread 31 to rows 2 and 3
 */
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
                int rowOffset, bool & outOfBounds)
{
    // local variables
    // newLane will be the same as the input lane unless swizzling happens
    int newLane = currLane;
    // for shift/rotate permutations; positive values are LEFT rotates
    int count = 1;
    int localRowOffset = rowOffset;
    int localRowNum = rowNum;

    if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
        // each 2-bit field of dppCtrl selects the source pixel within
        // this lane's quad of four threads
        int quadBase = (currLane & ~(3));
        int quadPix = (currLane & 3);
        quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
        newLane = (quadBase | quadPix);
    } else if (dppCtrl == SQ_DPP_RESERVED) {
        panic("ERROR: instruction using reserved DPP_CTRL value\n");
    } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
               (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
        // NOTE(review): this produces the same (non-positive) count as
        // the SR case below, so row-shift-left and row-shift-right
        // currently behave identically; per the comment above, LEFT
        // shifts should use a positive count -- verify against the
        // DPP_CTRL spec.
        count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1);

        if ((localRowOffset + count >= 0) &&
            (localRowOffset + count < ROW_SIZE)) {
            localRowOffset += count;
            // NOTE(review): this OR assumes rowNum is the row's base
            // lane (a multiple of ROW_SIZE), but the caller passes
            // lane / ROW_SIZE -- confirm.
            newLane = (rowNum | localRowOffset);
        } else {
            outOfBounds = true;
        }
    } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
               (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
        count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1);

        if ((localRowOffset + count >= 0) &&
            (localRowOffset + count < ROW_SIZE)) {
            localRowOffset += count;
            newLane = (rowNum | localRowOffset);
        } else {
            outOfBounds = true;
        }
    } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
               (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
        // rotate wraps within the row instead of going out of bounds
        count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1);
        localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
        newLane = (rowNum | localRowOffset);
    } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
        count = 1;

        // NOTE(review): this bound check can never fail (currLane is
        // always within [0, NumVecElemPerVecReg)), so lane 63 reads
        // lane 64; the SR1 case below checks currLane + count instead
        // -- confirm intended.
        if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
            newLane += count;
        } else {
            outOfBounds = true;
        }
    } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
        count = 1;
        newLane = (currLane + count + NumVecElemPerVecReg) %
            NumVecElemPerVecReg;
    } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
        count = -1;
        int currVal = (currLane + count);

        if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
            newLane += count;
        } else {
            outOfBounds = true;
        }
    } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
        count = -1;
        newLane = (currLane + count + NumVecElemPerVecReg) %
            NumVecElemPerVecReg;
    } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
        localRowOffset = (15 - localRowOffset);
        newLane = (rowNum | localRowOffset);
    } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
        // NOTE(review): '& -0x7' looks like it should be '& ~0x7' (half
        // -row base), and '7 - localRowNum' like '7 - localRowOffset'
        // (mirror the offset within the half row) -- confirm against
        // the spec before relying on this case.
        localRowNum = (currLane & -0x7);
        localRowOffset = (currLane & 0x7);
        localRowOffset = (7 - localRowNum);
        newLane = (localRowNum | localRowOffset);
    } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
        // lanes above 15 read the last lane of the previous row
        count = 15;

        if (currLane > count) {
            newLane = (currLane & ~count) - 1;
        }
    } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
        // lanes above 31 read lane 31
        count = 31;

        if (currLane > count) {
            newLane = (currLane & ~count) - 1;
        }
    } else {
        panic("Unimplemented DPP control operation: %d\n", dppCtrl);
    }

    return newLane;
}
/**
 * processDPP is a helper function for implementing Data Parallel Primitive
 * instructions. This function may be called by many different VOP1
 * instructions to do operations within a register.
 *
 * @param gpuDynInst dynamic instruction state; only its exec mask is read.
 * @param dppInst decoded DPP word (DPP_CTRL, masks, modifiers).
 * @param src0 vector source operand, permuted/zeroed in place.
 */
template<typename T>
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
                T & src0)
{
    // local variables
    SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
    int boundCtrl = dppInst.BOUND_CTRL;
    int bankMask = dppInst.BANK_MASK;
    int rowMask = dppInst.ROW_MASK;
    // row, bank info to be calculated per lane
    int rowNum = 0, bankNum = 0, rowOffset = 0;
    // outLane will be the same as the input lane unless swizzling happens
    int outLane = 0;
    bool laneDisabled = false;
    // flags used for determining if a lane should be written to/reset/etc.
    // NOTE(review): outOfBounds/zeroSrc are sticky across lanes (only
    // laneDisabled is reset per iteration) -- confirm intended.
    bool outOfBounds = false, zeroSrc = false;
    long long threadValid = 0;

    /**
     * STEP 1a: check if the absolute value (ABS) or negation (NEG) tags
     * are set. If so, do the appropriate action(s) on src0.
     *
     * NOTE: ABS takes priority over NEG (it is applied last).
     */
    if (dppInst.SRC0_NEG) {
        src0.negModifier();
    }

    if (dppInst.SRC0_ABS) {
        src0.absModifier();
    }

    // iterate over all register lanes, performing steps 2-4
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        threadValid = (0x1LL << lane);
        /**
         * STEP 2: check the row and bank mask values. These determine
         * which threads are enabled for the subsequent DPP_CTRL
         * operations.
         *
         * A masked-off lane must still fall through to STEP 4 so bound
         * control is applied and laneDisabled is reset at the bottom of
         * the loop; skipping the rest of the iteration here would leak
         * the disabled state into the next lane.
         */
        rowNum = (lane / ROW_SIZE);
        rowOffset = (lane % ROW_SIZE);
        bankNum = (rowOffset / NUM_BANKS);

        if (((rowMask & (0x1 << rowNum)) == 0)   /* row mask */   ||
            ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
            laneDisabled = true;
        }

        /**
         * STEP 3: Handle the potential values of DPP_CTRL:
         * 0x0 - 0xFF: full permute of four threads
         * 0x100: reserved
         * 0x101 - 0x10F: row shift left by 1-15 threads
         * 0x111 - 0x11F: row shift right by 1-15 threads
         * 0x121 - 0x12F: row rotate right by 1-15 threads
         * 0x130: wavefront left shift by 1 thread
         * 0x134: wavefront left rotate by 1 thread
         * 0x138: wavefront right shift by 1 thread
         * 0x13C: wavefront right rotate by 1 thread
         * 0x140: mirror threads within row
         * 0x141: mirror threads within 1/2 row (8 threads)
         * 0x142: broadcast 15th thread of each row to next row
         * 0x143: broadcast thread 31 to rows 2 and 3
         */
        if (!laneDisabled) {
            outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
                                  outOfBounds);
        }

        /**
         * STEP 4: Implement bound control for disabled threads. If thread
         * is disabled but boundCtrl is set, then we need to set the source
         * data to 0 (i.e., set this lane to 0).
         */
        if (laneDisabled) {
            threadValid = 0;
        } else if (outOfBounds) {
            if (boundCtrl == 1) {
                zeroSrc = true;
            } else {
                threadValid = 0;
            }
        } else if (!gpuDynInst->exec_mask[lane]) {
            if (boundCtrl == 1) {
                zeroSrc = true;
            } else {
                threadValid = 0;
            }
        }

        if (threadValid != 0 && !outOfBounds && !zeroSrc) {
            assert(!laneDisabled);
            // dppInstImpl returned the lane this destination lane
            // should read from.
            src0[lane] = src0[outLane];
        } else if (zeroSrc) {
            src0[lane] = 0;
        }

        // reset for next iteration
        laneDisabled = false;
    }
}
/**
 * processDPP is a helper function for implementing Data Parallel Primitive
 * instructions. This function may be called by many different
 * VOP2/VOPC instructions to do operations within a register.
 */
template<typename T>
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
                T & src0, T & src1)
{
    /**
     * STEP 1b: check if the absolute value (ABS) or negation (NEG) tags
     * are set. If so, do the appropriate action(s) on src0 and/or src1.
     *
     * NOTE: ABS takes priority over NEG (it is applied last).
     */
    if (dppInst.SRC1_NEG) {
        src1.negModifier();
    }

    if (dppInst.SRC1_ABS) {
        src1.absModifier();
    }

    // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
    // which is only used for negation/absolute value, call other version
    // to do everything else.
    processDPP(gpuDynInst, dppInst, src0);
}
/**
 * sdwaInstSrcImpl_helper contains the per-lane code for selecting the
 * appropriate bytes/words of the lane and doing the appropriate
 * masking/padding/sign extending. It returns the value after these
 * operations are done on it.
 *
 * @param currOperVal this lane's current operand value.
 * @param origOperVal this lane's pre-modifier operand value, used only
 *        to cross-check that updates propagated.
 * @param sel which byte/word of the lane to select (see SDWASelVals).
 * @param signExt if true, sign-extend the selected byte/word into the
 *        upper bits; otherwise the upper bits are zeroed.
 */
template<typename T>
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
                         const SDWASelVals sel, const bool signExt)
{
    // local variables
    int low_bit = 0, high_bit = 0;
    bool signExt_local = signExt;
    T retVal = 0;

    // if we're preserving all of the bits, then we can immediately return
    if (sel == SDWA_DWORD) {
        return currOperVal;
    }

    if (sel < SDWA_WORD_0) { // we are selecting 1 byte
        /*
          Process byte 0 first. This code eiter selects the original bits
          of byte 0, or makes the bits of the selected byte be byte 0 (and
          next either sign extends or zero's out upper bits).
        */
        low_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
        high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
        retVal = bits(currOperVal, high_bit, low_bit);

        // make sure update propagated, since used next
        fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) !=
                 bits(origOperVal, high_bit),
                 "ERROR: SDWA byte update not propagated: retVal: %d, "
                 "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE),
                 bits(origOperVal, high_bit));
        // sign extended value depends on upper-most bit of the new byte 0
        signExt_local = (signExt &&
                         (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));

        // process all other bytes -- if sign extending, make them 1, else
        // all 0's so leave as is
        // NOTE(review): sext's template argument here is the MSB *index*
        // (7); confirm gem5's sext<N> expects this rather than the bit
        // count (8).
        if (signExt_local) {
            retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
        }
    } else if (sel < SDWA_DWORD) { // we are selecting 1 word
        /*
          Process word 0 first. This code eiter selects the original bits
          of word 0, or makes the bits of the selected word be word 0 (and
          next either sign extends or zero's out upper bits).
        */
        low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
        high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
        retVal = bits(currOperVal, high_bit, low_bit);

        // make sure update propagated, since used next
        fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) !=
                 bits(origOperVal, high_bit),
                 "ERROR: SDWA word update not propagated: retVal: %d, "
                 "orig: %d\n",
                 bits(retVal, Gcn3ISA::MSB_PER_WORD),
                 bits(origOperVal, high_bit));
        // sign extended value depends on upper-most bit of the new word 0
        signExt_local = (signExt &&
                         (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) &
                          0x8000));

        // process other word -- if sign extending, make them 1, else all
        // 0's so leave as is
        if (signExt_local) {
            retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
        }
    } else {
        assert(sel != SDWA_DWORD); // should have returned earlier
        panic("Unimplemented SDWA select operation: %d\n", sel);
    }

    return retVal;
}
/**
* sdwaInstSrcImpl is a helper function that selects the appropriate
* bits/bytes for each lane of the inputted source operand of an SDWA
* instruction, does the appropriate masking/padding/sign extending for the
* non-selected bits/bytes, and updates the operands values with the
* resultant value.
*
* The desired behavior is:
* 1. Select the appropriate bits/bytes based on sel:
* 0 (SDWA_BYTE_0): select data[7:0]
* 1 (SDWA_BYTE_1): select data[15:8]
* 2 (SDWA_BYTE_2): select data[23:16]
* 3 (SDWA_BYTE_3): select data[31:24]
* 4 (SDWA_WORD_0): select data[15:0]
* 5 (SDWA_WORD_1): select data[31:16]
* 6 (SDWA_DWORD): select data[31:0]
* 2. if sign extend is set, then sign extend the value
*/
/**
 * Apply the SDWA byte/word selection to every lane of the source
 * operand, overwriting the operand in place.
 */
template<typename T>
void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
                     const SDWASelVals sel, const bool signExt)
{
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        currOper[ln] =
            sdwaInstSrcImpl_helper(currOper[ln], origCurrOper[ln],
                                   sel, signExt);
    }
}
/**
 * sdwaInstDstImpl_helper contains the per-lane code for selecting the
 * appropriate bytes/words of the lane and doing the appropriate
 * masking/padding/sign extending. It returns the value after these
 * operations are done on it.
 *
 * @param currDstVal this lane's newly computed (post-op) value.
 * @param origDstVal this lane's destination register value before the op.
 * @param clamp unused here; kept for interface symmetry with callers.
 * @param sel which byte/word of the destination receives the result.
 * @param unusedBits_format how non-selected bits are filled
 *        (pad / sign-extend / preserve).
 */
template<typename T>
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
                         const bool clamp, const SDWASelVals sel,
                         const SDWADstVals unusedBits_format)
{
    // local variables
    int low_bit = 0, high_bit = 0;
    bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
    //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
    bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
    T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
        origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;

    // if we're preserving all of the bits, then we can immediately return
    if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
        assert(sel == SDWA_DWORD);
        return currDstVal;
    } else if (sel == SDWA_DWORD) {
        // NOTE: users may set the unused bits variable to anything in this
        // scenario, because it will be ignored
        return currDstVal;
    }

    if (sel < SDWA_WORD_0) { // we are selecting 1 byte
        // if we sign extended depends on upper-most bit of byte 0
        signExt = (signExt &&
                   (bits(currDstVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));

        for (int byte = 0; byte < 4; ++byte) {
            low_bit = byte * Gcn3ISA::BITS_PER_BYTE;
            high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
            /*
              Options:
                1.  byte == sel: this is the selected byte, so it takes
                the newly computed (post-op) bits
                2.  preserve is set: keep this byte's original (pre-op)
                register bits
                3.  byte > sel && signExt: we're sign extending and
                this byte is one of the bytes we need to sign extend
             */
            origBits_thisByte = bits(origDstVal, high_bit, low_bit);
            currBits_thisByte = bits(currDstVal, high_bit, low_bit);
            newBits = ((byte == sel) ? currBits_thisByte :
                       ((preserve) ? origBits_thisByte :
                        (((byte > sel) && signExt) ? 0xff : 0)));
            retVal = insertBits(retVal, high_bit, low_bit, newBits);
        }
    } else if (sel < SDWA_DWORD) { // we are selecting 1 word
        low_bit = 0;
        high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
        // if we sign extended depends on upper-most bit of word 0
        signExt = (signExt &&
                   (bits(currDstVal, high_bit, low_bit) & 0x8000));

        for (int word = 0; word < 2; ++word) {
            low_bit = word * Gcn3ISA::BITS_PER_WORD;
            high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
            /*
              Options:
                1.  word == sel & 1: this is the selected word, so it
                takes the newly computed (post-op) bits
                2.  preserve is set: keep this word's original (pre-op)
                register bits
                3.  word > (sel & 1) && signExt: we're sign extending and
                this word is one of the words we need to sign extend
             */
            origBits_thisWord = bits(origDstVal, high_bit, low_bit);
            currBits_thisWord = bits(currDstVal, high_bit, low_bit);
            newBits = ((word == (sel & 0x1)) ? currBits_thisWord :
                       ((preserve) ? origBits_thisWord :
                        (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
            retVal = insertBits(retVal, high_bit, low_bit, newBits);
        }
    } else {
        assert(sel != SDWA_DWORD); // should have returned earlier
        panic("Unimplemented SDWA select operation: %d\n", sel);
    }

    return retVal;
}
/**
* sdwaInstDestImpl is a helper function that selects the appropriate
* bits/bytes for the inputted dest operand of an SDWA instruction, does
* the appropriate masking/padding/sign extending for the non-selected
* bits/bytes, and updates the operands values with the resultant value.
*
* The desired behavior is:
* 1. Select the appropriate bits/bytes based on sel:
* 0 (SDWA_BYTE_0): select data[7:0]
* 1 (SDWA_BYTE_1): select data[15:8]
* 2 (SDWA_BYTE_2): select data[23:16]
* 3 (SDWA_BYTE_3): select data[31:24]
* 4 (SDWA_WORD_0): select data[15:0]
* 5 (SDWA_WORD_1): select data[31:16]
* 6 (SDWA_DWORD): select data[31:0]
* 2. either pad, sign extend, or select all bits based on the value of
* unusedBits_format:
* 0 (SDWA_UNUSED_PAD): pad all unused bits with 0
* 1 (SDWA_UNUSED_SEXT): sign-extend upper bits; pad lower bits w/ 0
* 2 (SDWA_UNUSED_PRESERVE): select data[31:0]
*/
/**
 * Apply SDWA destination-operand processing to every lane of a vector
 * destination. For each lane, sdwaInstDstImpl_helper selects the
 * bits/bytes named by sel and pads/sign-extends/preserves the unused
 * bits according to unusedBits_format; the lane is overwritten with the
 * resulting value.
 */
template<typename T>
void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
                     const SDWASelVals sel,
                     const SDWADstVals unusedBits_format)
{
    // rewrite each lane with its SDWA-processed value
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        dstOper[ln] = sdwaInstDstImpl_helper(dstOper[ln], origDstOper[ln],
                                             clamp, sel,
                                             unusedBits_format);
    }
}
/**
* processSDWA_src_helper is a helper function for implementing sub d-word
* addressing instructions for the src operands. This function may be
* called by many different VOP1/VOP2/VOPC instructions to do operations
* within a register. This function is also agnostic of which operand it
* is operating on, so that it can be called for any src operand.
*/
/**
 * Apply the SDWA source modifiers and sub-dword selection to a single
 * source operand. This is shared by the VOP1/VOP2/VOPC paths and is
 * agnostic of which source operand it is handed.
 */
template<typename T>
void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
                            const SDWASelVals src_sel,
                            const bool src_signExt, const bool src_abs,
                            const bool src_neg)
{
    /**
     * STEP 1: apply the NEG and/or ABS modifiers. NEG is applied first,
     * so when both are set ABS wins -- per the CSim implementation, ABS
     * takes priority over NEG.
     */
    if (src_neg) {
        currSrc.negModifier();
    }

    if (src_abs) {
        currSrc.absModifier();
    }

    /**
     * STEP 2: per-lane selection of the requested bits/bytes of the
     * source operand.
     */
    sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
}
/**
* processSDWA_src is a helper function for implementing sub d-word
* addressing instructions for the src operands. This function may be
* called by many different VOP1 instructions to do operations within a
* register. processSDWA_dst is called after the math, while
* processSDWA_src is called before the math.
*/
/**
 * VOP1 variant of SDWA source processing: only src0 exists. Decodes the
 * SDWA control fields for src0 and forwards to the shared helper.
 * Called before the instruction's math (processSDWA_dst runs after).
 */
template<typename T>
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
{
    // decode src0's SDWA control fields
    const SDWASelVals sel = (SDWASelVals)sdwaInst.SRC0_SEL;
    const bool sign_ext = sdwaInst.SRC0_SEXT;
    const bool neg_mod = sdwaInst.SRC0_NEG;
    const bool abs_mod = sdwaInst.SRC0_ABS;

    // VOP1 has no src1 operand, so its SDWA modifier fields must be
    // clear; only src0 is processed.
    assert(!sdwaInst.SRC1_SEXT);
    assert(!sdwaInst.SRC1_NEG);
    assert(!sdwaInst.SRC1_ABS);

    processSDWA_src_helper(src0, origSrc0, sel, sign_ext, abs_mod,
                           neg_mod);
}
/**
* processSDWA_src is a helper function for implementing sub d-word
* addressing instructions. This function may be called by many different
* VOP2/VOPC instructions to do operations within a register.
* processSDWA_dst is called after the math, while processSDWA_src is
* called before the math.
*/
/**
 * VOP2/VOPC variant of SDWA source processing: decodes the SDWA control
 * fields for both src0 and src1 and runs the shared helper on each.
 * Called before the instruction's math (processSDWA_dst runs after).
 */
template<typename T>
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
                     T & src1, T & origSrc1)
{
    // process each source operand independently, decoding its SDWA
    // control fields directly at the call site
    processSDWA_src_helper(src0, origSrc0,
                           (SDWASelVals)sdwaInst.SRC0_SEL,
                           sdwaInst.SRC0_SEXT, sdwaInst.SRC0_ABS,
                           sdwaInst.SRC0_NEG);

    processSDWA_src_helper(src1, origSrc1,
                           (SDWASelVals)sdwaInst.SRC1_SEL,
                           sdwaInst.SRC1_SEXT, sdwaInst.SRC1_ABS,
                           sdwaInst.SRC1_NEG);
}
/**
* processSDWA_dst is a helper function for implementing sub d-word
* addressing instructions for the dst operand. This function may be
* called by many different VOP1/VOP2/VOPC instructions to do operations
* within a register. processSDWA_dst is called after the math, while
* processSDWA_src is called before the math.
*/
/**
 * Apply SDWA destination processing after the instruction's math: pick
 * the destination bits named by DST_SEL and pad/sign-extend/preserve
 * the remaining bits per DST_UNUSED, honoring the CLAMP flag.
 */
template<typename T>
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
{
    // decode the destination's SDWA control fields
    const SDWADstVals unused_fmt = (SDWADstVals)sdwaInst.DST_UNUSED;
    const SDWASelVals sel = (SDWASelVals)sdwaInst.DST_SEL;
    const bool clamp = sdwaInst.CLAMP;

    /**
     * STEP 1: select the appropriate bits for dst and pad/sign-extend
     * as appropriate.
     */
    sdwaInstDstImpl(dst, origDst, clamp, sel, unused_fmt);
}
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,925 +0,0 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#include "arch/amdgpu/gcn3/gpu_decoder.hh"
#include "arch/amdgpu/gcn3/gpu_mem_helpers.hh"
#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/operand.hh"
#include "debug/GCN3.hh"
#include "debug/GPUExec.hh"
#include "mem/ruby/system/RubySystem.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * In-memory layout of a GCN3 buffer resource descriptor (V#). The
 * bit-field widths mirror the hardware descriptor so raw scalar
 * register data can be memcpy'd straight into this struct (see
 * Inst_SMEM::calcAddr and Inst_MUBUF::calcAddr below).
 * NOTE(review): field semantics follow the AMD GCN3 ISA manual --
 * confirm against that document before relying on them.
 */
struct BufferRsrcDescriptor
{
    uint64_t baseAddr : 48;     // base address of the buffer
    uint32_t stride : 14;       // bytes between records (0 => unstrided)
    uint32_t cacheSwizzle : 1;
    uint32_t swizzleEn : 1;     // enables swizzled addressing in calcAddr
    uint32_t numRecords : 32;   // buffer extent, used for range checking
    uint32_t dstSelX : 3;
    uint32_t dstSelY : 3;
    uint32_t dstSelZ : 3;
    uint32_t dstSelW : 3;
    uint32_t numFmt : 3;
    uint32_t dataFmt : 4;
    uint32_t elemSize : 2;      // element size is (2 << elemSize) bytes
    uint32_t idxStride : 2;     // index stride is (8 << idxStride)
    uint32_t addTidEn : 1;      // add lane id to the buffer index
    uint32_t atc : 1;
    uint32_t hashEn : 1;
    uint32_t heap : 1;
    uint32_t mType : 3;
    uint32_t type : 2;
};
// --- purely virtual instruction classes ---
/**
 * Base class for instructions decoded from the SOP2 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one. NOTE(review): unlike the sibling
 * encoding classes, no destructor is declared here -- confirm that is
 * intentional.
 */
class Inst_SOP2 : public GCN3GPUStaticInst
{
  public:
    Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOP2 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOP2 *);
}; // Inst_SOP2
/**
 * Base class for instructions decoded from the SOPK format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_SOPK : public GCN3GPUStaticInst
{
  public:
    Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
    ~Inst_SOPK();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPK instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOPK *);
}; // Inst_SOPK
/**
 * Base class for instructions decoded from the SOP1 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_SOP1 : public GCN3GPUStaticInst
{
  public:
    Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
    ~Inst_SOP1();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOP1 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOP1 *);
}; // Inst_SOP1
/**
 * Base class for instructions decoded from the SOPC format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_SOPC : public GCN3GPUStaticInst
{
  public:
    Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
    ~Inst_SOPC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPC instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_SOPC *);
}; // Inst_SOPC
/**
 * Base class for instructions decoded from the SOPP format. Unlike the
 * other scalar formats this one is a single DWORD with no optional
 * second DWORD.
 */
class Inst_SOPP : public GCN3GPUStaticInst
{
  public:
    Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
    ~Inst_SOPP();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_SOPP instData;
}; // Inst_SOPP
/**
 * Base class for scalar memory (SMEM) instructions. Provides helpers
 * to initiate scalar reads/writes of N dwords and to compute the
 * dword-aligned scalar address for both the plain and the buffer
 * (s_buffer_*) address forms.
 */
class Inst_SMEM : public GCN3GPUStaticInst
{
  public:
    Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
    ~Inst_SMEM();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    /**
     * initiate a memory read access for N dwords
     */
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::ReadReq);
    }

    /**
     * initiate a memory write access for N dwords
     */
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::WriteReq);
    }

    /**
     * For normal s_load_dword/s_store_dword instruction addresses.
     */
    void
    calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
             ScalarRegU32 offset)
    {
        // mask off the low two bits: the scalar access is dword-aligned
        Addr vaddr = ((addr.rawData() + offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
    }

    /**
     * For s_buffer_load_dword/s_buffer_store_dword instruction addresses.
     * The s_buffer instructions use the same buffer resource descriptor
     * as the MUBUF instructions.
     */
    void
    calcAddr(GPUDynInstPtr gpu_dyn_inst,
             ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
    {
        BufferRsrcDescriptor rsrc_desc;
        ScalarRegU32 clamped_offset(offset);
        // reinterpret the raw 128-bit SRF data as a buffer descriptor
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        /**
         * The address is clamped if:
         *     Stride is zero: clamp if offset >= num_records
         *     Stride is non-zero: clamp if offset > (stride * num_records)
         */
        if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
            clamped_offset = rsrc_desc.numRecords;
        } else if (rsrc_desc.stride && offset
                   > (rsrc_desc.stride * rsrc_desc.numRecords)) {
            clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
        }

        // final scalar address, dword-aligned
        Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
    }

    // first instruction DWORD
    InFmt_SMEM instData;
    // second instruction DWORD
    InFmt_SMEM_1 extData;
}; // Inst_SMEM
/**
 * Base class for instructions decoded from the VOP2 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_VOP2 : public GCN3GPUStaticInst
{
  public:
    Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
    ~Inst_VOP2();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP2 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_VOP2 *);
}; // Inst_VOP2
/**
 * Base class for instructions decoded from the VOP1 format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_VOP1 : public GCN3GPUStaticInst
{
  public:
    Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
    ~Inst_VOP1();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP1 instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_VOP1 *);
}; // Inst_VOP1
/**
 * Base class for instructions decoded from the VOPC format. Holds the
 * first instruction DWORD plus a possible second DWORD (extData) when
 * hasSecondDword() detects one.
 */
class Inst_VOPC : public GCN3GPUStaticInst
{
  public:
    Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
    ~Inst_VOPC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOPC instData;
    // possible second DWORD
    InstFormat extData;
    uint32_t varSize;

  private:
    bool hasSecondDword(InFmt_VOPC *);
}; // Inst_VOPC
/**
 * Base class for instructions decoded from the VINTRP format. A single
 * DWORD; note that no disassembly/operand-info overrides are declared
 * for this format.
 */
class Inst_VINTRP : public GCN3GPUStaticInst
{
  public:
    Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
    ~Inst_VINTRP();

    int instSize() const override;

  protected:
    // first instruction DWORD
    InFmt_VINTRP instData;
}; // Inst_VINTRP
/**
 * Base class for instructions decoded from the two-DWORD VOP3 format.
 * sgpr_dst (see comment below) distinguishes the few VOP3 instructions
 * whose VDST field names a scalar rather than vector destination.
 */
class Inst_VOP3 : public GCN3GPUStaticInst
{
  public:
    Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst);
    ~Inst_VOP3();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3 instData;
    // second instruction DWORD
    InFmt_VOP3_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3 *);
    /**
     * the v_cmp and readlane instructions in the VOP3
     * encoding are unique because they are the only
     * instructions that use the VDST field to specify
     * a scalar register destination. for VOP3::V_CMP insts
     * VDST specifies the arbitrary SGPR pair used to write
     * VCC. for V_READLANE VDST specifies the SGPR to return
     * the value of the selected lane in the source VGPR
     * from which we are reading.
     */
    const bool sgprDst;
}; // Inst_VOP3
/**
 * Base class for the VOP3 encoding variant that carries an explicit
 * scalar destination field (SDST). Two instruction DWORDs.
 */
class Inst_VOP3_SDST_ENC : public GCN3GPUStaticInst
{
  public:
    Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode);
    ~Inst_VOP3_SDST_ENC();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_VOP3_SDST_ENC instData;
    // second instruction DWORD
    InFmt_VOP3_1 extData;

  private:
    bool hasSecondDword(InFmt_VOP3_SDST_ENC *);
}; // Inst_VOP3_SDST_ENC
/**
 * Base class for DS (data-share / LDS) instructions. The helpers below
 * move data between the wavefront's LDS chunk and the instruction's
 * d_data staging buffer, one lane at a time, honoring the exec mask.
 */
class Inst_DS : public GCN3GPUStaticInst
{
  public:
    Inst_DS(InFmt_DS*, const std::string &opcode);
    ~Inst_DS();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // read one value of type T per active lane from LDS at
    // addr[lane] + offset into d_data
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                    = wf->ldsChunk->read<T>(vaddr);
            }
        }
    }

    // read N consecutive dwords per active lane from LDS into d_data
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    (reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N + i]
                        = wf->ldsChunk->read<VecElemU32>(
                            vaddr + i*sizeof(VecElemU32));
                }
            }
        }
    }

    // two independent reads per lane (at offset0 and offset1); results
    // are interleaved in d_data as [lane*2] and [lane*2 + 1]
    template<typename T>
    void
    initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                    = wf->ldsChunk->read<T>(vaddr0);
                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                    = wf->ldsChunk->read<T>(vaddr1);
            }
        }
    }

    // write one value of type T per active lane from d_data to LDS at
    // addr[lane] + offset
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                wf->ldsChunk->write<T>(vaddr,
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
            }
        }
    }

    // write N consecutive dwords per active lane from d_data to LDS
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    wf->ldsChunk->write<VecElemU32>(
                        vaddr + i*sizeof(VecElemU32),
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]);
                }
            }
        }
    }

    // two independent writes per lane (at offset0 and offset1) from the
    // interleaved d_data layout used by initDualMemRead
    template<typename T>
    void
    initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * 2]);
                wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * 2 + 1]);
            }
        }
    }

    // latch each active lane's 32-bit address into the dyn inst.
    // NOTE(review): this uses wf->execMask(lane) where the helpers
    // above read gpuDynInst->exec_mask -- confirm the two are
    // equivalent at this point in execution.
    void
    calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                gpuDynInst->addr.at(lane) = (Addr)addr[lane];
            }
        }
    }

    // first instruction DWORD
    InFmt_DS instData;
    // second instruction DWORD
    InFmt_DS_1 extData;
}; // Inst_DS
/**
 * Base class for MUBUF (untyped buffer) instructions. Addresses are
 * formed through a buffer resource descriptor (see calcAddr); lanes
 * that fall outside the buffer are recorded in oobMask so that loads
 * return zero and stores are suppressed for those lanes.
 */
class Inst_MUBUF : public GCN3GPUStaticInst
{
  public:
    Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
    ~Inst_MUBUF();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // issue a read of one T per lane, skipping out-of-bounds lanes
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }


    // issue a read of N dwords per lane, skipping out-of-bounds lanes
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // issue a write of one T per lane, skipping out-of-bounds lanes
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // issue a write of N dwords per lane, skipping out-of-bounds lanes
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        // temporarily modify exec_mask to supress memory accesses to oob
        // regions.  Only issue memory requests for lanes that have their
        // exec_mask set and are not out of bounds.
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
    }

    // build a zero-length request and hand it to the compute unit as a
    // global memory fence
    void
    injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
    {
        // create request and set flags
        gpuDynInst->resetEntireStatusVector();
        gpuDynInst->setStatusVector(0, 1);
        RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                                   gpuDynInst->computeUnit()->
                                                   requestorId(), 0,
                                                   gpuDynInst->wfDynId);
        gpuDynInst->setRequestFlags(req);
        gpuDynInst->computeUnit()->
            injectGlobalMemFence(gpuDynInst, false, req);
    }

    /**
     * MUBUF insructions calculate their addresses as follows:
     *
     *     index  = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
     *     offset = (OFFEN ? vgpr_off : 0) + inst_off
     *
     *     / ====================== LINEAR ADDRESSING ====================== /
     *     VADDR = base + sgpr_off + offset + stride * index
     *
     *     / ===================== SWIZZLED ADDRESSING ===================== /
     *     index_msb  = index / const_index_stride
     *     index_lsb  = index % const_index_stride
     *     offset_msb = offset / const_element_size
     *     offset_lsb = offset % const_element_size
     *     buffer_offset = ((index_msb * stride + offset_msb *
     *                     const_element_size) * const_index_stride +
     *                     index_lsb * const_element_size + offset_lsb)
     *
     *     VADDR = base + sgpr_off + buffer_offset
     */
    template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
    void
    calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
             SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
    {
        Addr vaddr = 0;
        Addr base_addr = 0;
        Addr stride = 0;
        Addr buf_idx = 0;
        Addr buf_off = 0;
        Addr buffer_offset = 0;
        BufferRsrcDescriptor rsrc_desc;

        // reinterpret the raw SRF data as a buffer descriptor
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        base_addr = rsrc_desc.baseAddr;

        stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                                       + rsrc_desc.stride) : rsrc_desc.stride;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                vaddr = base_addr + s_offset.rawData();
                /**
                 * first we calculate the buffer's index and offset.
                 * these will be used for either linear or swizzled
                 * buffers.
                 */
                buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                buf_off = v_off[lane] + inst_offset;

                if (rsrc_desc.swizzleEn) {
                    Addr idx_stride = 8 << rsrc_desc.idxStride;
                    Addr elem_size = 2 << rsrc_desc.elemSize;
                    Addr idx_msb = buf_idx / idx_stride;
                    Addr idx_lsb = buf_idx % idx_stride;
                    Addr off_msb = buf_off / elem_size;
                    Addr off_lsb = buf_off % elem_size;
                    DPRINTF(GCN3, "mubuf swizzled lane %d: "
                            "idx_stride = %llx, elem_size = %llx, "
                            "idx_msb = %llx, idx_lsb = %llx, "
                            "off_msb = %llx, off_lsb = %llx\n",
                            lane, idx_stride, elem_size, idx_msb, idx_lsb,
                            off_msb, off_lsb);

                    buffer_offset =(idx_msb * stride + off_msb * elem_size)
                        * idx_stride + idx_lsb * elem_size + off_lsb;
                } else {
                    buffer_offset = buf_off + stride * buf_idx;
                }


                /**
                 * Range check behavior causes out of range accesses to
                 * to be treated differently. Out of range accesses return
                 * 0 for loads and are ignored for stores. For
                 * non-formatted accesses, this is done on a per-lane
                 * basis.
                 */
                if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                    if (buffer_offset >=
                        rsrc_desc.numRecords - s_offset.rawData()) {
                        DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
                                "lane = %d, buffer_offset = %llx, "
                                "const_stride = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off + stride * buf_idx,
                                rsrc_desc.stride, rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }

                if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                    if (buf_idx >= rsrc_desc.numRecords ||
                        buf_off >= stride) {
                        DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
                                "lane = %d, offset = %llx, "
                                "index = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off, buf_idx,
                                rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }

                vaddr += buffer_offset;

                DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
                        "vaddr = %llx, base_addr = %llx, "
                        "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                        lane, vaddr, base_addr, stride,
                        buf_idx, buf_off);
                gpuDynInst->addr.at(lane) = vaddr;
            }
        }
    }

    // first instruction DWORD
    InFmt_MUBUF instData;
    // second instruction DWORD
    InFmt_MUBUF_1 extData;
    // Mask of lanes with out-of-bounds accesses. Needs to be tracked
    // seperately from the exec_mask so that we remember to write zero
    // to the registers associated with out of bounds lanes.
    VectorMask oobMask;
}; // Inst_MUBUF
/**
 * Base class for instructions decoded from the two-DWORD MTBUF (typed
 * buffer) format.
 */
class Inst_MTBUF : public GCN3GPUStaticInst
{
  public:
    Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
    ~Inst_MTBUF();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_MTBUF instData;
    // second instruction DWORD
    InFmt_MTBUF_1 extData;

  private:
    bool hasSecondDword(InFmt_MTBUF *);
}; // Inst_MTBUF
/**
 * Base class for instructions decoded from the two-DWORD MIMG (image
 * memory) format.
 */
class Inst_MIMG : public GCN3GPUStaticInst
{
  public:
    Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
    ~Inst_MIMG();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_MIMG instData;
    // second instruction DWORD
    InFmt_MIMG_1 extData;
}; // Inst_MIMG
/**
 * Base class for instructions decoded from the two-DWORD EXP (export)
 * format.
 */
class Inst_EXP : public GCN3GPUStaticInst
{
  public:
    Inst_EXP(InFmt_EXP*, const std::string &opcode);
    ~Inst_EXP();

    int instSize() const override;
    void initOperandInfo() override;

  protected:
    // first instruction DWORD
    InFmt_EXP instData;
    // second instruction DWORD
    InFmt_EXP_1 extData;
}; // Inst_EXP
/**
 * Base class for FLAT instructions. Accesses dispatch on how the
 * address resolves (executedAs()): SC_GLOBAL requests go through the
 * memory-request helpers, while SC_GROUP accesses read/write the
 * wavefront's LDS chunk directly, per lane.
 */
class Inst_FLAT : public GCN3GPUStaticInst
{
  public:
    Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
    ~Inst_FLAT();

    int instSize() const override;
    void generateDisassembly() override;

    void initOperandInfo() override;

  protected:
    // read one T per active lane from global memory or LDS
    template<typename T>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }
    }

    // read N dwords per active lane from global memory or LDS
    template<int N>
    void
    initMemRead(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    for (int i = 0; i < N; ++i) {
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]
                            = wf->ldsChunk->read<VecElemU32>(
                                vaddr + i*sizeof(VecElemU32));
                    }
                }
            }
        }
    }

    // write one T per active lane to global memory or LDS
    template<typename T>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }
    }

    // write N dwords per active lane to global memory or LDS
    template<int N>
    void
    initMemWrite(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    for (int i = 0; i < N; ++i) {
                        wf->ldsChunk->write<VecElemU32>(
                            vaddr + i*sizeof(VecElemU32),
                            (reinterpret_cast<VecElemU32*>(
                                gpuDynInst->d_data))[lane * N + i]);
                    }
                }
            }
        }
    }

    // atomic access: global memory goes through a SwapReq; LDS applies
    // the atomic functor in place (read-modify-write) and returns the
    // old value in d_data
    template<typename T>
    void
    initAtomicAccess(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    auto amo_op =
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(
                                gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]);

                    T tmp = wf->ldsChunk->read<T>(vaddr);
                    (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
                    wf->ldsChunk->write<T>(vaddr, tmp);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
                }
            }
        }
    }

    // latch each active lane's 64-bit address, then resolve whether the
    // instruction targets global memory or LDS
    void
    calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr)
    {
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                gpuDynInst->addr.at(lane) = addr[lane];
            }
        }
        gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
    }

    // first instruction DWORD
    InFmt_FLAT instData;
    // second instruction DWORD
    InFmt_FLAT_1 extData;
}; // Inst_FLAT
} // namespace Gcn3ISA
} // namespace gem5
#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

View File

@@ -1,103 +0,0 @@
/*
* Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/gcn3/gpu_isa.hh"
#include <numeric>
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
namespace Gcn3ISA
{
// Tie the per-wavefront ISA state to its owning wavefront; M0 starts
// at zero.
GPUISA::GPUISA(Wavefront &wf) : wavefront(wf), m0(0)
{
}
/**
 * Read a miscellaneous (non-register-file) register selected by opIdx:
 * M0, the constant zero register, or the SCC status bit. Any other
 * selector is a fatal error.
 */
ScalarRegU32
GPUISA::readMiscReg(int opIdx) const
{
    if (opIdx == REG_M0) {
        return m0;
    } else if (opIdx == REG_ZERO) {
        return 0;
    } else if (opIdx == REG_SCC) {
        return statusReg.SCC;
    }

    fatal("attempting to read from unsupported or non-readable "
          "register. selector val: %i\n", opIdx);
    return 0;
}
/**
 * Write a miscellaneous (non-register-file) register selected by
 * opIdx: M0 or SCC (stored as 0/1). Any other selector is a fatal
 * error.
 */
void
GPUISA::writeMiscReg(int opIdx, ScalarRegU32 operandVal)
{
    if (opIdx == REG_M0) {
        m0 = operandVal;
    } else if (opIdx == REG_SCC) {
        // SCC is a single bit; any non-zero value sets it
        statusReg.SCC = operandVal ? 1 : 0;
    } else {
        fatal("attempting to write to an unsupported or non-writable "
              "register. selector val: %i\n", opIdx);
    }
}
/**
 * Advance the wavefront's PC past the given (variable-size)
 * instruction.
 */
void
GPUISA::advancePC(GPUDynInstPtr gpuDynInst)
{
    const int inst_bytes = gpuDynInst->staticInstruction()->instSize();
    wavefront.pc(wavefront.pc() + inst_bytes);
}
// Values of the positive inline-constant registers (1..64).
const std::array<const ScalarRegU32, NumPosConstRegs>
    GPUISA::posConstRegs = { {
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
    } };

// Values of the negative inline-constant registers (-1..-16).
const std::array<const ScalarRegI32, NumNegConstRegs>
    GPUISA::negConstRegs = { {
        -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
        -16
    } };
} // namespace Gcn3ISA
} // namespace gem5

View File

@@ -1,752 +0,0 @@
/*
* Copyright (c) 2017-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_GCN3_OPERAND_HH__
#define __ARCH_GCN3_OPERAND_HH__
#include <array>
#include "arch/amdgpu/gcn3/gpu_registers.hh"
#include "arch/generic/vec_reg.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
/**
* classes that represnt vector/scalar operands in GCN3 ISA. these classes
* wrap the generic vector register type (i.e., src/arch/generic/vec_reg.hh)
* and allow them to be manipulated in ways that are unique to GCN3 insts.
*/
namespace Gcn3ISA
{
/**
* convenience traits so we can automatically infer the correct FP type
* without looking at the number of dwords (i.e., to determine if we
* need a float or a double when creating FP constants).
*/
/**
 * Trait that selects the floating-point type wide enough to hold an
 * operand's inline FP constants: the 64b scalar register types map to
 * double, every other operand type maps to float.
 */
template<typename T> struct OpTraits { using FloatT = float; };
template<> struct OpTraits<ScalarRegF64> { using FloatT = double; };
template<> struct OpTraits<ScalarRegU64> { using FloatT = double; };
/**
 * Abstract base for all GCN3 instruction operands. It records which
 * instruction owns the operand and the raw op selector value from the
 * instruction encoding; concrete subclasses (VecOperand/ScalarOperand)
 * decide how that selector maps onto register file storage.
 */
class Operand
{
  public:
    Operand() = delete;

    /**
     * Bind this operand to its owning instruction and op selector.
     * Both are sanity-checked here; selector-to-register translation
     * is left to the subclasses.
     */
    Operand(GPUDynInstPtr gpuDynInst, int opIdx)
        : _gpuDynInst(gpuDynInst), _opIdx(opIdx)
    {
        assert(_gpuDynInst);
        assert(_opIdx >= 0);
    }

    /**
     * read from and write to the underlying register(s) that
     * this operand is referring to.
     */
    virtual void read() = 0;
    virtual void write() = 0;

  protected:
    /**
     * instruction object that owns this operand
     */
    GPUDynInstPtr _gpuDynInst;

    /**
     * op selector value for this operand. note that this is not
     * the same as the register file index, be it scalar or vector.
     * this could refer to inline constants, system regs, or even
     * special values.
     */
    int _opIdx;
};
// forward declaration; VecOperand embeds a ScalarOperand to handle the
// case where a VSRC selector actually names a scalar register/constant.
template<typename DataType, bool Const, size_t NumDwords>
class ScalarOperand;

/**
 * A vector (per-lane) operand of a GCN3 instruction. DataType is the
 * per-lane element type, Const marks read-only (source) operands, and
 * NumDwords is the operand width in 32b registers (derived from
 * DataType by default). Data is staged in a local vector-register copy
 * (vecReg) between the register file and the instruction's execute()
 * method.
 */
template<typename DataType, bool Const,
         size_t NumDwords = sizeof(DataType) / sizeof(VecElemU32)>
class VecOperand final : public Operand
{
    static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
                  "Incorrect number of DWORDS for GCN3 operand.");

  public:
    VecOperand() = delete;

    VecOperand(GPUDynInstPtr gpuDynInst, int opIdx)
        : Operand(gpuDynInst, opIdx), scalar(false), absMod(false),
          negMod(false), scRegData(gpuDynInst, _opIdx),
          vrfData{{ nullptr }}
    {
        // start from a clean staging register
        vecReg.zero();
    }

    ~VecOperand()
    {
    }

    /**
     * certain vector operands can read from the vrf/srf or constants.
     * we use this method to first determine the type of the operand,
     * then we read from the appropriate source. if vector we read
     * directly from the vrf. if scalar, we read in the data through
     * the scalar operand component. this should only be used for VSRC
     * operands.
     */
    void
    readSrc()
    {
        if (isVectorReg(_opIdx)) {
            // translate the op selector into a physical VGPR index
            // before reading from the VRF
            _opIdx = opSelectorToRegIdx(_opIdx, _gpuDynInst->wavefront()
                ->reservedScalarRegs);
            read();
        } else {
            readScalar();
        }
    }

    /**
     * read from the vrf. this should only be used by vector inst
     * source operands that are explicitly vector (i.e., VSRC).
     */
    void
    read() override
    {
        assert(_gpuDynInst);
        assert(_gpuDynInst->wavefront());
        assert(_gpuDynInst->computeUnit());
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        // grab a pointer to each underlying physical VGPR dword
        for (auto i = 0; i < NumDwords; ++i) {
            int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
            vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);

            DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
        }

        if (NumDwords == 1) {
            // single-dword operand: copy each lane's element directly
            assert(vrfData[0]);
            auto vgpr = vecReg.template as<DataType>();
            auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                std::memcpy((void*)&vgpr[lane],
                    (void*)&reg_file_vgpr[lane], sizeof(DataType));
            }
        } else if (NumDwords == 2) {
            // 64b operand: stitch each lane's value together from the
            // low (vrfData[0]) and high (vrfData[1]) dword registers
            assert(vrfData[0]);
            assert(vrfData[1]);
            auto vgpr = vecReg.template as<VecElemU64>();
            auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
            auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                VecElemU64 tmp_val(0);
                ((VecElemU32*)&tmp_val)[0] = reg_file_vgpr0[lane];
                ((VecElemU32*)&tmp_val)[1] = reg_file_vgpr1[lane];
                vgpr[lane] = tmp_val;
            }
        }
    }

    /**
     * write to the vrf. we maintain a copy of the underlying vector
     * reg(s) for this operand (i.e., vrfData/scRegData), as well as a
     * temporary vector register representation (i.e., vecReg) of the
     * vector register, which allows the execute() methods of instructions
     * to easily write their operand data using operator[] regardless of
     * their size. after the result is calculated we use write() to write
     * the data to the actual register file storage. this allows us to do
     * type conversion, etc., in a single call as opposed to doing it
     * in each execute() method.
     */
    void
    write() override
    {
        assert(_gpuDynInst);
        assert(_gpuDynInst->wavefront());
        assert(_gpuDynInst->computeUnit());
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();
        // loads commit under the exec mask captured by the instruction
        // itself; all other ops use the wavefront's current exec mask
        VectorMask &exec_mask = _gpuDynInst->isLoad()
            ? _gpuDynInst->exec_mask : wf->execMask();

        if (NumDwords == 1) {
            int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
            vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
            assert(vrfData[0]);
            auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
            auto vgpr = vecReg.template as<DataType>();

            // only lanes enabled by the exec mask (or all lanes, if the
            // instruction ignores exec) update the register file
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
                    std::memcpy((void*)&reg_file_vgpr[lane],
                        (void*)&vgpr[lane], sizeof(DataType));
                }
            }

            DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
        } else if (NumDwords == 2) {
            // 64b operand: split each lane's value back into its low
            // and high dword registers
            int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
            int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
            vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
            vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
            assert(vrfData[0]);
            assert(vrfData[1]);
            auto reg_file_vgpr0 = vrfData[0]->template as<VecElemU32>();
            auto reg_file_vgpr1 = vrfData[1]->template as<VecElemU32>();
            auto vgpr = vecReg.template as<VecElemU64>();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (exec_mask[lane] || _gpuDynInst->ignoreExec()) {
                    reg_file_vgpr0[lane] = ((VecElemU32*)&vgpr[lane])[0];
                    reg_file_vgpr1[lane] = ((VecElemU32*)&vgpr[lane])[1];
                }
            }

            DPRINTF(GPUVRF, "Write v[%d:%d]\n", vgprIdx0, vgprIdx1);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx0);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx1);
        }
    }

    // request that reads through operator[] negate the value
    void
    negModifier()
    {
        negMod = true;
    }

    // request that reads through operator[] take the absolute value
    void
    absModifier()
    {
        absMod = true;
    }

    /**
     * getter [] operator. only enable if this operand is constant
     * (i.e, a source operand) and if it can be represented using
     * primitive types (i.e., 8b to 64b primitives).
     */
    template<bool Condition = (NumDwords == 1 || NumDwords == 2) && Const>
    typename std::enable_if_t<Condition, const DataType>
    operator[](size_t idx) const
    {
        assert(idx < NumVecElemPerVecReg);

        if (scalar) {
            // scalar source: every lane observes the same value,
            // optionally run through the abs/neg modifiers
            DataType ret_val = scRegData.rawData();

            if (absMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = std::fabs(ret_val);
            }

            if (negMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = -ret_val;
            }

            return ret_val;
        } else {
            auto vgpr = vecReg.template as<DataType>();
            DataType ret_val = vgpr[idx];

            if (absMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = std::fabs(ret_val);
            }

            if (negMod) {
                assert(std::is_floating_point_v<DataType>);
                ret_val = -ret_val;
            }

            return ret_val;
        }
    }

    /**
     * setter [] operator. only enable if this operand is non-constant
     * (i.e, a destination operand) and if it can be represented using
     * primitive types (i.e., 8b to 64b primitives).
     */
    template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
    typename std::enable_if_t<Condition, DataType&>
    operator[](size_t idx)
    {
        // destinations are always vector registers, never scalars
        assert(!scalar);
        assert(idx < NumVecElemPerVecReg);

        return vecReg.template as<DataType>()[idx];
    }

  private:
    /**
     * if we determine that this operand is a scalar (reg or constant)
     * then we read the scalar data into the scalar operand data member.
     */
    void
    readScalar()
    {
        scalar = true;
        scRegData.read();
    }

    // staging register sized to hold DataType for every lane
    using VecRegCont =
        VecRegContainer<sizeof(DataType) * NumVecElemPerVecReg>;

    /**
     * whether this operand a scalar or not.
     */
    bool scalar;

    /**
     * absolute value and negative modifiers. VOP3 instructions
     * may indicate that their input/output operands must be
     * modified, either by taking the absolute value or negating
     * them. these bools indicate which modifier, if any, to use.
     */
    bool absMod;
    bool negMod;

    /**
     * this holds all the operand data in a single vector register
     * object (i.e., if an operand is 64b, this will hold the data
     * from both registers the operand is using).
     */
    VecRegCont vecReg;

    /**
     * for src operands that read scalars (i.e., scalar regs or
     * a scalar constant).
     */
    ScalarOperand<DataType, Const, NumDwords> scRegData;

    /**
     * pointers to the underlyding registers (i.e., the actual
     * registers in the register file).
     */
    std::array<VecRegContainerU32*, NumDwords> vrfData;
};
/**
 * A scalar operand of a GCN3 instruction. The op selector may name an
 * SGPR, a special register (EXEC, VCC, FLAT_SCRATCH, M0, ...), an
 * inline constant, or a literal; read()/write() dispatch accordingly.
 * Data is staged dword-by-dword in srfData between the register file
 * and the instruction's execute() method.
 */
template<typename DataType, bool Const,
         size_t NumDwords = sizeof(DataType) / sizeof(ScalarRegU32)>
class ScalarOperand final : public Operand
{
    static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords,
                  "Incorrect number of DWORDS for GCN3 operand.");

  public:
    ScalarOperand() = delete;

    ScalarOperand(GPUDynInstPtr gpuDynInst, int opIdx)
        : Operand(gpuDynInst, opIdx)
    {
        // start with zeroed staging storage
        std::memset(srfData.data(), 0, NumDwords * sizeof(ScalarRegU32));
    }

    ~ScalarOperand()
    {
    }

    /**
     * we store scalar data in a std::array, however if we need the
     * full operand data we use this method to copy all elements of
     * the scalar operand data to a single primitive container. only
     * useful for 8b to 64b primitive types, as they are the only types
     * that we need to perform computation on.
     */
    template<bool Condition = NumDwords == 1 || NumDwords == 2>
    typename std::enable_if_t<Condition, DataType>
    rawData() const
    {
        assert(sizeof(DataType) <= sizeof(srfData));
        DataType raw_data((DataType)0);
        std::memcpy((void*)&raw_data, (void*)srfData.data(),
            sizeof(DataType));

        return raw_data;
    }

    // raw pointer to the staged dwords; used when the caller needs to
    // move operand data in bulk (e.g., for wide operands)
    void*
    rawDataPtr()
    {
        return (void*)srfData.data();
    }

    /**
     * Read the operand into srfData: either dword-by-dword from the
     * SRF for real scalar registers, or via readSpecialVal() for
     * constants and special registers.
     */
    void
    read() override
    {
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        if (!isScalarReg(_opIdx)) {
            readSpecialVal();
        } else {
            for (auto i = 0; i < NumDwords; ++i) {
                int sgprIdx = regIdx(i);
                srfData[i] = cu->srf[wf->simdId]->read(sgprIdx);

                DPRINTF(GPUSRF, "Read s[%d]\n", sgprIdx);
                cu->srf[wf->simdId]->printReg(wf, sgprIdx);
            }
        }
    }

    /**
     * Write the staged operand back: EXEC writes update the wavefront's
     * exec mask directly, other non-SGPR selectors go through the misc
     * register interface, and real SGPRs are written dword-by-dword to
     * the SRF (loads pull their data from the instruction instead).
     */
    void
    write() override
    {
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        if (!isScalarReg(_opIdx)) {
            if (_opIdx == REG_EXEC_LO) {
                // start from the current mask so a 1-dword write only
                // replaces the low half
                ScalarRegU64 new_exec_mask_val
                    = wf->execMask().to_ullong();
                if (NumDwords == 1) {
                    std::memcpy((void*)&new_exec_mask_val,
                        (void*)srfData.data(), sizeof(VecElemU32));
                } else if (NumDwords == 2) {
                    std::memcpy((void*)&new_exec_mask_val,
                        (void*)srfData.data(), sizeof(VecElemU64));
                } else {
                    panic("Trying to write more than 2 DWORDS to EXEC\n");
                }
                VectorMask new_exec_mask(new_exec_mask_val);
                wf->execMask() = new_exec_mask;
                DPRINTF(GPUSRF, "Write EXEC\n");
                DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
            } else if (_opIdx == REG_EXEC_HI) {
                /**
                 * If we're writing only the upper half of the EXEC mask
                 * this ought to be a single dword operand.
                 */
                assert(NumDwords == 1);
                ScalarRegU32 new_exec_mask_hi_val(0);
                ScalarRegU64 new_exec_mask_val
                    = wf->execMask().to_ullong();
                std::memcpy((void*)&new_exec_mask_hi_val,
                    (void*)srfData.data(), sizeof(new_exec_mask_hi_val));
                // splice the new upper 32b into the existing mask
                replaceBits(new_exec_mask_val, 63, 32,
                            new_exec_mask_hi_val);
                VectorMask new_exec_mask(new_exec_mask_val);
                wf->execMask() = new_exec_mask;
                DPRINTF(GPUSRF, "Write EXEC\n");
                DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val);
            } else {
                _gpuDynInst->writeMiscReg(_opIdx, srfData[0]);
            }
        } else {
            for (auto i = 0; i < NumDwords; ++i) {
                int sgprIdx = regIdx(i);
                auto &sgpr = cu->srf[wf->simdId]->readWriteable(sgprIdx);
                if (_gpuDynInst->isLoad()) {
                    // loads deposit data fetched by the instruction
                    assert(sizeof(DataType) <= sizeof(ScalarRegU64));
                    sgpr = reinterpret_cast<ScalarRegU32*>(
                        _gpuDynInst->scalar_data)[i];
                } else {
                    sgpr = srfData[i];
                }
                DPRINTF(GPUSRF, "Write s[%d]\n", sgprIdx);
                cu->srf[wf->simdId]->printReg(wf, sgprIdx);
            }
        }
    }

    /**
     * bit access to scalar data. primarily used for setting vcc bits.
     */
    template<bool Condition = NumDwords == 1 || NumDwords == 2>
    typename std::enable_if_t<Condition, void>
    setBit(int bit, int bit_val)
    {
        DataType &sgpr = *((DataType*)srfData.data());
        replaceBits(sgpr, bit, bit_val);
    }

    // assignment into the staged operand data; only enabled for
    // writable (destination) operands of primitive size
    template<bool Condition = (NumDwords == 1 || NumDwords == 2) && !Const>
    typename std::enable_if_t<Condition, ScalarOperand&>
    operator=(DataType rhs)
    {
        std::memcpy((void*)srfData.data(), (void*)&rhs, sizeof(DataType));
        return *this;
    }

  private:
    /**
     * we have determined that we are not reading our scalar operand data
     * from the register file, so here we figure out which special value
     * we are reading (i.e., float constant, int constant, inline
     * constant, or various other system registers (e.g., exec mask).
     */
    void
    readSpecialVal()
    {
        assert(NumDwords == 1 || NumDwords == 2);

        switch(_opIdx) {
          case REG_EXEC_LO:
            {
                if (NumDwords == 1) {
                    // 1-dword read observes only the low half of EXEC
                    ScalarRegU32 exec_mask = _gpuDynInst->wavefront()->
                        execMask().to_ulong();
                    std::memcpy((void*)srfData.data(), (void*)&exec_mask,
                        sizeof(exec_mask));
                    DPRINTF(GPUSRF, "Read EXEC\n");
                    DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
                } else {
                    assert(NumDwords == 2);
                    ScalarRegU64 exec_mask = _gpuDynInst->wavefront()->
                        execMask().to_ullong();
                    std::memcpy((void*)srfData.data(), (void*)&exec_mask,
                        sizeof(exec_mask));
                    DPRINTF(GPUSRF, "Read EXEC\n");
                    DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask);
                }
            }
            break;
          case REG_EXEC_HI:
            {
                /**
                 * If we're reading only the upper half of the EXEC mask
                 * this ought to be a single dword operand.
                 */
                assert(NumDwords == 1);
                ScalarRegU64 exec_mask = _gpuDynInst->wavefront()
                    ->execMask().to_ullong();

                ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32);
                std::memcpy((void*)srfData.data(), (void*)&exec_mask_hi,
                    sizeof(exec_mask_hi));
                DPRINTF(GPUSRF, "Read EXEC_HI\n");
                DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi);
            }
            break;
          case REG_SRC_SWDA:
          case REG_SRC_DPP:
          case REG_SRC_LITERAL:
            // literal/modifier selectors carry their value in the
            // instruction stream
            assert(NumDwords == 1);
            srfData[0] = _gpuDynInst->srcLiteral();
            break;
          case REG_POS_HALF:
            {
                // inline FP constants; OpTraits picks float vs. double
                typename OpTraits<DataType>::FloatT pos_half = 0.5;
                std::memcpy((void*)srfData.data(), (void*)&pos_half,
                    sizeof(pos_half));
            }
            break;
          case REG_NEG_HALF:
            {
                typename OpTraits<DataType>::FloatT neg_half = -0.5;
                std::memcpy((void*)srfData.data(), (void*)&neg_half,
                    sizeof(neg_half));
            }
            break;
          case REG_POS_ONE:
            {
                typename OpTraits<DataType>::FloatT pos_one = 1.0;
                std::memcpy(srfData.data(), &pos_one, sizeof(pos_one));
            }
            break;
          case REG_NEG_ONE:
            {
                typename OpTraits<DataType>::FloatT neg_one = -1.0;
                std::memcpy(srfData.data(), &neg_one, sizeof(neg_one));
            }
            break;
          case REG_POS_TWO:
            {
                typename OpTraits<DataType>::FloatT pos_two = 2.0;
                std::memcpy(srfData.data(), &pos_two, sizeof(pos_two));
            }
            break;
          case REG_NEG_TWO:
            {
                typename OpTraits<DataType>::FloatT neg_two = -2.0;
                std::memcpy(srfData.data(), &neg_two, sizeof(neg_two));
            }
            break;
          case REG_POS_FOUR:
            {
                typename OpTraits<DataType>::FloatT pos_four = 4.0;
                std::memcpy(srfData.data(), &pos_four, sizeof(pos_four));
            }
            break;
          case REG_NEG_FOUR:
            {
                typename OpTraits<DataType>::FloatT neg_four = -4.0;
                std::memcpy((void*)srfData.data(), (void*)&neg_four ,
                    sizeof(neg_four));
            }
            break;
          case REG_PI:
            {
                // 1/(2*pi) encodings: pick the 32b or 64b bit pattern
                // to match the operand's width
                assert(sizeof(DataType) == sizeof(ScalarRegF64)
                    || sizeof(DataType) == sizeof(ScalarRegF32));

                const ScalarRegU32 pi_u32(0x3e22f983UL);
                const ScalarRegU64 pi_u64(0x3fc45f306dc9c882ULL);

                if (sizeof(DataType) == sizeof(ScalarRegF64)) {
                    std::memcpy((void*)srfData.data(),
                        (void*)&pi_u64, sizeof(pi_u64));
                } else {
                    std::memcpy((void*)srfData.data(),
                        (void*)&pi_u32, sizeof(pi_u32));
                }
            }
            break;
          default:
            {
                // remaining selectors are inline integer constants or
                // misc registers read through the instruction
                assert(sizeof(DataType) <= sizeof(srfData));
                DataType misc_val(0);
                if (isConstVal(_opIdx)) {
                    misc_val = (DataType)_gpuDynInst
                        ->readConstVal<DataType>(_opIdx);
                } else {
                    misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx);
                }
                std::memcpy((void*)srfData.data(), (void*)&misc_val,
                    sizeof(DataType));
            }
        }
    }

    /**
     * for scalars we need to do some extra work to figure out how to
     * map the op selector to the sgpr idx because some op selectors
     * do not map directly to the srf (i.e., vcc/flat_scratch).
     */
    int
    regIdx(int dword) const
    {
        Wavefront *wf = _gpuDynInst->wavefront();
        ComputeUnit *cu = _gpuDynInst->computeUnit();
        int sgprIdx(-1);

        // VCC and FLAT_SCRATCH occupy the highest reserved SGPRs; see
        // opSelectorToRegIdx() for the layout
        if (_opIdx == REG_VCC_HI) {
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 1 + dword);
        } else if (_opIdx == REG_VCC_LO) {
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
        } else if (_opIdx == REG_FLAT_SCRATCH_HI) {
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
        } else if (_opIdx == REG_FLAT_SCRATCH_LO) {
            assert(NumDwords == 1);
            sgprIdx = cu->registerManager
                ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
        } else {
            sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
        }

        assert(sgprIdx > -1);

        return sgprIdx;
    }

    /**
     * in GCN3 each register is represented as a 32b unsigned value,
     * however operands may require up to 16 registers, so we store
     * all the individual 32b components here. for sub-dword operand
     * we still consider them to be 1 dword because the minimum size
     * of a register is 1 dword. this class will take care to do the
     * proper packing/unpacking of sub-dword operands.
     */
    std::array<ScalarRegU32, NumDwords> srfData;
};
// typedefs for the various sizes/types of scalar operands.
// the trailing template argument, where present, overrides the
// default dword count (sub-dword types still occupy 1 dword; the
// U128/U256/U512 forms span 4/8/16 dwords of U32 registers).
using ScalarOperandU8 = ScalarOperand<ScalarRegU8, false, 1>;
using ScalarOperandI8 = ScalarOperand<ScalarRegI8, false, 1>;
using ScalarOperandU16 = ScalarOperand<ScalarRegU16, false, 1>;
using ScalarOperandI16 = ScalarOperand<ScalarRegI16, false, 1>;
using ScalarOperandU32 = ScalarOperand<ScalarRegU32, false>;
using ScalarOperandI32 = ScalarOperand<ScalarRegI32, false>;
using ScalarOperandF32 = ScalarOperand<ScalarRegF32, false>;
using ScalarOperandU64 = ScalarOperand<ScalarRegU64, false>;
using ScalarOperandI64 = ScalarOperand<ScalarRegI64, false>;
using ScalarOperandF64 = ScalarOperand<ScalarRegF64, false>;
using ScalarOperandU128 = ScalarOperand<ScalarRegU32, false, 4>;
using ScalarOperandU256 = ScalarOperand<ScalarRegU32, false, 8>;
using ScalarOperandU512 = ScalarOperand<ScalarRegU32, false, 16>;
// non-writeable versions of scalar operands (Const = true enables the
// read-only accessors only)
using ConstScalarOperandU8 = ScalarOperand<ScalarRegU8, true, 1>;
using ConstScalarOperandI8 = ScalarOperand<ScalarRegI8, true, 1>;
using ConstScalarOperandU16 = ScalarOperand<ScalarRegU16, true, 1>;
using ConstScalarOperandI16 = ScalarOperand<ScalarRegI16, true, 1>;
using ConstScalarOperandU32 = ScalarOperand<ScalarRegU32, true>;
using ConstScalarOperandI32 = ScalarOperand<ScalarRegI32, true>;
using ConstScalarOperandF32 = ScalarOperand<ScalarRegF32, true>;
using ConstScalarOperandU64 = ScalarOperand<ScalarRegU64, true>;
using ConstScalarOperandI64 = ScalarOperand<ScalarRegI64, true>;
using ConstScalarOperandF64 = ScalarOperand<ScalarRegF64, true>;
using ConstScalarOperandU128 = ScalarOperand<ScalarRegU32, true, 4>;
using ConstScalarOperandU256 = ScalarOperand<ScalarRegU32, true, 8>;
using ConstScalarOperandU512 = ScalarOperand<ScalarRegU32, true, 16>;
// typedefs for the various sizes/types of vector operands
using VecOperandU8 = VecOperand<VecElemU8, false, 1>;
using VecOperandI8 = VecOperand<VecElemI8, false, 1>;
using VecOperandU16 = VecOperand<VecElemU16, false, 1>;
using VecOperandI16 = VecOperand<VecElemI16, false, 1>;
using VecOperandU32 = VecOperand<VecElemU32, false>;
using VecOperandI32 = VecOperand<VecElemI32, false>;
using VecOperandF32 = VecOperand<VecElemF32, false>;
using VecOperandU64 = VecOperand<VecElemU64, false>;
using VecOperandF64 = VecOperand<VecElemF64, false>;
using VecOperandI64 = VecOperand<VecElemI64, false>;
using VecOperandU96 = VecOperand<VecElemU32, false, 3>;
using VecOperandU128 = VecOperand<VecElemU32, false, 4>;
using VecOperandU256 = VecOperand<VecElemU32, false, 8>;
using VecOperandU512 = VecOperand<VecElemU32, false, 16>;
// non-writeable versions of vector operands
using ConstVecOperandU8 = VecOperand<VecElemU8, true, 1>;
using ConstVecOperandI8 = VecOperand<VecElemI8, true, 1>;
using ConstVecOperandU16 = VecOperand<VecElemU16, true, 1>;
using ConstVecOperandI16 = VecOperand<VecElemI16, true, 1>;
using ConstVecOperandU32 = VecOperand<VecElemU32, true>;
using ConstVecOperandI32 = VecOperand<VecElemI32, true>;
using ConstVecOperandF32 = VecOperand<VecElemF32, true>;
using ConstVecOperandU64 = VecOperand<VecElemU64, true>;
using ConstVecOperandI64 = VecOperand<VecElemI64, true>;
using ConstVecOperandF64 = VecOperand<VecElemF64, true>;
using ConstVecOperandU96 = VecOperand<VecElemU32, true, 3>;
using ConstVecOperandU128 = VecOperand<VecElemU32, true, 4>;
using ConstVecOperandU256 = VecOperand<VecElemU32, true, 8>;
using ConstVecOperandU512 = VecOperand<VecElemU32, true, 16>;
}
} // namespace gem5
#endif // __ARCH_GCN3_OPERAND_HH__

View File

@@ -1,241 +0,0 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/gcn3/gpu_registers.hh"
namespace gem5
{
namespace Gcn3ISA
{
/**
 * Translate an operand selector value into its assembly-language
 * register symbol (e.g., "s0", "v[2:3]", "vcc_lo", or an inline
 * constant). numRegs gives the operand width in registers and is used
 * to print multi-register ranges.
 */
std::string
opSelectorToRegSym(int idx, int numRegs)
{
    // scalar GPRs: s<n>, or s[<lo>:<hi>] for multi-register operands
    if (idx <= REG_SGPR_MAX) {
        if (numRegs > 1) {
            return "s[" + std::to_string(idx) + ":"
                + std::to_string(idx + numRegs - 1) + "]";
        }
        return "s" + std::to_string(idx);
    }

    // vector GPRs: v<n>, or v[<lo>:<hi>] for multi-register operands
    if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) {
        int vgpr = idx - REG_VGPR_MIN;
        if (numRegs > 1) {
            return "v[" + std::to_string(vgpr) + ":"
                + std::to_string(vgpr + numRegs - 1) + "]";
        }
        return "v" + std::to_string(vgpr);
    }

    // positive inline integer constants
    if (idx >= REG_INT_CONST_POS_MIN && idx <= REG_INT_CONST_POS_MAX) {
        return std::to_string(idx - REG_INT_CONST_POS_MIN + 1);
    }

    // negative inline integer constants
    if (idx >= REG_INT_CONST_NEG_MIN && idx <= REG_INT_CONST_NEG_MAX) {
        return std::to_string(-1 - (idx - REG_INT_CONST_NEG_MIN));
    }

    // named special registers and inline FP constants
    switch (idx) {
      case REG_FLAT_SCRATCH_LO:
        return "flat_scratch_lo";
      case REG_FLAT_SCRATCH_HI:
        return "flat_scratch_hi";
      case REG_VCC_LO:
        return "vcc_lo";
      case REG_VCC_HI:
        return "vcc_hi";
      case REG_M0:
        return "m0";
      case REG_EXEC_LO:
        return "exec";
      case REG_ZERO:
        return "0";
      case REG_POS_HALF:
        return "0.5";
      case REG_NEG_HALF:
        return "-0.5";
      case REG_POS_ONE:
        return "1";
      case REG_NEG_ONE:
        return "-1";
      case REG_POS_TWO:
        return "2";
      case REG_NEG_TWO:
        return "-2";
      case REG_POS_FOUR:
        return "4";
      case REG_NEG_FOUR:
        return "-4";
      default:
        fatal("GCN3 ISA instruction has unknown register index %u\n", idx);
    }

    // not reached; fatal() terminates the simulation
    return std::string();
}
/**
 * Map an operand selector value onto a physical register file index.
 * SGPR and VGPR selectors map directly (VGPRs are offset by the start
 * of the VGPR selector range); VCC and FLAT_SCRATCH alias the highest
 * reserved SGPRs. Returns -1 for selectors that do not name a
 * register-file entry.
 */
int
opSelectorToRegIdx(int idx, int numScalarRegs)
{
    // plain SGPR selector: identity mapping
    if (idx <= REG_SGPR_MAX) {
        return idx;
    }

    // VGPR selector: remove the VGPR aperture offset
    if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) {
        return idx - REG_VGPR_MIN;
    }

    switch (idx) {
      case REG_VCC_LO:
        /**
         * the VCC register occupies the two highest numbered
         * SRF entries. VCC is typically indexed by specifying
         * VCC_LO (simply called VCC) in the instruction encoding
         * and reading it as a 64b value so we only return the
         * index to the lower half of the VCC register.
         *
         * VCC_LO = s[NUM_SGPRS - 2]
         * VCC_HI = s[NUM_SGPRS - 1]
         */
        return numScalarRegs - 2;
      case REG_VCC_HI:
        return numScalarRegs - 1;
      case REG_FLAT_SCRATCH_LO:
        /**
         * the FLAT_SCRATCH register occupies the two SRF entries
         * just below VCC. FLAT_SCRATCH is typically indexed by
         * specifying FLAT_SCRATCH_LO (simply called FLAT_SCRATCH)
         * in the instruction encoding and reading it as a 64b value
         * so we only return the index to the lower half of the
         * FLAT_SCRATCH register.
         *
         * FLAT_SCRATCH_LO = s[NUM_SGPRS - 4]
         * FLAT_SCRATCH_HI = s[NUM_SGPRS - 3]
         */
        return numScalarRegs - 4;
      case REG_FLAT_SCRATCH_HI:
        return numScalarRegs - 3;
      default:
        // selector does not correspond to an SRF/VRF entry
        return -1;
    }
}
// true if the selector names a positive inline integer constant
bool
isPosConstVal(int opIdx)
{
    return opIdx >= REG_INT_CONST_POS_MIN
        && opIdx <= REG_INT_CONST_POS_MAX;
}
// true if the selector names a negative inline integer constant
bool
isNegConstVal(int opIdx)
{
    return opIdx >= REG_INT_CONST_NEG_MIN
        && opIdx <= REG_INT_CONST_NEG_MAX;
}
// true if the selector names any inline integer constant
bool
isConstVal(int opIdx)
{
    return isPosConstVal(opIdx) || isNegConstVal(opIdx);
}
// true if the selector indicates a literal constant taken from the
// instruction stream
bool
isLiteral(int opIdx)
{
    return opIdx == REG_SRC_LITERAL;
}
// true if the selector names either half of the EXEC mask
bool
isExecMask(int opIdx)
{
    return opIdx == REG_EXEC_LO || opIdx == REG_EXEC_HI;
}
// true if the selector names either half of the VCC register
bool
isVccReg(int opIdx)
{
    return opIdx == REG_VCC_LO || opIdx == REG_VCC_HI;
}
// true if the selector names either half of FLAT_SCRATCH
bool
isFlatScratchReg(int opIdx)
{
    return opIdx == REG_FLAT_SCRATCH_LO || opIdx == REG_FLAT_SCRATCH_HI;
}
bool
isScalarReg(int opIdx)
{
// FLAT_SCRATCH and VCC are stored in an SGPR pair
if (opIdx <= REG_SGPR_MAX || opIdx == REG_FLAT_SCRATCH_LO ||
opIdx == REG_FLAT_SCRATCH_HI || opIdx == REG_VCC_LO ||
opIdx == REG_VCC_HI) {
return true;
}
return false;
}
// true if the selector falls within the VGPR selector range
bool
isVectorReg(int opIdx)
{
    return opIdx >= REG_VGPR_MIN && opIdx <= REG_VGPR_MAX;
}
} // namespace Gcn3ISA
} // namespace gem5

View File

@@ -49,11 +49,31 @@ Source('tlb_coalescer.cc')
DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker')
if env['CONF']['TARGET_GPU_ISA'] == 'vega':
Source('decoder.cc')
Source('gpu_decoder.cc')
Source('insts/gpu_static_inst.cc')
Source('insts/instructions.cc')
Source('insts/op_encodings.cc')
Source('isa.cc')
Source('registers.cc')
Source('gpu_isa.cc')
Source('gpu_registers.cc')
Source('insts/sop2.cc')
Source('insts/sopk.cc')
Source('insts/sop1.cc')
Source('insts/sopc.cc')
Source('insts/sopp.cc')
Source('insts/smem.cc')
Source('insts/vop2.cc')
Source('insts/vop1.cc')
Source('insts/vopc.cc')
Source('insts/vinterp.cc')
Source('insts/vop3.cc')
Source('insts/vop3_cmp.cc')
Source('insts/ds.cc')
Source('insts/mubuf.cc')
Source('insts/mtbuf.cc')
Source('insts/mimg.cc')
Source('insts/exp.cc')
Source('insts/flat.cc')
Source('insts/vop3p.cc')
Source('insts/vop3p_mai.cc')
DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA')

View File

@@ -325,6 +325,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
@@ -470,6 +471,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);
@@ -508,6 +510,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_ADD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUB_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_PACK_B32_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst);
GPUStaticInst* decode_OP_DS__DS_ADD_U32(MachInst);
GPUStaticInst* decode_OP_DS__DS_SUB_U32(MachInst);
GPUStaticInst* decode_OP_DS__DS_RSUB_U32(MachInst);
@@ -698,6 +701,9 @@ namespace VegaISA
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_XOR(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_INC(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_DEC(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MIN_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MAX_F64(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2(MachInst);
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_X2(MachInst);
@@ -746,6 +752,11 @@ namespace VegaISA
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_XOR(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_INC(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_DEC(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_PK_ADD_F16(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_SWAP_X2(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_CMPSWAP_X2(MachInst);
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_X2(MachInst);
@@ -1279,6 +1290,7 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP1__V_FREXP_MANT_F32(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CLREXCP(MachInst);
GPUStaticInst* decode_OP_VOP1__V_SCREEN_PARTITION_4SE_B32(MachInst);
GPUStaticInst* decode_OP_VOP1__V_MOV_B64(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CVT_F16_U16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CVT_F16_I16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_CVT_U16_F16(MachInst);
@@ -1303,6 +1315,7 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP1__V_CVT_NORM_U16_F16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_SAT_PK_U8_I16(MachInst);
GPUStaticInst* decode_OP_VOP1__V_SWAP_B32(MachInst);
GPUStaticInst* decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst);
GPUStaticInst* decode_OP_VOP2__V_CNDMASK_B32(MachInst);
GPUStaticInst* decode_OP_VOP2__V_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP2__V_SUB_F32(MachInst);
@@ -1585,6 +1598,65 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X1_2B_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X1_4B_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X1_16B_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X2_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X8_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X16_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X4_2B_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X4_4B_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_4X4X4_16B_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X8_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X16_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X8_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X16_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_BF16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_I32_16X16X64_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_I32_32X32X32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4_F64(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_4X4X4_4B_F64(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_BF8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_FP8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst);
GPUStaticInst* subDecode_OPU_VOP3(MachInst);
GPUStaticInst* subDecode_OP_DS(MachInst);
GPUStaticInst* subDecode_OP_FLAT(MachInst);
@@ -1642,7 +1714,7 @@ namespace VegaISA
struct InFmt_FLAT {
unsigned int OFFSET : 13;
unsigned int LDS : 1;
unsigned int SVE : 1;
unsigned int SEG : 2;
unsigned int GLC : 1;
unsigned int SLC : 1;
@@ -1908,7 +1980,27 @@ namespace VegaISA
unsigned int NEG : 3;
};
union InstFormat {
struct InFmt_VOP3P_MAI
{
unsigned int VDST : 8;
unsigned int CBSZ : 3;
unsigned int ABID : 4;
unsigned int ACC_CD : 1;
unsigned int OP : 7;
unsigned int ENCODING : 9;
};
struct InFmt_VOP3P_MAI_1
{
unsigned int SRC0 : 9;
unsigned int SRC1 : 9;
unsigned int SRC2 : 9;
unsigned int ACC : 2;
unsigned int BLGP : 3;
};
union InstFormat
{
InFmt_DS iFmt_DS;
InFmt_DS_1 iFmt_DS_1;
InFmt_EXP iFmt_EXP;
@@ -1941,6 +2033,8 @@ namespace VegaISA
InFmt_VOP_SDWAB iFmt_VOP_SDWAB;
InFmt_VOP3P iFmt_VOP3P;
InFmt_VOP3P_1 iFmt_VOP3P_1;
InFmt_VOP3P_MAI iFmt_VOP3P_MAI;
InFmt_VOP3P_MAI_1 iFmt_VOP3P_MAI_1;
uint32_t imm_u32;
float imm_f32;
}; // union InstFormat

View File

@@ -89,6 +89,18 @@ namespace VegaISA
case REG_ZERO:
reg_sym = "0";
break;
case REG_SHARED_BASE:
reg_sym = "src_shared_base";
break;
case REG_SHARED_LIMIT:
reg_sym = "src_shared_limit";
break;
case REG_PRIVATE_BASE:
reg_sym = "src_private_base";
break;
case REG_PRIVATE_LIMIT:
reg_sym = "src_private_limit";
break;
case REG_POS_HALF:
reg_sym = "0.5";
break;

View File

@@ -106,10 +106,10 @@ namespace VegaISA
REG_RESERVED_25 = 232,
REG_RESERVED_26 = 233,
REG_RESERVED_27 = 234,
REG_RESERVED_28 = 235,
REG_RESERVED_29 = 236,
REG_RESERVED_30 = 237,
REG_RESERVED_31 = 238,
REG_SHARED_BASE = 235,
REG_SHARED_LIMIT = 236,
REG_PRIVATE_BASE = 237,
REG_PRIVATE_LIMIT = 238,
REG_RESERVED_32 = 239,
REG_POS_HALF = 240,
REG_NEG_HALF = 241,
@@ -129,7 +129,7 @@ namespace VegaISA
REG_LDS_DIRECT = 254,
REG_SRC_LITERAL = 255,
REG_VGPR_MIN = 256,
REG_VGPR_MAX = 511
REG_VGPR_MAX = 767
};
constexpr size_t MaxOperandDwords(16);

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
* Copyright (c) 2024 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,31 +29,30 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/gpu_decoder.hh"
#include "arch/amdgpu/gcn3/insts/instructions.hh"
#include "debug/GPUExec.hh"
#include "gpu-compute/shader.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
namespace gem5
{
namespace Gcn3ISA
namespace VegaISA
{
GCN3GPUStaticInst::GCN3GPUStaticInst(const std::string &opcode)
: GPUStaticInst(opcode), _srcLiteral(0)
{
}
// --- Inst_EXP__EXP class methods ---
GCN3GPUStaticInst::~GCN3GPUStaticInst()
Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt)
: Inst_EXP(iFmt, "exp")
{
}
} // Inst_EXP__EXP
Inst_EXP__EXP::~Inst_EXP__EXP()
{
} // ~Inst_EXP__EXP
// --- description from .arch file ---
// Export through SX.
void
GCN3GPUStaticInst::panicUnimplemented() const
Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst)
{
fatal("Encountered unimplemented GCN3 instruction: %s\n", _opcode);
}
} // namespace Gcn3ISA
panicUnimplemented();
} // execute
} // namespace VegaISA
} // namespace gem5

File diff suppressed because it is too large Load Diff

View File

@@ -35,6 +35,7 @@
#include <cmath>
#include "arch/amdgpu/vega/gpu_registers.hh"
#include "arch/amdgpu/vega/insts/gpu_static_inst.hh"
namespace gem5
{
@@ -315,7 +316,8 @@ namespace VegaISA
* 0x142: broadcast 15th thread of each row to next row
* 0x143: broadcast thread 31 to rows 2 and 3
*/
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
inline int
dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
int rowOffset, bool & outOfBounds)
{
// local variables
@@ -699,7 +701,7 @@ namespace VegaISA
if (sel < SDWA_WORD_0) { // we are selecting 1 byte
// if we sign extended depends on upper-most bit of byte 0
signExt = (signExt &&
(bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80));
(bits(currDstVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
for (int byte = 0; byte < 4; ++byte) {
low_bit = byte * VegaISA::BITS_PER_BYTE;
@@ -712,7 +714,7 @@ namespace VegaISA
3. byte > sel && signExt: we're sign extending and
this byte is one of the bytes we need to sign extend
*/
origBits_thisByte = bits(origDstVal, high_bit, low_bit);
origBits_thisByte = bits(origDstVal, VegaISA::MSB_PER_BYTE, 0);
currBits_thisByte = bits(currDstVal, high_bit, low_bit);
newBits = ((byte == sel) ? origBits_thisByte :
((preserve) ? currBits_thisByte :
@@ -737,7 +739,7 @@ namespace VegaISA
3. word > (sel & 1) && signExt: we're sign extending and
this word is one of the words we need to sign extend
*/
origBits_thisWord = bits(origDstVal, high_bit, low_bit);
origBits_thisWord = bits(origDstVal, VegaISA::MSB_PER_WORD, 0);
currBits_thisWord = bits(currDstVal, high_bit, low_bit);
newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
((preserve) ? currBits_thisWord :

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More