dev-amdgpu,gpu-compute,configs: MI300X (#1141)

Release of MI300X simulation capability:

- Implements the required MI300X features over MI200 (currently only
architecture flat scratch).
- Make the gpu-compute model use MI200 features when MI300X / gfx942 is
configured.
- Fix up the scratch_ instructions, which seem to be preferred over
buffer_ in debug hipcc builds.
- Add mi300.py config similar to mi200.py. This config can optionally
use resources instead of command line args.
This commit is contained in:
Matthew Poremba
2024-05-17 09:26:04 -07:00
committed by GitHub
16 changed files with 371 additions and 71 deletions

View File

@@ -0,0 +1,172 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
""" This file creates an X86 system with a KVM CPU and GPU device capable of
running the MI300 ISA (gfx942). Most of this file sets up a runscript which
will load in a binary, shell script, or python file from the host and run that
within gem5. Jump to line 146 for list of system parameters to configure.
"""
import argparse
import base64
import os
import sys
import tempfile
from typing import Optional
import runfs
from amd import AmdGPUOptions
from common import (
GPUTLBOptions,
Options,
)
from ruby import Ruby
import m5
from gem5.resources.resource import AbstractResource
# Shell runscript template executed inside the simulated system. The four
# "{}" placeholders are filled via str.format() with: the app path (echoed),
# the app options (echoed), the base64-encoded app binary (recreated as
# ./myapp inside the guest), and the app options again (passed to ./myapp).
# The dd line loads the GPU VBIOS ROM into memory before the driver probes.
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
cat /proc/cpuinfo
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""
# Variant of the demo runscript that takes a gem5 checkpoint (via
# "/sbin/m5 checkpoint") after driver setup but immediately before running
# the application, so later runs can skip boot and driver initialization.
# The four "{}" placeholders are filled via str.format() with: the app path
# (echoed), the app options (echoed), the base64-encoded app binary
# (recreated as ./myapp in the guest), and the app options again.
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
def addDemoOptions(parser):
    """Register the demo-specific command line options on ``parser``.

    Adds:
        -a / --app:  path of the GPU application to run (default: None)
        -o / --opts: argument string passed to the application (default: "")
    """
    demo_flags = [
        ("-a", "--app", None, "GPU application to run"),
        ("-o", "--opts", "", "GPU application arguments"),
    ]
    for short_opt, long_opt, default_val, desc in demo_flags:
        parser.add_argument(short_opt, long_opt, default=default_val, help=desc)
def runMI300GPUFS(
    cpu_type,
    disk: Optional[AbstractResource] = None,
    kernel: Optional[AbstractResource] = None,
    app: Optional[AbstractResource] = None,
):
    """Configure and run a full-system MI300X (gfx942) GPU simulation.

    Builds the argument parser shared with the other GPUFS configs, embeds
    the GPU application in a generated runscript, forces the MI300X device
    defaults, and hands off to runfs.runGpuFSSystem().

    Parameters:
        cpu_type: gem5 CPU model for the host CPU (e.g. "X86KvmCPU").
        disk: optional resource overriding the --disk-image argument.
        kernel: optional resource overriding the --kernel argument.
        app: optional resource overriding the -a/--app argument.
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Resources, when provided, take precedence over command line args.
    if disk is not None:
        args.disk_image = disk.get_local_path()
    if kernel is not None:
        args.kernel = kernel.get_local_path()
    if app is not None:
        args.app = app.get_local_path()

    # Create temp script to run application. Guard against args.app being
    # None (no -a flag and no app resource), which os.path.isfile rejects.
    if args.app is None or not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # Embed the application binary as base64 in the runscript so it can be
    # reconstructed inside the simulated system.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)
    args.script = tempRunscript

    # Defaults for CPU. Honor the requested CPU model instead of silently
    # ignoring the cpu_type parameter.
    args.cpu_type = cpu_type
    args.mem_size = "8GB"

    # Defaults for MI300X
    args.gpu_device = "MI300X"
    args.dgpu_mem_size = "16GB"  # GPU memory size, must be 16GB currently.

    # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html
    # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded
    # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache.
    args.num_compute_units = 40
    args.gpu_topology = "Crossbar"

    # Run gem5
    runfs.runGpuFSSystem(args)
if __name__ == "__m5_main__":
runMI300GPUFS("X86KvmCPU")

View File

@@ -134,9 +134,9 @@ def addRunFSOptions(parser):
parser.add_argument(
"--gpu-device",
default="Vega10",
choices=["Vega10", "MI100", "MI200"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
"MI200 (gfx90a)",
choices=["Vega10", "MI100", "MI200", "MI300X"],
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
"(gfx90a), or MI300X (gfx942).",
)
parser.add_argument(

View File

@@ -191,10 +191,14 @@ def connectGPU(system, args):
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "MI300X":
system.pc.south_bridge.gpu.DeviceID = 0x740F
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
elif args.gpu_device == "Vega10":
system.pc.south_bridge.gpu.DeviceID = 0x6863
else:
panic(f"Unknown GPU device: {args.gpu_device}")
m5.util.panic(f"Unknown GPU device: {args.gpu_device}")
# Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is
# a PCI capabilities list to traverse.

View File

@@ -161,7 +161,7 @@ def makeGpuFSSystem(args):
0x7D000,
]
sdma_sizes = [0x1000] * 8
elif args.gpu_device == "MI200":
elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
num_sdmas = 5
sdma_bases = [
0x4980,

View File

@@ -9922,29 +9922,25 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
@@ -9977,29 +9973,25 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT);
}
GPUStaticInst*

View File

@@ -1258,13 +1258,12 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
calcAddrVgpr(gpuDynInst, vbase, offset);
} else {
} else if (isFlatGlobal()) {
// Assume we are operating in 64-bit mode and read a pair of
// SGPRs for the address base.
ConstScalarOperandU64 sbase(gpuDynInst, saddr);
@@ -1274,6 +1273,57 @@ namespace VegaISA
voffset.read();
calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
// For scratch, saddr = 0x7f there is no scalar reg to read and
// a vgpr will be used for address offset. Otherwise, saddr is
// the sgpr index holding the address offset. For scratch
// instructions the offset GPR is always 32-bits.
} else if (saddr != 0x7f) {
assert(isFlatScratch());
ConstScalarOperandU32 soffset(gpuDynInst, saddr);
soffset.read();
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
elemSize = staticInst->getOperandSize(2);
} else {
assert(gpuDynInst->isStore());
elemSize = staticInst->getOperandSize(1);
}
unsigned swizzleOffset = soffset.rawData() + offset;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(swizzleOffset, lane, elemSize);
}
}
} else {
assert(isFlatScratch());
ConstVecOperandU32 voffset(gpuDynInst, vaddr);
voffset.read();
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
elemSize = staticInst->getOperandSize(2);
} else {
assert(gpuDynInst->isStore());
elemSize = staticInst->getOperandSize(1);
}
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(voffset[lane] + offset, lane, elemSize);
}
}
}
if (isFlat()) {
@@ -1285,6 +1335,7 @@ namespace VegaISA
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
}
}
@@ -1421,6 +1472,23 @@ namespace VegaISA
}
}
}
VecElemU32
swizzle(VecElemU32 offset, int lane, int elem_size)
{
// This is not described in the spec. We use the swizzle from
// buffer memory instructions and fix the stride to 4. Multiply
// the thread ID by the storage size to avoid threads clobbering
// their data.
return ((offset / 4) * 4 * 64)
+ (offset % 4) + (lane * elem_size);
}
Addr
readFlatScratch(GPUDynInstPtr gpuDynInst)
{
return gpuDynInst->computeUnit()->shader->getScratchBase();
}
}; // Inst_FLAT
} // namespace VegaISA
} // namespace gem5

View File

@@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
gfx_version = GfxVersion::gfx908;
} else if (p.device_name == "MI200") {
gfx_version = GfxVersion::gfx90a;
} else if (p.device_name == "MI300X") {
gfx_version = GfxVersion::gfx942;
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
@@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
} else if (p.device_name == "MI100" || p.device_name == "MI200") {
} else if (p.device_name == "MI100" || p.device_name == "MI200"
|| p.device_name == "MI300X") {
sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else if (p.device_name == "MI300X") {
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else {
panic("Unknown GPU device %s\n", p.device_name);
}

View File

@@ -328,8 +328,8 @@ typedef struct GEM5_PACKED
};
uint64_t completionSignal;
};
} PM4MapProcessMI200;
static_assert(sizeof(PM4MapProcessMI200) == 80);
} PM4MapProcessV2;
static_assert(sizeof(PM4MapProcessV2) == 80);
typedef struct GEM5_PACKED
{

View File

@@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
dmaBuffer);
} break;
case IT_MAP_PROCESS: {
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
dmaBuffer = new PM4MapProcessMI200();
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
gpuDevice->getGfxVersion() == GfxVersion::gfx942) {
dmaBuffer = new PM4MapProcessV2();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
{ mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2),
cb, dmaBuffer);
} else {
dmaBuffer = new PM4MapProcess();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
{ mapProcessV1(q, (PM4MapProcess *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
dmaBuffer);
}
@@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
}
void
PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
{
q->incRptr(sizeof(PM4MapProcess));
@@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
}
void
PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
{
q->incRptr(sizeof(PM4MapProcessMI200));
q->incRptr(sizeof(PM4MapProcessV2));
DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
"%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,

View File

@@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
void doneMQDWrite(Addr mqdAddr, Addr addr);
void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt);
void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt);
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
uint16_t vmid);
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,

View File

@@ -45,7 +45,7 @@ class PrefetchType(Enum):
class GfxVersion(ScopedEnum):
vals = ["gfx900", "gfx902", "gfx908", "gfx90a"]
vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"]
class PoolManager(SimObject):

View File

@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
* #flat-addressing
*/
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 4);
uint32_t offset =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 3);
uint32_t size =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
addr[lane] = addr[lane] + lane * size + offset +
wavefront()->computeUnit->shader->getHiddenPrivateBase() -
wavefront()->computeUnit->shader->getScratchBase();
ComputeUnit *cu = wavefront()->computeUnit;
if (wavefront()->gfxVersion == GfxVersion::gfx942) {
// Architected flat scratch base address in FLAT_SCRATCH registers
uint32_t fs_lo = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_LO);
uint32_t fs_hi = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_HI);
Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
// The scratch base is added for other gfx versions,
// otherwise this would simply add the register base.
addr[lane] = addr[lane] - cu->shader->getScratchBase()
+ arch_flat_scratch;
}
}
} else {
// In absolute flat scratch the program needs to place scratch
// address in SGPRn-3,4.
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
physSgprIdx =
cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
uint32_t size = cu->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
addr[lane] = addr[lane] + lane * size + offset +
cu->shader->getHiddenPrivateBase() -
cu->shader->getScratchBase();
}
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
wavefront()->execUnitId = wavefront()->flatLmUnitId;
// For FLAT the local memory pipe counters are incremented, but they
// are not incremented for explicit scratch_* instructions. Only
// decrement these counters if we are explicitly a FLAT instruction.
if (isFlat()) {
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}
}
} else {
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {

View File

@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
{
return _flags[MemoryRef] && (_flags[GlobalSegment] ||
_flags[PrivateSegment] || _flags[ReadOnlySegment] ||
_flags[SpillSegment] || _flags[FlatGlobal]);
_flags[SpillSegment] || _flags[FlatGlobal] ||
_flags[FlatScratch]);
}
bool

View File

@@ -94,9 +94,10 @@ class HSAQueueEntry
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
// #code-object-v3-kernel-descriptor
//
// Currently, the only supported gfx version in gem5 that computes
// VGPR count differently is gfx90a.
if (gfx_version == GfxVersion::gfx90a) {
// Currently, the only supported gfx versions in gem5 that compute
// VGPR count differently are gfx90a and gfx942.
if (gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
} else {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
@@ -107,7 +108,8 @@ class HSAQueueEntry
if (gfx_version == GfxVersion::gfx900 ||
gfx_version == GfxVersion::gfx902 ||
gfx_version == GfxVersion::gfx908 ||
gfx_version == GfxVersion::gfx90a) {
gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
} else {
panic("Saw unknown gfx version setting up GPR counts\n");

View File

@@ -118,6 +118,7 @@ void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
int regInitIdx = 0;
gfxVersion = task->gfxVersion();
// Iterate over all the init fields and check which
// bits are enabled. Useful information can be found here:
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
break;
case PrivSegWaveByteOffset:
// For architected flat scratch, this enable is reused to set
// the FLAT_SCRATCH register pair to the scratch backing
// memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
if (task->gfxVersion() == GfxVersion::gfx942) {
Addr arch_flat_scratch =
task->amdQueue.scratch_backing_memory_location;
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_HI,
bits(arch_flat_scratch, 63, 32));
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_LO,
bits(arch_flat_scratch, 31, 0));
break;
}
// Not architected flat scratch. Write the scratch wavefront
// offset: https://llvm.org/docs/AMDGPUUsage.html
// #amdgpu-amdhsa-initial-kernel-execution-state
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
/**
* the compute_tmpring_size_wavesize specifies the number of
* kB allocated per wavefront, hence the multiplication by
@@ -442,7 +464,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
// Default to false and set to true for gem5 supported ISAs.
bool packed_work_item_id = false;
if (task->gfxVersion() == GfxVersion::gfx90a) {
if (task->gfxVersion() == GfxVersion::gfx90a ||
task->gfxVersion() == GfxVersion::gfx942) {
packed_work_item_id = true;
}

View File

@@ -92,6 +92,8 @@ class Wavefront : public SimObject
S_BARRIER
};
// gfx version wavefront is executing
GfxVersion gfxVersion;
// HW slot id where the WF is mapped to inside a SIMD unit
const int wfSlotId;
int kernId;