dev-amdgpu,gpu-compute,configs: MI300X (#1141)
Release of MI300X simulation capability: - Implements the required MI300X features over MI200 (currently only architected flat scratch). - Make the gpu-compute model use MI200 features when MI300X / gfx942 is configured. - Fix up the scratch_ instructions, which seem to be preferred over buffer_ in debug hipcc builds. - Add mi300.py config similar to mi200.py. This config can optionally use resources instead of command line args.
This commit is contained in:
172
configs/example/gpufs/mi300.py
Normal file
172
configs/example/gpufs/mi300.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# Copyright (c) 2024 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from this
|
||||
# software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
""" This file creates an X86 system with a KVM CPU and GPU device capable of
|
||||
running the MI300 ISA (gfx942). Most of this file sets up a runscript which
|
||||
will load in a binary, shell script, or python file from the host and run that
|
||||
within gem5. Jump to line 146 for list of system parameters to configure.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from typing import Optional
|
||||
|
||||
import runfs
|
||||
from amd import AmdGPUOptions
|
||||
from common import (
|
||||
GPUTLBOptions,
|
||||
Options,
|
||||
)
|
||||
from ruby import Ruby
|
||||
|
||||
import m5
|
||||
|
||||
from gem5.resources.resource import AbstractResource
|
||||
|
||||
# Guest-side runscript used when no checkpoint is requested. The format()
# placeholders are filled, in order, with: application name, application
# arguments (echo only), the base64-encoded application binary, and the
# application arguments passed to the decoded binary. Note the MI200 ROM
# image is copied into the legacy VGA ROM region before the amdgpu driver
# is loaded (reused here for the MI300X configuration).
demo_runscript_without_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
cat /proc/cpuinfo
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
./myapp {}
/sbin/m5 exit
"""

# Same runscript as above, except a gem5 checkpoint is taken immediately
# before the application is launched (and /proc/cpuinfo is not dumped).
demo_runscript_with_checkpoint = """\
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export HSA_ENABLE_INTERRUPT=0
export HCC_AMDGPU_TARGET=gfx942
export HSA_OVERRIDE_GFX_VERSION="9.4.2"
dmesg -n8
dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128
if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp
/sbin/m5 checkpoint
./myapp {}
/sbin/m5 exit
"""
|
||||
|
||||
|
||||
def addDemoOptions(parser):
    """Register the demo-specific application flags on *parser*."""
    demo_flags = (
        (("-a", "--app"), dict(default=None, help="GPU application to run")),
        (("-o", "--opts"), dict(default="", help="GPU application arguments")),
    )
    for flags, kwargs in demo_flags:
        parser.add_argument(*flags, **kwargs)
|
||||
|
||||
|
||||
def runMI300GPUFS(
    cpu_type,
    disk: Optional[AbstractResource] = None,
    kernel: Optional[AbstractResource] = None,
    app: Optional[AbstractResource] = None,
):
    """Configure and run a full-system MI300X (gfx942) GPU simulation.

    Parameters:
        cpu_type: Requested CPU model. NOTE(review): currently unused --
            the config forces "X86KvmCPU" below; confirm whether other
            CPU models are intended to be supported.
        disk: Optional disk image resource; overrides --disk-image.
        kernel: Optional kernel resource; overrides --kernel.
        app: Optional GPU application resource; overrides -a/--app.
    """
    parser = argparse.ArgumentParser()
    runfs.addRunFSOptions(parser)
    Options.addCommonOptions(parser)
    AmdGPUOptions.addAmdGPUOptions(parser)
    Ruby.define_options(parser)
    GPUTLBOptions.tlb_options(parser)
    addDemoOptions(parser)

    # Parse now so we can override options
    args = parser.parse_args()
    demo_runscript = ""

    # Resources, when provided, take precedence over command line args.
    if disk is not None:
        args.disk_image = disk.get_local_path()
    if kernel is not None:
        args.kernel = kernel.get_local_path()
    if app is not None:
        args.app = app.get_local_path()

    # The application must exist on the host; it is transferred into the
    # guest via the runscript. Guard against args.app being None (neither
    # -a/--app nor an app resource given), which would otherwise make
    # os.path.isfile() raise TypeError instead of reporting the error.
    if args.app is None or not os.path.isfile(args.app):
        print("Could not find application", args.app)
        sys.exit(1)

    # Choose runscript based on whether any checkpointing args are set
    if args.checkpoint_dir is not None:
        demo_runscript = demo_runscript_with_checkpoint
    else:
        demo_runscript = demo_runscript_without_checkpoint

    # The binary is shipped into the guest base64-encoded inside the
    # runscript, then decoded and executed there.
    with open(os.path.abspath(args.app), "rb") as binfile:
        encodedBin = base64.b64encode(binfile.read()).decode()

    _, tempRunscript = tempfile.mkstemp()
    with open(tempRunscript, "w") as b64file:
        runscriptStr = demo_runscript.format(
            args.app, args.opts, encodedBin, args.opts
        )
        b64file.write(runscriptStr)

    args.script = tempRunscript

    # Defaults for CPU
    args.cpu_type = "X86KvmCPU"
    args.mem_size = "8GB"

    # Defaults for MI300X
    args.gpu_device = "MI300X"
    args.dgpu_mem_size = "16GB"  # GPU memory size, must be 16GB currently.

    # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html
    # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded
    # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache.
    args.num_compute_units = 40
    args.gpu_topology = "Crossbar"

    # Run gem5
    runfs.runGpuFSSystem(args)
|
||||
|
||||
|
||||
# gem5 executes configuration scripts with __name__ set to "__m5_main__"
# rather than "__main__", so this is the entry point when the script is
# run through the gem5 binary.
if __name__ == "__m5_main__":
    runMI300GPUFS("X86KvmCPU")
|
||||
@@ -134,9 +134,9 @@ def addRunFSOptions(parser):
|
||||
parser.add_argument(
|
||||
"--gpu-device",
|
||||
default="Vega10",
|
||||
choices=["Vega10", "MI100", "MI200"],
|
||||
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or "
|
||||
"MI200 (gfx90a)",
|
||||
choices=["Vega10", "MI100", "MI200", "MI300X"],
|
||||
help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
|
||||
"(gfx90a), or MI300X (gfx942).",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
|
||||
@@ -191,10 +191,14 @@ def connectGPU(system, args):
|
||||
system.pc.south_bridge.gpu.DeviceID = 0x740F
|
||||
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
|
||||
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
|
||||
elif args.gpu_device == "MI300X":
|
||||
system.pc.south_bridge.gpu.DeviceID = 0x740F
|
||||
system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002
|
||||
system.pc.south_bridge.gpu.SubsystemID = 0x0C34
|
||||
elif args.gpu_device == "Vega10":
|
||||
system.pc.south_bridge.gpu.DeviceID = 0x6863
|
||||
else:
|
||||
panic(f"Unknown GPU device: {args.gpu_device}")
|
||||
m5.util.panic(f"Unknown GPU device: {args.gpu_device}")
|
||||
|
||||
# Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is
|
||||
# a PCI capabilities list to traverse.
|
||||
|
||||
@@ -161,7 +161,7 @@ def makeGpuFSSystem(args):
|
||||
0x7D000,
|
||||
]
|
||||
sdma_sizes = [0x1000] * 8
|
||||
elif args.gpu_device == "MI200":
|
||||
elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
|
||||
num_sdmas = 5
|
||||
sdma_bases = [
|
||||
0x4980,
|
||||
|
||||
@@ -9922,29 +9922,25 @@ namespace VegaISA
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
@@ -9977,29 +9973,25 @@ namespace VegaISA
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
|
||||
@@ -1258,13 +1258,12 @@ namespace VegaISA
|
||||
// If saddr = 0x7f there is no scalar reg to read and address will
|
||||
// be a 64-bit address. Otherwise, saddr is the reg index for a
|
||||
// scalar reg used as the base address for a 32-bit address.
|
||||
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|
||||
|| isFlat()) {
|
||||
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
|
||||
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
|
||||
vbase.read();
|
||||
|
||||
calcAddrVgpr(gpuDynInst, vbase, offset);
|
||||
} else {
|
||||
} else if (isFlatGlobal()) {
|
||||
// Assume we are operating in 64-bit mode and read a pair of
|
||||
// SGPRs for the address base.
|
||||
ConstScalarOperandU64 sbase(gpuDynInst, saddr);
|
||||
@@ -1274,6 +1273,57 @@ namespace VegaISA
|
||||
voffset.read();
|
||||
|
||||
calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
|
||||
// For scratch, saddr = 0x7f there is no scalar reg to read and
|
||||
// a vgpr will be used for address offset. Otherwise, saddr is
|
||||
// the sgpr index holding the address offset. For scratch
|
||||
// instructions the offset GPR is always 32-bits.
|
||||
} else if (saddr != 0x7f) {
|
||||
assert(isFlatScratch());
|
||||
|
||||
ConstScalarOperandU32 soffset(gpuDynInst, saddr);
|
||||
soffset.read();
|
||||
|
||||
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
|
||||
|
||||
int elemSize;
|
||||
auto staticInst = gpuDynInst->staticInstruction();
|
||||
if (gpuDynInst->isLoad()) {
|
||||
elemSize = staticInst->getOperandSize(2);
|
||||
} else {
|
||||
assert(gpuDynInst->isStore());
|
||||
elemSize = staticInst->getOperandSize(1);
|
||||
}
|
||||
|
||||
unsigned swizzleOffset = soffset.rawData() + offset;
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
gpuDynInst->addr.at(lane) = flat_scratch_addr
|
||||
+ swizzle(swizzleOffset, lane, elemSize);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(isFlatScratch());
|
||||
|
||||
ConstVecOperandU32 voffset(gpuDynInst, vaddr);
|
||||
voffset.read();
|
||||
|
||||
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
|
||||
|
||||
int elemSize;
|
||||
auto staticInst = gpuDynInst->staticInstruction();
|
||||
if (gpuDynInst->isLoad()) {
|
||||
elemSize = staticInst->getOperandSize(2);
|
||||
} else {
|
||||
assert(gpuDynInst->isStore());
|
||||
elemSize = staticInst->getOperandSize(1);
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
gpuDynInst->addr.at(lane) = flat_scratch_addr
|
||||
+ swizzle(voffset[lane] + offset, lane, elemSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (isFlat()) {
|
||||
@@ -1285,6 +1335,7 @@ namespace VegaISA
|
||||
assert(isFlatScratch());
|
||||
gpuDynInst->staticInstruction()->executed_as =
|
||||
enums::SC_PRIVATE;
|
||||
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1421,6 +1472,23 @@ namespace VegaISA
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VecElemU32
|
||||
swizzle(VecElemU32 offset, int lane, int elem_size)
|
||||
{
|
||||
// This is not described in the spec. We use the swizzle from
|
||||
// buffer memory instructions and fix the stride to 4. Multiply
|
||||
// the thread ID by the storage size to avoid threads clobbering
|
||||
// their data.
|
||||
return ((offset / 4) * 4 * 64)
|
||||
+ (offset % 4) + (lane * elem_size);
|
||||
}
|
||||
|
||||
Addr
|
||||
readFlatScratch(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
return gpuDynInst->computeUnit()->shader->getScratchBase();
|
||||
}
|
||||
}; // Inst_FLAT
|
||||
} // namespace VegaISA
|
||||
} // namespace gem5
|
||||
|
||||
@@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
gfx_version = GfxVersion::gfx908;
|
||||
} else if (p.device_name == "MI200") {
|
||||
gfx_version = GfxVersion::gfx90a;
|
||||
} else if (p.device_name == "MI300X") {
|
||||
gfx_version = GfxVersion::gfx942;
|
||||
} else {
|
||||
panic("Unknown GPU device %s\n", p.device_name);
|
||||
}
|
||||
@@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
|
||||
sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
|
||||
sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
|
||||
} else if (p.device_name == "MI100" || p.device_name == "MI200") {
|
||||
} else if (p.device_name == "MI100" || p.device_name == "MI200"
|
||||
|| p.device_name == "MI300X") {
|
||||
sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
|
||||
sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
|
||||
sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
|
||||
@@ -195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
|
||||
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
|
||||
setRegVal(MI200_MEM_SIZE_REG, mem_size);
|
||||
} else if (p.device_name == "MI300X") {
|
||||
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
|
||||
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
|
||||
setRegVal(MI200_MEM_SIZE_REG, mem_size);
|
||||
} else {
|
||||
panic("Unknown GPU device %s\n", p.device_name);
|
||||
}
|
||||
|
||||
@@ -328,8 +328,8 @@ typedef struct GEM5_PACKED
|
||||
};
|
||||
uint64_t completionSignal;
|
||||
};
|
||||
} PM4MapProcessMI200;
|
||||
static_assert(sizeof(PM4MapProcessMI200) == 80);
|
||||
} PM4MapProcessV2;
|
||||
static_assert(sizeof(PM4MapProcessV2) == 80);
|
||||
|
||||
typedef struct GEM5_PACKED
|
||||
{
|
||||
|
||||
@@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
|
||||
dmaBuffer);
|
||||
} break;
|
||||
case IT_MAP_PROCESS: {
|
||||
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
|
||||
dmaBuffer = new PM4MapProcessMI200();
|
||||
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
|
||||
gpuDevice->getGfxVersion() == GfxVersion::gfx942) {
|
||||
dmaBuffer = new PM4MapProcessV2();
|
||||
cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &)
|
||||
{ mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
|
||||
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
|
||||
{ mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); });
|
||||
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2),
|
||||
cb, dmaBuffer);
|
||||
} else {
|
||||
dmaBuffer = new PM4MapProcess();
|
||||
cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &)
|
||||
{ mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
|
||||
{ mapProcessV1(q, (PM4MapProcess *)dmaBuffer); });
|
||||
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
|
||||
dmaBuffer);
|
||||
}
|
||||
@@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
|
||||
}
|
||||
|
||||
void
|
||||
PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
|
||||
PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
|
||||
{
|
||||
q->incRptr(sizeof(PM4MapProcess));
|
||||
|
||||
@@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
|
||||
}
|
||||
|
||||
void
|
||||
PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
|
||||
PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
|
||||
{
|
||||
q->incRptr(sizeof(PM4MapProcessMI200));
|
||||
q->incRptr(sizeof(PM4MapProcessV2));
|
||||
|
||||
DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
|
||||
"%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
|
||||
|
||||
@@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice
|
||||
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
|
||||
void doneMQDWrite(Addr mqdAddr, Addr addr);
|
||||
void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
|
||||
void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
|
||||
void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
|
||||
void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt);
|
||||
void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt);
|
||||
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
|
||||
uint16_t vmid);
|
||||
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
|
||||
|
||||
@@ -45,7 +45,7 @@ class PrefetchType(Enum):
|
||||
|
||||
|
||||
class GfxVersion(ScopedEnum):
|
||||
vals = ["gfx900", "gfx902", "gfx908", "gfx90a"]
|
||||
vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"]
|
||||
|
||||
|
||||
class PoolManager(SimObject):
|
||||
|
||||
@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
|
||||
* #flat-addressing
|
||||
*/
|
||||
|
||||
uint32_t numSgprs = wavefront()->maxSgprs;
|
||||
uint32_t physSgprIdx =
|
||||
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
|
||||
numSgprs - 4);
|
||||
uint32_t offset =
|
||||
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
|
||||
physSgprIdx =
|
||||
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
|
||||
numSgprs - 3);
|
||||
uint32_t size =
|
||||
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
|
||||
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
|
||||
if (mask[lane]) {
|
||||
addr[lane] = addr[lane] + lane * size + offset +
|
||||
wavefront()->computeUnit->shader->getHiddenPrivateBase() -
|
||||
wavefront()->computeUnit->shader->getScratchBase();
|
||||
ComputeUnit *cu = wavefront()->computeUnit;
|
||||
|
||||
if (wavefront()->gfxVersion == GfxVersion::gfx942) {
|
||||
// Architected flat scratch base address in FLAT_SCRATCH registers
|
||||
uint32_t fs_lo = cu->srf[simdId]->read(
|
||||
VegaISA::REG_FLAT_SCRATCH_LO);
|
||||
uint32_t fs_hi = cu->srf[simdId]->read(
|
||||
VegaISA::REG_FLAT_SCRATCH_HI);
|
||||
|
||||
Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
|
||||
|
||||
for (int lane = 0; lane < cu->wfSize(); ++lane) {
|
||||
if (mask[lane]) {
|
||||
// The scratch base is added for other gfx versions,
|
||||
// otherwise this would simply add the register base.
|
||||
addr[lane] = addr[lane] - cu->shader->getScratchBase()
|
||||
+ arch_flat_scratch;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// In absolute flat scratch the program needs to place scratch
|
||||
// address in SGPRn-3,4.
|
||||
uint32_t numSgprs = wavefront()->maxSgprs;
|
||||
uint32_t physSgprIdx =
|
||||
cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
|
||||
uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
|
||||
physSgprIdx =
|
||||
cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
|
||||
uint32_t size = cu->srf[simdId]->read(physSgprIdx);
|
||||
|
||||
|
||||
for (int lane = 0; lane < cu->wfSize(); ++lane) {
|
||||
if (mask[lane]) {
|
||||
addr[lane] = addr[lane] + lane * size + offset +
|
||||
cu->shader->getHiddenPrivateBase() -
|
||||
cu->shader->getScratchBase();
|
||||
}
|
||||
}
|
||||
}
|
||||
wavefront()->execUnitId = wavefront()->flatLmUnitId;
|
||||
wavefront()->decLGKMInstsIssued();
|
||||
if (isLoad()) {
|
||||
wavefront()->rdLmReqsInPipe--;
|
||||
} else if (isStore()) {
|
||||
wavefront()->wrLmReqsInPipe--;
|
||||
} else if (isAtomic() || isMemSync()) {
|
||||
wavefront()->wrLmReqsInPipe--;
|
||||
wavefront()->rdLmReqsInPipe--;
|
||||
} else {
|
||||
panic("Invalid memory operation!\n");
|
||||
|
||||
wavefront()->execUnitId = wavefront()->flatLmUnitId;
|
||||
|
||||
// For FLAT the local memory pipe counters are incremented, but they
|
||||
// are not incremented for explicit scratch_* instructions. Only
|
||||
// decrement these counters if we are explicitly a FLAT instruction.
|
||||
if (isFlat()) {
|
||||
wavefront()->decLGKMInstsIssued();
|
||||
if (isLoad()) {
|
||||
wavefront()->rdLmReqsInPipe--;
|
||||
} else if (isStore()) {
|
||||
wavefront()->wrLmReqsInPipe--;
|
||||
} else if (isAtomic() || isMemSync()) {
|
||||
wavefront()->wrLmReqsInPipe--;
|
||||
wavefront()->rdLmReqsInPipe--;
|
||||
} else {
|
||||
panic("Invalid memory operation!\n");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
|
||||
|
||||
@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
{
|
||||
return _flags[MemoryRef] && (_flags[GlobalSegment] ||
|
||||
_flags[PrivateSegment] || _flags[ReadOnlySegment] ||
|
||||
_flags[SpillSegment] || _flags[FlatGlobal]);
|
||||
_flags[SpillSegment] || _flags[FlatGlobal] ||
|
||||
_flags[FlatScratch]);
|
||||
}
|
||||
|
||||
bool
|
||||
|
||||
@@ -94,9 +94,10 @@ class HSAQueueEntry
|
||||
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
|
||||
// #code-object-v3-kernel-descriptor
|
||||
//
|
||||
// Currently, the only supported gfx version in gem5 that computes
|
||||
// VGPR count differently is gfx90a.
|
||||
if (gfx_version == GfxVersion::gfx90a) {
|
||||
// Currently, the only supported gfx versions in gem5 that compute
|
||||
// VGPR count differently are gfx90a and gfx942.
|
||||
if (gfx_version == GfxVersion::gfx90a ||
|
||||
gfx_version == GfxVersion::gfx942) {
|
||||
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
|
||||
} else {
|
||||
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
|
||||
@@ -107,7 +108,8 @@ class HSAQueueEntry
|
||||
if (gfx_version == GfxVersion::gfx900 ||
|
||||
gfx_version == GfxVersion::gfx902 ||
|
||||
gfx_version == GfxVersion::gfx908 ||
|
||||
gfx_version == GfxVersion::gfx90a) {
|
||||
gfx_version == GfxVersion::gfx90a ||
|
||||
gfx_version == GfxVersion::gfx942) {
|
||||
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
|
||||
} else {
|
||||
panic("Saw unknown gfx version setting up GPR counts\n");
|
||||
|
||||
@@ -118,6 +118,7 @@ void
|
||||
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
|
||||
{
|
||||
int regInitIdx = 0;
|
||||
gfxVersion = task->gfxVersion();
|
||||
|
||||
// Iterate over all the init fields and check which
|
||||
// bits are enabled. Useful information can be found here:
|
||||
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
|
||||
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
|
||||
break;
|
||||
case PrivSegWaveByteOffset:
|
||||
|
||||
// For architected flat scratch, this enable is reused to set
|
||||
// the FLAT_SCRATCH register pair to the scratch backing
|
||||
// memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
|
||||
if (task->gfxVersion() == GfxVersion::gfx942) {
|
||||
Addr arch_flat_scratch =
|
||||
task->amdQueue.scratch_backing_memory_location;
|
||||
computeUnit->srf[simdId]->write(
|
||||
VegaISA::REG_FLAT_SCRATCH_HI,
|
||||
bits(arch_flat_scratch, 63, 32));
|
||||
computeUnit->srf[simdId]->write(
|
||||
VegaISA::REG_FLAT_SCRATCH_LO,
|
||||
bits(arch_flat_scratch, 31, 0));
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// Not architected flat scratch. Write the scratch wavefront
|
||||
// offset: https://llvm.org/docs/AMDGPUUsage.html
|
||||
// #amdgpu-amdhsa-initial-kernel-execution-state
|
||||
physSgprIdx =
|
||||
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
||||
|
||||
/**
|
||||
* the compute_tmpring_size_wavesize specifies the number of
|
||||
* kB allocated per wavefront, hence the multiplication by
|
||||
@@ -442,7 +464,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
|
||||
// Default to false and set to true for gem5 supported ISAs.
|
||||
bool packed_work_item_id = false;
|
||||
|
||||
if (task->gfxVersion() == GfxVersion::gfx90a) {
|
||||
if (task->gfxVersion() == GfxVersion::gfx90a ||
|
||||
task->gfxVersion() == GfxVersion::gfx942) {
|
||||
packed_work_item_id = true;
|
||||
}
|
||||
|
||||
|
||||
@@ -92,6 +92,8 @@ class Wavefront : public SimObject
|
||||
S_BARRIER
|
||||
};
|
||||
|
||||
// gfx version wavefront is executing
|
||||
GfxVersion gfxVersion;
|
||||
// HW slot id where the WF is mapped to inside a SIMD unit
|
||||
const int wfSlotId;
|
||||
int kernId;
|
||||
|
||||
Reference in New Issue
Block a user