dev-amdgpu,gpu-compute,configs: MI300X (#1141)

Release of MI300X simulation capability:

- Implements the required MI300X features over MI200 (currently only
architected flat scratch).
- Make the gpu-compute model use MI200 features when MI300X / gfx942 is
configured.
- Fix up the scratch_ instructions, which seem to be preferred over
buffer_ in debug hipcc builds.
- Add mi300.py config similar to mi200.py. This config can optionally
use resources instead of command line args.
This commit is contained in:
Matthew Poremba
2024-05-17 09:26:04 -07:00
committed by GitHub
16 changed files with 371 additions and 71 deletions

View File

@@ -9922,29 +9922,25 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
@@ -9977,29 +9973,25 @@ namespace VegaISA
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt)
{
fatal("Trying to decode instruction without a class\n");
return nullptr;
return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT);
}
GPUStaticInst*

View File

@@ -1258,13 +1258,12 @@ namespace VegaISA
// If saddr = 0x7f there is no scalar reg to read and address will
// be a 64-bit address. Otherwise, saddr is the reg index for a
// scalar reg used as the base address for a 32-bit address.
if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
|| isFlat()) {
if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
ConstVecOperandU64 vbase(gpuDynInst, vaddr);
vbase.read();
calcAddrVgpr(gpuDynInst, vbase, offset);
} else {
} else if (isFlatGlobal()) {
// Assume we are operating in 64-bit mode and read a pair of
// SGPRs for the address base.
ConstScalarOperandU64 sbase(gpuDynInst, saddr);
@@ -1274,6 +1273,57 @@ namespace VegaISA
voffset.read();
calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
// For scratch, saddr = 0x7f there is no scalar reg to read and
// a vgpr will be used for address offset. Otherwise, saddr is
// the sgpr index holding the address offset. For scratch
// instructions the offset GPR is always 32-bits.
} else if (saddr != 0x7f) {
assert(isFlatScratch());
ConstScalarOperandU32 soffset(gpuDynInst, saddr);
soffset.read();
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
elemSize = staticInst->getOperandSize(2);
} else {
assert(gpuDynInst->isStore());
elemSize = staticInst->getOperandSize(1);
}
unsigned swizzleOffset = soffset.rawData() + offset;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(swizzleOffset, lane, elemSize);
}
}
} else {
assert(isFlatScratch());
ConstVecOperandU32 voffset(gpuDynInst, vaddr);
voffset.read();
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
elemSize = staticInst->getOperandSize(2);
} else {
assert(gpuDynInst->isStore());
elemSize = staticInst->getOperandSize(1);
}
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(voffset[lane] + offset, lane, elemSize);
}
}
}
if (isFlat()) {
@@ -1285,6 +1335,7 @@ namespace VegaISA
assert(isFlatScratch());
gpuDynInst->staticInstruction()->executed_as =
enums::SC_PRIVATE;
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
}
}
@@ -1421,6 +1472,23 @@ namespace VegaISA
}
}
}
VecElemU32
swizzle(VecElemU32 offset, int lane, int elem_size)
{
// This is not described in the spec. We use the swizzle from
// buffer memory instructions and fix the stride to 4. Multiply
// the thread ID by the storage size to avoid threads clobbering
// their data.
return ((offset / 4) * 4 * 64)
+ (offset % 4) + (lane * elem_size);
}
Addr
readFlatScratch(GPUDynInstPtr gpuDynInst)
{
return gpuDynInst->computeUnit()->shader->getScratchBase();
}
}; // Inst_FLAT
} // namespace VegaISA
} // namespace gem5

View File

@@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
gfx_version = GfxVersion::gfx908;
} else if (p.device_name == "MI200") {
gfx_version = GfxVersion::gfx90a;
} else if (p.device_name == "MI300X") {
gfx_version = GfxVersion::gfx942;
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
@@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
} else if (p.device_name == "MI100" || p.device_name == "MI200") {
} else if (p.device_name == "MI100" || p.device_name == "MI200"
|| p.device_name == "MI300X") {
sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
@@ -195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else if (p.device_name == "MI300X") {
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else {
panic("Unknown GPU device %s\n", p.device_name);
}

View File

@@ -328,8 +328,8 @@ typedef struct GEM5_PACKED
};
uint64_t completionSignal;
};
} PM4MapProcessMI200;
static_assert(sizeof(PM4MapProcessMI200) == 80);
} PM4MapProcessV2;
static_assert(sizeof(PM4MapProcessV2) == 80);
typedef struct GEM5_PACKED
{

View File

@@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
dmaBuffer);
} break;
case IT_MAP_PROCESS: {
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) {
dmaBuffer = new PM4MapProcessMI200();
if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
gpuDevice->getGfxVersion() == GfxVersion::gfx942) {
dmaBuffer = new PM4MapProcessV2();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200),
{ mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2),
cb, dmaBuffer);
} else {
dmaBuffer = new PM4MapProcess();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); });
{ mapProcessV1(q, (PM4MapProcess *)dmaBuffer); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
dmaBuffer);
}
@@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
}
void
PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
{
q->incRptr(sizeof(PM4MapProcess));
@@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt)
}
void
PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt)
PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
{
q->incRptr(sizeof(PM4MapProcessMI200));
q->incRptr(sizeof(PM4MapProcessV2));
DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
"%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,

View File

@@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
void doneMQDWrite(Addr mqdAddr, Addr addr);
void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases);
void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt);
void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt);
void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt);
void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt);
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd,
uint16_t vmid);
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,

View File

@@ -45,7 +45,7 @@ class PrefetchType(Enum):
class GfxVersion(ScopedEnum):
vals = ["gfx900", "gfx902", "gfx908", "gfx90a"]
vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"]
class PoolManager(SimObject):

View File

@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
* #flat-addressing
*/
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 4);
uint32_t offset =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 3);
uint32_t size =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
addr[lane] = addr[lane] + lane * size + offset +
wavefront()->computeUnit->shader->getHiddenPrivateBase() -
wavefront()->computeUnit->shader->getScratchBase();
ComputeUnit *cu = wavefront()->computeUnit;
if (wavefront()->gfxVersion == GfxVersion::gfx942) {
// Architected flat scratch base address in FLAT_SCRATCH registers
uint32_t fs_lo = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_LO);
uint32_t fs_hi = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_HI);
Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
// The scratch base is added for other gfx versions,
// otherwise this would simply add the register base.
addr[lane] = addr[lane] - cu->shader->getScratchBase()
+ arch_flat_scratch;
}
}
} else {
// In absolute flat scratch the program needs to place scratch
// address in SGPRn-3,4.
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
physSgprIdx =
cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
uint32_t size = cu->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
addr[lane] = addr[lane] + lane * size + offset +
cu->shader->getHiddenPrivateBase() -
cu->shader->getScratchBase();
}
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
wavefront()->execUnitId = wavefront()->flatLmUnitId;
// For FLAT the local memory pipe counters are incremented, but they
// are not incremented for explicit scratch_* instructions. Only
// decrement these counters if we are explicitly a FLAT instruction.
if (isFlat()) {
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}
}
} else {
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {

View File

@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
{
return _flags[MemoryRef] && (_flags[GlobalSegment] ||
_flags[PrivateSegment] || _flags[ReadOnlySegment] ||
_flags[SpillSegment] || _flags[FlatGlobal]);
_flags[SpillSegment] || _flags[FlatGlobal] ||
_flags[FlatScratch]);
}
bool

View File

@@ -94,9 +94,10 @@ class HSAQueueEntry
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
// #code-object-v3-kernel-descriptor
//
// Currently, the only supported gfx version in gem5 that computes
// VGPR count differently is gfx90a.
if (gfx_version == GfxVersion::gfx90a) {
// Currently, the only supported gfx versions in gem5 that compute
// VGPR count differently are gfx90a and gfx942.
if (gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
} else {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
@@ -107,7 +108,8 @@ class HSAQueueEntry
if (gfx_version == GfxVersion::gfx900 ||
gfx_version == GfxVersion::gfx902 ||
gfx_version == GfxVersion::gfx908 ||
gfx_version == GfxVersion::gfx90a) {
gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
} else {
panic("Saw unknown gfx version setting up GPR counts\n");

View File

@@ -118,6 +118,7 @@ void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
int regInitIdx = 0;
gfxVersion = task->gfxVersion();
// Iterate over all the init fields and check which
// bits are enabled. Useful information can be found here:
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
break;
case PrivSegWaveByteOffset:
// For architected flat scratch, this enable is reused to set
// the FLAT_SCRATCH register pair to the scratch backing
// memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
if (task->gfxVersion() == GfxVersion::gfx942) {
Addr arch_flat_scratch =
task->amdQueue.scratch_backing_memory_location;
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_HI,
bits(arch_flat_scratch, 63, 32));
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_LO,
bits(arch_flat_scratch, 31, 0));
break;
}
// Not architected flat scratch. Write the scratch wavefront
// offset: https://llvm.org/docs/AMDGPUUsage.html
// #amdgpu-amdhsa-initial-kernel-execution-state
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
/**
* the compute_tmpring_size_wavesize specifies the number of
* kB allocated per wavefront, hence the multiplication by
@@ -442,7 +464,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
// Default to false and set to true for gem5 supported ISAs.
bool packed_work_item_id = false;
if (task->gfxVersion() == GfxVersion::gfx90a) {
if (task->gfxVersion() == GfxVersion::gfx90a ||
task->gfxVersion() == GfxVersion::gfx942) {
packed_work_item_id = true;
}

View File

@@ -92,6 +92,8 @@ class Wavefront : public SimObject
S_BARRIER
};
// gfx version wavefront is executing
GfxVersion gfxVersion;
// HW slot id where the WF is mapped to inside a SIMD unit
const int wfSlotId;
int kernId;