dev-amdgpu: Refactor MMIO interface for SDMA engines

Currently the amdgpu simulated device is assumed to be a Vega10. As a
result there are a few things that are hardcoded. One of those is the
number of SDMAs. In order to add a newer device, such as MI100+, we need
to enable a flexible number of SDMAs.

In order to support a variable number of SDMAs and with the MMIO offsets
of each device being potentially different, the MMIO interface for SDMAs
is changed to use an SDMA class method dispatch table which forwards a
32-bit value from the MMIO packet to the MMIO functions in SDMA of the
format `void method(uint32_t)`. Several changes are made to enable this:

 - Allow the SDMA to have a variable MMIO base and size. These are
   configured in python.
 - An SDMA class method dispatch table which contains the MMIO offset
   relative to the SDMA's MMIO base address.
 - An updated writeMMIO method to iterate over the SDMA MMIO address
   ranges and call the appropriate SDMA MMIO method which matches the
   MMIO offset.
 - Moved all SDMA related MMIO data bit twiddling, masking, etc. into
   the MMIO methods themselves instead of in the writeMMIO method in
   SDMAEngine.

Change-Id: Ifce626f84d52f9e27e4438ba4e685e30dbf06dbc
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70040
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
Matthew Poremba
2023-04-21 19:15:40 -05:00
parent 6c1b95ea41
commit 8b91ac6f8d
8 changed files with 182 additions and 57 deletions

View File

@@ -129,15 +129,45 @@ def makeGpuFSSystem(args):
device_ih = AMDGPUInterruptHandler()
system.pc.south_bridge.gpu.device_ih = device_ih
# Setup the SDMA engines
sdma0_pt_walker = VegaPagetableWalker()
sdma1_pt_walker = VegaPagetableWalker()
# Setup the SDMA engines depending on device. The MMIO base addresses
# can be found in the driver code under:
# include/asic_reg/sdmaX/sdmaX_Y_Z_offset.h
num_sdmas = 2
sdma_bases = []
sdma_sizes = []
if args.gpu_device == "Vega10":
num_sdmas = 2
sdma_bases = [0x4980, 0x5180]
sdma_sizes = [0x800] * 2
elif args.gpu_device == "MI100":
num_sdmas = 8
sdma_bases = [
0x4980,
0x6180,
0x78000,
0x79000,
0x7A000,
0x7B000,
0x7C000,
0x7D000,
]
sdma_sizes = [0x1000] * 8
else:
m5.util.panic(f"Unknown GPU device {args.gpu_device}")
sdma0 = SDMAEngine(walker=sdma0_pt_walker)
sdma1 = SDMAEngine(walker=sdma1_pt_walker)
sdma_pt_walkers = []
sdma_engines = []
for sdma_idx in range(num_sdmas):
sdma_pt_walker = VegaPagetableWalker()
sdma_engine = SDMAEngine(
walker=sdma_pt_walker,
mmio_base=sdma_bases[sdma_idx],
mmio_size=sdma_sizes[sdma_idx],
)
sdma_pt_walkers.append(sdma_pt_walker)
sdma_engines.append(sdma_engine)
system.pc.south_bridge.gpu.sdma0 = sdma0
system.pc.south_bridge.gpu.sdma1 = sdma1
system.pc.south_bridge.gpu.sdmas = sdma_engines
# Setup PM4 packet processor
pm4_pkt_proc = PM4PacketProcessor()
@@ -155,22 +185,22 @@ def makeGpuFSSystem(args):
system._dma_ports.append(gpu_hsapp)
system._dma_ports.append(gpu_cmd_proc)
system._dma_ports.append(system.pc.south_bridge.gpu)
system._dma_ports.append(sdma0)
system._dma_ports.append(sdma1)
for sdma in sdma_engines:
system._dma_ports.append(sdma)
system._dma_ports.append(device_ih)
system._dma_ports.append(pm4_pkt_proc)
system._dma_ports.append(system_hub)
system._dma_ports.append(gpu_mem_mgr)
system._dma_ports.append(hsapp_pt_walker)
system._dma_ports.append(cp_pt_walker)
system._dma_ports.append(sdma0_pt_walker)
system._dma_ports.append(sdma1_pt_walker)
for sdma_pt_walker in sdma_pt_walkers:
system._dma_ports.append(sdma_pt_walker)
gpu_hsapp.pio = system.iobus.mem_side_ports
gpu_cmd_proc.pio = system.iobus.mem_side_ports
system.pc.south_bridge.gpu.pio = system.iobus.mem_side_ports
sdma0.pio = system.iobus.mem_side_ports
sdma1.pio = system.iobus.mem_side_ports
for sdma in sdma_engines:
sdma.pio = system.iobus.mem_side_ports
device_ih.pio = system.iobus.mem_side_ports
pm4_pkt_proc.pio = system.iobus.mem_side_ports
system_hub.pio = system.iobus.mem_side_ports

View File

@@ -79,11 +79,9 @@ class AMDGPUDevice(PciDevice):
False, "Take a checkpoint before the device begins sending MMIOs"
)
# Specific to Vega10: Vega10 has two SDMA engines these do not have any
# assigned function and are referenced by ID so they are given the generic
# names sdma0, sdma1, ... sdmaN.
sdma0 = Param.SDMAEngine("SDMA Engine 0")
sdma1 = Param.SDMAEngine("SDMA Engine 1")
# SDMA engines. The number of engines depends on the device,
# therefore an array is used.
sdmas = VectorParam.SDMAEngine("All SDMA Engines")
# The cp is needed here to handle certain packets the device may receive.
# The config script should not create a new cp here but rather assign the
@@ -100,6 +98,9 @@ class SDMAEngine(DmaVirtDevice):
cxx_header = "dev/amdgpu/sdma_engine.hh"
cxx_class = "gem5::SDMAEngine"
mmio_base = Param.Addr(0x0, "Base MMIO Address")
mmio_size = Param.Addr(0x800, "Size of MMIO range")
gpu_device = Param.AMDGPUDevice(NULL, "GPU Controller")
walker = Param.VegaPagetableWalker("Page table walker")

View File

@@ -53,7 +53,7 @@ namespace gem5
AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
: PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih),
sdma0(p.sdma0), sdma1(p.sdma1), pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
checkpoint_before_mmios(p.checkpoint_before_mmios),
init_interrupt_count(0), _lastVMID(0),
deviceMem(name() + ".deviceMem", p.memories, false, "", false)
@@ -84,10 +84,47 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
mmioReader.readMMIOTrace(p.trace_file);
}
sdma0->setGPUDevice(this);
sdma0->setId(0);
sdma1->setGPUDevice(this);
sdma1->setId(1);
int sdma_id = 0;
for (auto& s : p.sdmas) {
s->setGPUDevice(this);
s->setId(sdma_id);
sdmaIds.insert({sdma_id, s});
sdmaMmios.insert({sdma_id,
RangeSize(s->getMmioBase(), s->getMmioSize())});
DPRINTF(AMDGPUDevice, "SDMA%d has MMIO range %s\n", sdma_id,
sdmaMmios[sdma_id].to_string().c_str());
sdma_id++;
}
// Map SDMA MMIO addresses to functions
sdmaFunc.insert({0x81, &SDMAEngine::setGfxBaseLo});
sdmaFunc.insert({0x82, &SDMAEngine::setGfxBaseHi});
sdmaFunc.insert({0x88, &SDMAEngine::setGfxRptrHi});
sdmaFunc.insert({0x89, &SDMAEngine::setGfxRptrLo});
sdmaFunc.insert({0x92, &SDMAEngine::setGfxDoorbellLo});
sdmaFunc.insert({0xab, &SDMAEngine::setGfxDoorbellOffsetLo});
sdmaFunc.insert({0x80, &SDMAEngine::setGfxSize});
sdmaFunc.insert({0xb2, &SDMAEngine::setGfxWptrLo});
sdmaFunc.insert({0xb3, &SDMAEngine::setGfxWptrHi});
if (p.device_name == "Vega10") {
sdmaFunc.insert({0xe1, &SDMAEngine::setPageBaseLo});
sdmaFunc.insert({0xe9, &SDMAEngine::setPageRptrLo});
sdmaFunc.insert({0xe8, &SDMAEngine::setPageRptrHi});
sdmaFunc.insert({0xf2, &SDMAEngine::setPageDoorbellLo});
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
} else if (p.device_name == "MI100") {
sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
sdmaFunc.insert({0xea, &SDMAEngine::setPageDoorbellLo});
sdmaFunc.insert({0xd8, &SDMAEngine::setPageDoorbellOffsetLo});
sdmaFunc.insert({0x10b, &SDMAEngine::setPageWptrLo});
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
deviceIH->setGPUDevice(this);
pm4PktProc->setGPUDevice(this);
cp->hsaPacketProc().setGPUDevice(this);
@@ -351,15 +388,25 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);
// Check SDMA functions first, then fall back to the switch statement
for (int idx = 0; idx < sdmaIds.size(); ++idx) {
if (sdmaMmios[idx].contains(offset)) {
Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2;
if (sdmaFunc.count(sdma_offset)) {
DPRINTF(AMDGPUDevice, "Calling SDMA%d MMIO function %lx\n",
idx, sdma_offset);
sdmaFuncPtr mptr = sdmaFunc[sdma_offset];
(getSDMAById(idx)->*mptr)(pkt->getLE<uint32_t>());
} else {
DPRINTF(AMDGPUDevice, "Unknown SDMA%d MMIO: %#lx\n", idx,
sdma_offset);
}
return;
}
}
switch (aperture) {
/* Write a register to the first System DMA. */
case SDMA0_BASE:
sdma0->writeMMIO(pkt, aperture_offset >> SDMA_OFFSET_SHIFT);
break;
/* Write a register to the second System DMA. */
case SDMA1_BASE:
sdma1->writeMMIO(pkt, aperture_offset >> SDMA_OFFSET_SHIFT);
break;
/* Write a general register to the graphics register bus manager. */
case GRBM_BASE:
gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
@@ -483,19 +530,9 @@ AMDGPUDevice::getSDMAById(int id)
* PM4 packets selected SDMAs using an integer ID. This method simply maps
* the integer ID to a pointer to the SDMA and checks for invalid IDs.
*/
switch (id) {
case 0:
return sdma0;
break;
case 1:
return sdma1;
break;
default:
panic("No SDMA with id %d\n", id);
break;
}
assert(sdmaIds.count(id));
return nullptr;
return sdmaIds[id];
}
SDMAEngine*
@@ -549,7 +586,7 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
idx = 0;
for (auto & it : sdmaEngs) {
sdma_engs_offset[idx] = it.first;
sdma_engs[idx] = it.second == sdma0 ? 0 : 1;
sdma_engs[idx] = idx;
++idx;
}
@@ -620,7 +657,8 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
UNSERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
for (int idx = 0; idx < sdma_engs_size; ++idx) {
SDMAEngine *sdma = sdma_engs[idx] == 0 ? sdma0 : sdma1;
assert(sdmaIds.count(idx));
SDMAEngine *sdma = sdmaIds[idx];
sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma));
}
}
@@ -669,8 +707,9 @@ AMDGPUDevice::deallocateAllQueues()
idMap.erase(idMap.begin(), idMap.end());
usedVMIDs.erase(usedVMIDs.begin(), usedVMIDs.end());
sdma0->deallocateRLCQueues();
sdma1->deallocateRLCQueues();
for (auto& it : sdmaEngs) {
it.second->deallocateRLCQueues();
}
}
void

View File

@@ -109,12 +109,19 @@ class AMDGPUDevice : public PciDevice
AMDGPUMemoryManager *gpuMemMgr;
AMDGPUInterruptHandler *deviceIH;
AMDGPUVM gpuvm;
SDMAEngine *sdma0;
SDMAEngine *sdma1;
std::unordered_map<uint32_t, SDMAEngine *> sdmaEngs;
PM4PacketProcessor *pm4PktProc;
GPUCommandProcessor *cp;
// SDMAs mapped by doorbell offset
std::unordered_map<uint32_t, SDMAEngine *> sdmaEngs;
// SDMAs mapped by ID
std::unordered_map<uint32_t, SDMAEngine *> sdmaIds;
// SDMA ID to MMIO range
std::unordered_map<uint32_t, AddrRange> sdmaMmios;
// SDMA MMIO offset (relative to the SDMA's MMIO base) to function
typedef void (SDMAEngine::*sdmaFuncPtr)(uint32_t);
std::unordered_map<uint32_t, sdmaFuncPtr> sdmaFunc;
/**
* Initial checkpoint support variables.
*/

View File

@@ -80,6 +80,12 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
assert(client_id == SOC15_IH_CLIENTID_RLC ||
client_id == SOC15_IH_CLIENTID_SDMA0 ||
client_id == SOC15_IH_CLIENTID_SDMA1 ||
client_id == SOC15_IH_CLIENTID_SDMA2 ||
client_id == SOC15_IH_CLIENTID_SDMA3 ||
client_id == SOC15_IH_CLIENTID_SDMA4 ||
client_id == SOC15_IH_CLIENTID_SDMA5 ||
client_id == SOC15_IH_CLIENTID_SDMA6 ||
client_id == SOC15_IH_CLIENTID_SDMA7 ||
client_id == SOC15_IH_CLIENTID_GRBM_CP);
assert(source_id == CP_EOP || source_id == TRAP_ID);

View File

@@ -58,6 +58,12 @@ enum soc15_ih_clientid
SOC15_IH_CLIENTID_RLC = 0x07,
SOC15_IH_CLIENTID_SDMA0 = 0x08,
SOC15_IH_CLIENTID_SDMA1 = 0x09,
SOC15_IH_CLIENTID_SDMA2 = 0x01,
SOC15_IH_CLIENTID_SDMA3 = 0x04,
SOC15_IH_CLIENTID_SDMA4 = 0x05,
SOC15_IH_CLIENTID_SDMA5 = 0x11,
SOC15_IH_CLIENTID_SDMA6 = 0x13,
SOC15_IH_CLIENTID_SDMA7 = 0x18,
SOC15_IH_CLIENTID_GRBM_CP = 0x14
};

View File

@@ -49,7 +49,8 @@ SDMAEngine::SDMAEngine(const SDMAEngineParams &p)
: DmaVirtDevice(p), id(0), gfxBase(0), gfxRptr(0),
gfxDoorbell(0), gfxDoorbellOffset(0), gfxWptr(0), pageBase(0),
pageRptr(0), pageDoorbell(0), pageDoorbellOffset(0),
pageWptr(0), gpuDevice(nullptr), walker(p.walker)
pageWptr(0), gpuDevice(nullptr), walker(p.walker),
mmioBase(p.mmio_base), mmioSize(p.mmio_size)
{
gfx.ib(&gfxIb);
gfxIb.parent(&gfx);
@@ -87,6 +88,18 @@ SDMAEngine::getIHClientId()
return SOC15_IH_CLIENTID_SDMA0;
case 1:
return SOC15_IH_CLIENTID_SDMA1;
case 2:
return SOC15_IH_CLIENTID_SDMA2;
case 3:
return SOC15_IH_CLIENTID_SDMA3;
case 4:
return SOC15_IH_CLIENTID_SDMA4;
case 5:
return SOC15_IH_CLIENTID_SDMA5;
case 6:
return SOC15_IH_CLIENTID_SDMA6;
case 7:
return SOC15_IH_CLIENTID_SDMA7;
default:
panic("Unknown SDMA id");
}
@@ -1240,6 +1253,10 @@ SDMAEngine::setGfxDoorbellOffsetLo(uint32_t data)
{
gfxDoorbellOffset = insertBits(gfxDoorbellOffset, 31, 0, 0);
gfxDoorbellOffset |= data;
if (bits(gfxDoorbell, 28, 28)) {
gpuDevice->setDoorbellType(gfxDoorbellOffset, QueueType::SDMAGfx);
gpuDevice->setSDMAEngine(gfxDoorbellOffset, this);
}
}
void
@@ -1250,9 +1267,11 @@ SDMAEngine::setGfxDoorbellOffsetHi(uint32_t data)
}
void
SDMAEngine::setGfxSize(uint64_t data)
SDMAEngine::setGfxSize(uint32_t data)
{
gfx.size(data);
uint32_t rb_size = bits(data, 6, 1);
assert(rb_size >= 6 && rb_size <= 62);
gfx.size(1 << (rb_size + 2));
}
void
@@ -1320,6 +1339,10 @@ SDMAEngine::setPageDoorbellOffsetLo(uint32_t data)
{
pageDoorbellOffset = insertBits(pageDoorbellOffset, 31, 0, 0);
pageDoorbellOffset |= data;
if (bits(pageDoorbell, 28, 28)) {
gpuDevice->setDoorbellType(pageDoorbellOffset, QueueType::SDMAPage);
gpuDevice->setSDMAEngine(pageDoorbellOffset, this);
}
}
void
@@ -1330,9 +1353,11 @@ SDMAEngine::setPageDoorbellOffsetHi(uint32_t data)
}
void
SDMAEngine::setPageSize(uint64_t data)
SDMAEngine::setPageSize(uint32_t data)
{
page.size(data);
uint32_t rb_size = bits(data, 6, 1);
assert(rb_size >= 6 && rb_size <= 62);
page.size(1 << (rb_size + 2));
}
void

View File

@@ -156,6 +156,9 @@ class SDMAEngine : public DmaVirtDevice
void processRLC0(Addr wptrOffset);
void processRLC1(Addr wptrOffset);
Addr mmioBase = 0;
Addr mmioSize = 0;
public:
SDMAEngine(const SDMAEngineParams &p);
@@ -242,6 +245,14 @@ class SDMAEngine : public DmaVirtDevice
void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
uint64_t *dmaBuffer);
/**
* Methods for getting SDMA MMIO base address and size. These are set by
* the python configuration depending on device to allow for flexible base
* addresses depending on what GPU is being simulated.
*/
Addr getMmioBase() { return mmioBase; }
Addr getMmioSize() { return mmioSize; }
/**
* Methods for getting the values of SDMA MMIO registers.
*/
@@ -269,7 +280,7 @@ class SDMAEngine : public DmaVirtDevice
void setGfxDoorbellHi(uint32_t data);
void setGfxDoorbellOffsetLo(uint32_t data);
void setGfxDoorbellOffsetHi(uint32_t data);
void setGfxSize(uint64_t data);
void setGfxSize(uint32_t data);
void setGfxWptrLo(uint32_t data);
void setGfxWptrHi(uint32_t data);
void setPageBaseLo(uint32_t data);
@@ -280,7 +291,7 @@ class SDMAEngine : public DmaVirtDevice
void setPageDoorbellHi(uint32_t data);
void setPageDoorbellOffsetLo(uint32_t data);
void setPageDoorbellOffsetHi(uint32_t data);
void setPageSize(uint64_t data);
void setPageSize(uint32_t data);
void setPageWptrLo(uint32_t data);
void setPageWptrHi(uint32_t data);