dev-amdgpu: Support for ROCm 5.4+ and MI200 (#141)

This commit is contained in:
Bobby R. Bruce
2023-07-31 10:24:46 -07:00
committed by GitHub
6 changed files with 126 additions and 14 deletions

View File

@@ -179,10 +179,15 @@ def runGpuFSSystem(args):
math.ceil(float(n_cu) / args.cu_per_scalar_cache)
)
# Verify MMIO trace is valid
mmio_md5 = hashlib.md5(open(args.gpu_mmio_trace, "rb").read()).hexdigest()
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
m5.util.panic("MMIO file does not match gem5 resources")
# Verify MMIO trace is valid. This is only needed for Vega10 simulations.
# The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
# the gem5-resources repository. By checking it here, we avoid potential
# errors that would cause the driver not to load and simulations to fail.
if args.gpu_device == "Vega10":
mmio_file = open(args.gpu_mmio_trace, "rb")
mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
m5.util.panic("MMIO file does not match gem5 resources")
system = makeGpuFSSystem(args)

View File

@@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
Addr part2 = 0;
PageDirectoryEntry pde = static_cast<PageDirectoryEntry>(pte);
// For a four level page table block fragment size should not be needed.
// For now issue a panic to prevent strange behavior if it is non-zero.
panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0");
// Block fragment size can change the size of the pages pointed to while
// moving to the next PDE. A value of 0 implies native page size. A
// non-zero value implies the next leaf in the page table is a PTE unless
// the F bit is set. If we see a non-zero value, set it here and print
// for debugging.
if (pde.blockFragmentSize) {
DPRINTF(GPUPTWalker,
"blockFragmentSize: %d, pde: %#016lx, state: %d\n",
pde.blockFragmentSize, pde, state);
blockFragmentSize = pde.blockFragmentSize;
// At this time, only a value of 9 is used in the driver:
// https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/
// amd/amdgpu/gmc_v9_0.c#L1165
assert(pde.blockFragmentSize == 9);
}
switch(state) {
case PDE2:
@@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
nextState = PDE0;
break;
case PDE0:
if (pde.p) {
if (pde.p || (blockFragmentSize && !pte.f)) {
DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n",
(uint64_t)pte, pte.fragment);
entry.pte = pte;
@@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
}
// Read the PteAddr
part1 = ((((uint64_t)pte) >> 6) << 3);
part2 = offsetFunc(vaddr, 9, 0);
if (pte.f) {
// For F bit we want to use the blockFragmentSize in the previous
// PDE and the blockFragmentSize in this PTE for offset function.
part2 = offsetFunc(vaddr,
blockFragmentSize,
pde.blockFragmentSize);
} else {
part2 = offsetFunc(vaddr, 9, 0);
}
nextRead = ((part1 + part2) << 3) & mask(48);
DPRINTF(GPUPTWalker,
"Got PDE0 entry %#016x. write:%s->%#016x va:%#016x\n",

View File

@@ -99,11 +99,13 @@ class Walker : public ClockedObject
bool started;
bool timing;
PacketPtr tlbPkt;
int blockFragmentSize;
public:
WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false)
: walker(_walker), state(Ready), nextState(Ready), dataSize(8),
enableNX(true), retrying(false), started(false), tlbPkt(pkt)
enableNX(true), retrying(false), started(false), tlbPkt(pkt),
blockFragmentSize(0)
{
DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n",
this, walker, state);

View File

@@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header)
dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer);
} break;
case SDMA_OP_CONST_FILL: {
q->incRptr(sizeof(sdmaConstFill));
warn("SDMA_OP_CONST_FILL not implemented");
decodeNext(q);
DPRINTF(SDMAEngine, "SDMA Constant fill packet\n");
dmaBuffer = new sdmaConstFill();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ constFill(q, (sdmaConstFill *)dmaBuffer, header); });
dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer);
} break;
case SDMA_OP_PTEPDE: {
DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n");
@@ -1026,6 +1029,68 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
decodeNext(q);
}
void
SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
{
    q->incRptr(sizeof(sdmaConstFill));

    // The fill element size and swizzle bits live in the packet's header
    // ordinal, which was already consumed by decodeHeader.
    sdmaConstFillHeader fill_header;
    fill_header.ordinal = header;

    DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n",
            pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize,
            fill_header.sw);

    // Count is number of <size> elements - 1. Size is log2 of byte size.
    int elem_size = 1 << fill_header.fillsize;
    int fill_bytes = (pkt->count + 1) * elem_size;
    uint8_t *fill_data = new uint8_t[fill_bytes];

    // Replicate the fill pattern across the buffer. srcData carries up to
    // sizeof(srcData) pattern bytes; a plain memset would replicate only
    // the lowest byte, which is wrong whenever the pattern bytes differ.
    // Cap the pattern length at sizeof(srcData) so an element size larger
    // than the source field cannot read past it.
    int pattern_bytes = elem_size < int(sizeof(pkt->srcData)) ?
                        elem_size : int(sizeof(pkt->srcData));
    const uint8_t *pattern = reinterpret_cast<const uint8_t *>(&pkt->srcData);
    for (int i = 0; i < fill_bytes; ++i) {
        fill_data[i] = pattern[i % pattern_bytes];
    }

    Addr device_addr = getDeviceAddress(pkt->addr);
    if (device_addr) {
        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n",
                fill_bytes, pkt->srcData, pkt->addr);

        // Frees fill_data/pkt and decodes the next packet; attached only
        // to the final chunk below so it fires exactly once.
        auto cb = new EventFunctionWrapper(
            [ = ]{ constFillDone(q, pkt, fill_data); }, name());

        // Copy the minimum page size at a time in case the physical
        // addresses are not contiguous. Use a cursor so fill_data itself
        // (captured by the callback above) keeps pointing at the
        // allocation base for deletion.
        uint8_t *src = fill_data;
        ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
        for (; !gen.done(); gen.next()) {
            Addr chunk_addr = getDeviceAddress(gen.addr());
            assert(chunk_addr);

            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
                    gen.size(), gen.addr(), chunk_addr);

            gpuDevice->getMemMgr()->writeRequest(chunk_addr, src,
                                                 gen.size(), 0,
                                                 gen.last() ? cb : nullptr);
            src += gen.size();
        }
    } else {
        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
                fill_bytes, pkt->srcData, pkt->addr);

        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &)
                { constFillDone(q, pkt, fill_data); });
        dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data);
    }
}
void
SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
{
    DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);

    // fill_data was allocated with new[] in constFill, so it must be
    // released with delete[]; a scalar delete on an array allocation is
    // undefined behavior.
    delete [] fill_data;
    delete pkt;
    decodeNext(q);
}
AddrRangeList
SDMAEngine::getAddrRanges() const
{

View File

@@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice
uint64_t *dmaBuffer);
void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
uint64_t *dmaBuffer);
void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header);
void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data);
/**
* Methods for getting SDMA MMIO base address and size. These are set by

View File

@@ -37,7 +37,7 @@ namespace gem5
{
/**
* SDMA packets
* SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime
*/
typedef struct GEM5_PACKED
{
@@ -80,6 +80,23 @@ typedef struct GEM5_PACKED
} sdmaConstFill;
static_assert(sizeof(sdmaConstFill) == 16);
/**
 * Header DW (first 32-bit ordinal) of the SDMA constant fill packet.
 * The union overlays named bitfields on the raw header word so the fill
 * element size and swizzle can be extracted after assigning to ordinal.
 * Field layout presumably mirrors SDMA_PKT_CONSTANT_FILL in ROCR-Runtime's
 * sdma_registers.h — verify against that header if extending.
 */
typedef struct GEM5_PACKED
{
    union
    {
        struct
        {
            uint32_t op : 8;       // packet opcode — presumably matches
                                   // SDMA_OP_CONST_FILL; confirm vs. ROCR
            uint32_t sub_op : 8;   // sub-opcode (unused by gem5's decoder)
            uint32_t sw : 2;       // swizzle bits; only printed for debug
            uint32_t res0 : 12;    // reserved
            uint32_t fillsize : 2; // log2 of the fill element byte size
        };
        uint32_t ordinal;          // raw header DW as read from the queue
    };
} sdmaConstFillHeader;
static_assert(sizeof(sdmaConstFillHeader) == 4);
typedef struct GEM5_PACKED
{
uint32_t key0;