dev-amdgpu: Support for ROCm 5.4+ and MI200 (#141)
This commit is contained in:
@@ -179,10 +179,15 @@ def runGpuFSSystem(args):
|
||||
math.ceil(float(n_cu) / args.cu_per_scalar_cache)
|
||||
)
|
||||
|
||||
# Verify MMIO trace is valid
|
||||
mmio_md5 = hashlib.md5(open(args.gpu_mmio_trace, "rb").read()).hexdigest()
|
||||
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
|
||||
m5.util.panic("MMIO file does not match gem5 resources")
|
||||
# Verify MMIO trace is valid. This is only needed for Vega10 simulations.
|
||||
# The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
|
||||
# the gem5-resources repository. By checking it here, we avoid potential
|
||||
# errors that would cause the driver not to load and simulations to fail.
|
||||
if args.gpu_device == "Vega10":
|
||||
mmio_file = open(args.gpu_mmio_trace, "rb")
|
||||
mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
|
||||
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
|
||||
m5.util.panic("MMIO file does not match gem5 resources")
|
||||
|
||||
system = makeGpuFSSystem(args)
|
||||
|
||||
|
||||
@@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
|
||||
Addr part2 = 0;
|
||||
PageDirectoryEntry pde = static_cast<PageDirectoryEntry>(pte);
|
||||
|
||||
// For a four level page table block fragment size should not be needed.
|
||||
// For now issue a panic to prevent strange behavior if it is non-zero.
|
||||
panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0");
|
||||
// Block fragment size can change the size of the pages pointed to while
|
||||
// moving to the next PDE. A value of 0 implies native page size. A
|
||||
// non-zero value implies the next leaf in the page table is a PTE unless
|
||||
// the F bit is set. If we see a non-zero value, set it here and print
|
||||
// for debugging.
|
||||
if (pde.blockFragmentSize) {
|
||||
DPRINTF(GPUPTWalker,
|
||||
"blockFragmentSize: %d, pde: %#016lx, state: %d\n",
|
||||
pde.blockFragmentSize, pde, state);
|
||||
blockFragmentSize = pde.blockFragmentSize;
|
||||
|
||||
// At this time, only a value of 9 is used in the driver:
|
||||
// https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/
|
||||
// amd/amdgpu/gmc_v9_0.c#L1165
|
||||
assert(pde.blockFragmentSize == 9);
|
||||
}
|
||||
|
||||
switch(state) {
|
||||
case PDE2:
|
||||
@@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
|
||||
nextState = PDE0;
|
||||
break;
|
||||
case PDE0:
|
||||
if (pde.p) {
|
||||
if (pde.p || (blockFragmentSize && !pte.f)) {
|
||||
DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n",
|
||||
(uint64_t)pte, pte.fragment);
|
||||
entry.pte = pte;
|
||||
@@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
|
||||
}
|
||||
// Read the PteAddr
|
||||
part1 = ((((uint64_t)pte) >> 6) << 3);
|
||||
part2 = offsetFunc(vaddr, 9, 0);
|
||||
if (pte.f) {
|
||||
// For F bit we want to use the blockFragmentSize in the previous
|
||||
// PDE and the blockFragmentSize in this PTE for offset function.
|
||||
part2 = offsetFunc(vaddr,
|
||||
blockFragmentSize,
|
||||
pde.blockFragmentSize);
|
||||
} else {
|
||||
part2 = offsetFunc(vaddr, 9, 0);
|
||||
}
|
||||
nextRead = ((part1 + part2) << 3) & mask(48);
|
||||
DPRINTF(GPUPTWalker,
|
||||
"Got PDE0 entry %#016x. write:%s->%#016x va:%#016x\n",
|
||||
|
||||
@@ -99,11 +99,13 @@ class Walker : public ClockedObject
|
||||
bool started;
|
||||
bool timing;
|
||||
PacketPtr tlbPkt;
|
||||
int blockFragmentSize;
|
||||
|
||||
public:
|
||||
WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false)
|
||||
: walker(_walker), state(Ready), nextState(Ready), dataSize(8),
|
||||
enableNX(true), retrying(false), started(false), tlbPkt(pkt)
|
||||
enableNX(true), retrying(false), started(false), tlbPkt(pkt),
|
||||
blockFragmentSize(0)
|
||||
{
|
||||
DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n",
|
||||
this, walker, state);
|
||||
|
||||
@@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header)
|
||||
dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer);
|
||||
} break;
|
||||
case SDMA_OP_CONST_FILL: {
|
||||
q->incRptr(sizeof(sdmaConstFill));
|
||||
warn("SDMA_OP_CONST_FILL not implemented");
|
||||
decodeNext(q);
|
||||
DPRINTF(SDMAEngine, "SDMA Constant fill packet\n");
|
||||
dmaBuffer = new sdmaConstFill();
|
||||
cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &)
|
||||
{ constFill(q, (sdmaConstFill *)dmaBuffer, header); });
|
||||
dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer);
|
||||
} break;
|
||||
case SDMA_OP_PTEPDE: {
|
||||
DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n");
|
||||
@@ -1026,6 +1029,68 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
void
|
||||
SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
|
||||
{
|
||||
q->incRptr(sizeof(sdmaConstFill));
|
||||
|
||||
sdmaConstFillHeader fill_header;
|
||||
fill_header.ordinal = header;
|
||||
|
||||
DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n",
|
||||
pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize,
|
||||
fill_header.sw);
|
||||
|
||||
// Count is number of <size> elements - 1. Size is log2 of byte size.
|
||||
int fill_bytes = (pkt->count + 1) * (1 << fill_header.fillsize);
|
||||
uint8_t *fill_data = new uint8_t[fill_bytes];
|
||||
|
||||
memset(fill_data, pkt->srcData, fill_bytes);
|
||||
|
||||
Addr device_addr = getDeviceAddress(pkt->addr);
|
||||
if (device_addr) {
|
||||
DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n",
|
||||
fill_bytes, pkt->srcData, pkt->addr);
|
||||
|
||||
auto cb = new EventFunctionWrapper(
|
||||
[ = ]{ constFillDone(q, pkt, fill_data); }, name());
|
||||
|
||||
// Copy the minimum page size at a time in case the physical addresses
|
||||
// are not contiguous.
|
||||
ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = getDeviceAddress(gen.addr());
|
||||
assert(chunk_addr);
|
||||
|
||||
DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
|
||||
gen.size(), gen.addr(), chunk_addr);
|
||||
|
||||
gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data,
|
||||
gen.size(), 0,
|
||||
gen.last() ? cb : nullptr);
|
||||
fill_data += gen.size();
|
||||
}
|
||||
} else {
|
||||
DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
|
||||
fill_bytes, pkt->srcData, pkt->addr);
|
||||
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &)
|
||||
{ constFillDone(q, pkt, fill_data); });
|
||||
dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
|
||||
{
|
||||
DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
|
||||
|
||||
delete fill_data;
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
AddrRangeList
|
||||
SDMAEngine::getAddrRanges() const
|
||||
{
|
||||
|
||||
@@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice
|
||||
uint64_t *dmaBuffer);
|
||||
void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
|
||||
uint64_t *dmaBuffer);
|
||||
void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header);
|
||||
void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data);
|
||||
|
||||
/**
|
||||
* Methods for getting SDMA MMIO base address and size. These are set by
|
||||
|
||||
@@ -37,7 +37,7 @@ namespace gem5
|
||||
{
|
||||
|
||||
/**
|
||||
* SDMA packets
|
||||
* SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime
|
||||
*/
|
||||
typedef struct GEM5_PACKED
|
||||
{
|
||||
@@ -80,6 +80,23 @@ typedef struct GEM5_PACKED
|
||||
} sdmaConstFill;
|
||||
static_assert(sizeof(sdmaConstFill) == 16);
|
||||
|
||||
/**
 * Header dword of an SDMA constant fill packet.
 *
 * The same 32 bits can be accessed either as the raw `ordinal` value or
 * through the individual bitfields below.
 */
typedef struct GEM5_PACKED
{
    union
    {
        // Bitfield view of the header; the widths add up to 32 bits.
        struct
        {
            uint32_t op       : 8;   // presumably the packet opcode
            uint32_t sub_op   : 8;   // presumably the packet sub-opcode
            uint32_t sw       : 2;   // NOTE(review): meaning not shown here
            uint32_t res0     : 12;  // presumably reserved
            uint32_t fillsize : 2;   // used by constFill() as log2 of the
                                     // fill element size in bytes
        };
        // Raw 32-bit view of the whole header.
        uint32_t ordinal;
    };
} sdmaConstFillHeader;
static_assert(sizeof(sdmaConstFillHeader) == 4);
|
||||
|
||||
typedef struct GEM5_PACKED
|
||||
{
|
||||
uint32_t key0;
|
||||
|
||||
Reference in New Issue
Block a user