From 3589a4c11fb72ee4cb6c1707ed08d9d5eec98d5c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 19 Jul 2023 15:39:51 -0500 Subject: [PATCH] arch-vega: Implement translate further Starting with ROCm 5.4+, MI100 and MI200 make use of the translate further bit in the page table. This bit enables mixing 4kiB and 2MiB pages and is functionally equivalent to mixing page sizes using the PDE.P bit for which gem5 currently has support. With PDE.P bit set, we stop walking and the page size is equal to the level in the page table we stopped at. For example, stopping at level 2 would be a 1GiB page, stopping at level 3 would be a 2MiB page. This assumes most pages are 4kiB. When the F bit is used, it is assumed most pages are 2MiB and we will stop walking at the 3rd level of the page table unless the F bit is set. When the F bit is set, the 2nd level PDE contains a block fragment size representing the page size of the next PDE in the form of 2^(12+size). If the next page has the F bit set we continue walking to the 4th level. The block fragment size is hardcoded to 9 in the driver therefore we assert that the block fragment size must be 0 or 9. This enables MI200 with ROCm 5.4+ in gem5. This functionality was determine by examining the driver source code in Linux and there is no public documentation about this feature or why the change is made in or around ROCm 5.4. Change-Id: I603c0208cd9e821f7ad6eeb1d94ae15eaa146fb9 --- src/arch/amdgpu/vega/pagetable_walker.cc | 31 ++++++++++++++++++++---- src/arch/amdgpu/vega/pagetable_walker.hh | 4 ++- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/arch/amdgpu/vega/pagetable_walker.cc b/src/arch/amdgpu/vega/pagetable_walker.cc index ea4d6b0de9..6a71b14838 100644 --- a/src/arch/amdgpu/vega/pagetable_walker.cc +++ b/src/arch/amdgpu/vega/pagetable_walker.cc @@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, Addr part2 = 0; PageDirectoryEntry pde = static_cast(pte); - // For a four level page table block fragment size should not be needed. - // For now issue a panic to prevent strange behavior if it is non-zero. - panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0"); + // Block fragment size can change the size of the pages pointed to while + // moving to the next PDE. A value of 0 implies native page size. A + // non-zero value implies the next leaf in the page table is a PTE unless + // the F bit is set. If we see a non-zero value, set it here and print + // for debugging. + if (pde.blockFragmentSize) { + DPRINTF(GPUPTWalker, + "blockFragmentSize: %d, pde: %#016lx, state: %d\n", + pde.blockFragmentSize, pde, state); + blockFragmentSize = pde.blockFragmentSize; + + // At this time, only a value of 9 is used in the driver: + // https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/ + // amd/amdgpu/gmc_v9_0.c#L1165 + assert(pde.blockFragmentSize == 9); + } switch(state) { case PDE2: @@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, nextState = PDE0; break; case PDE0: - if (pde.p) { + if (pde.p || (blockFragmentSize && !pte.f)) { DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n", (uint64_t)pte, pte.fragment); entry.pte = pte; @@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, } // Read the PteAddr part1 = ((((uint64_t)pte) >> 6) << 3); - part2 = offsetFunc(vaddr, 9, 0); + if (pte.f) { + // For F bit we want to use the blockFragmentSize in the previous + // PDE and the blockFragmentSize in this PTE for offset function. + part2 = offsetFunc(vaddr, + blockFragmentSize, + pde.blockFragmentSize); + } else { + part2 = offsetFunc(vaddr, 9, 0); + } nextRead = ((part1 + part2) << 3) & mask(48); DPRINTF(GPUPTWalker, "Got PDE0 entry %#016x. write:%s->%#016x va:%#016x\n", diff --git a/src/arch/amdgpu/vega/pagetable_walker.hh b/src/arch/amdgpu/vega/pagetable_walker.hh index 2ad0748c14..232be5de70 100644 --- a/src/arch/amdgpu/vega/pagetable_walker.hh +++ b/src/arch/amdgpu/vega/pagetable_walker.hh @@ -99,11 +99,13 @@ class Walker : public ClockedObject bool started; bool timing; PacketPtr tlbPkt; + int blockFragmentSize; public: WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false) : walker(_walker), state(Ready), nextState(Ready), dataSize(8), - enableNX(true), retrying(false), started(false), tlbPkt(pkt) + enableNX(true), retrying(false), started(false), tlbPkt(pkt), + blockFragmentSize(0) { DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n", this, walker, state);