diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 5346622155..0f090e2f89 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -179,10 +179,15 @@ def runGpuFSSystem(args): math.ceil(float(n_cu) / args.cu_per_scalar_cache) ) - # Verify MMIO trace is valid - mmio_md5 = hashlib.md5(open(args.gpu_mmio_trace, "rb").read()).hexdigest() - if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d": - m5.util.panic("MMIO file does not match gem5 resources") + # Verify MMIO trace is valid. This is only needed for Vega10 simulations. + # The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in + # the gem5-resources repository. By checking it here, we avoid potential + # errors that would cause the driver not to load and simulations to fail. + if args.gpu_device == "Vega10": + mmio_file = open(args.gpu_mmio_trace, "rb") + mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest() + if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d": + m5.util.panic("MMIO file does not match gem5 resources") system = makeGpuFSSystem(args) diff --git a/src/arch/amdgpu/vega/pagetable_walker.cc b/src/arch/amdgpu/vega/pagetable_walker.cc index ea4d6b0de9..6a71b14838 100644 --- a/src/arch/amdgpu/vega/pagetable_walker.cc +++ b/src/arch/amdgpu/vega/pagetable_walker.cc @@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, Addr part2 = 0; PageDirectoryEntry pde = static_cast(pte); - // For a four level page table block fragment size should not be needed. - // For now issue a panic to prevent strange behavior if it is non-zero. - panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0"); + // Block fragment size can change the size of the pages pointed to while + // moving to the next PDE. A value of 0 implies native page size. A + // non-zero value implies the next leaf in the page table is a PTE unless + // the F bit is set. If we see a non-zero value, set it here and print + // for debugging. + if (pde.blockFragmentSize) { + DPRINTF(GPUPTWalker, + "blockFragmentSize: %d, pde: %#016lx, state: %d\n", + pde.blockFragmentSize, pde, state); + blockFragmentSize = pde.blockFragmentSize; + + // At this time, only a value of 9 is used in the driver: + // https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/ + // amd/amdgpu/gmc_v9_0.c#L1165 + assert(pde.blockFragmentSize == 9); + } switch(state) { case PDE2: @@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, nextState = PDE0; break; case PDE0: - if (pde.p) { + if (pde.p || (blockFragmentSize && !pte.f)) { DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n", (uint64_t)pte, pte.fragment); entry.pte = pte; @@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, } // Read the PteAddr part1 = ((((uint64_t)pte) >> 6) << 3); - part2 = offsetFunc(vaddr, 9, 0); + if (pte.f) { + // For F bit we want to use the blockFragmentSize in the previous + // PDE and the blockFragmentSize in this PTE for offset function. + part2 = offsetFunc(vaddr, + blockFragmentSize, + pde.blockFragmentSize); + } else { + part2 = offsetFunc(vaddr, 9, 0); + } nextRead = ((part1 + part2) << 3) & mask(48); DPRINTF(GPUPTWalker, "Got PDE0 entry %#016x. write:%s->%#016x va:%#016x\n", diff --git a/src/arch/amdgpu/vega/pagetable_walker.hh b/src/arch/amdgpu/vega/pagetable_walker.hh index 2ad0748c14..232be5de70 100644 --- a/src/arch/amdgpu/vega/pagetable_walker.hh +++ b/src/arch/amdgpu/vega/pagetable_walker.hh @@ -99,11 +99,13 @@ class Walker : public ClockedObject bool started; bool timing; PacketPtr tlbPkt; + int blockFragmentSize; public: WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false) : walker(_walker), state(Ready), nextState(Ready), dataSize(8), - enableNX(true), retrying(false), started(false), tlbPkt(pkt) + enableNX(true), retrying(false), started(false), tlbPkt(pkt), + blockFragmentSize(0) { DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n", this, walker, state); diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index e99d694634..0202f583e6 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header) dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer); } break; case SDMA_OP_CONST_FILL: { - q->incRptr(sizeof(sdmaConstFill)); - warn("SDMA_OP_CONST_FILL not implemented"); - decodeNext(q); + DPRINTF(SDMAEngine, "SDMA Constant fill packet\n"); + dmaBuffer = new sdmaConstFill(); + cb = new DmaVirtCallback( + [ = ] (const uint64_t &) + { constFill(q, (sdmaConstFill *)dmaBuffer, header); }); + dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer); } break; case SDMA_OP_PTEPDE: { DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n"); @@ -1026,6 +1029,68 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt, decodeNext(q); } +void +SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header) +{ + q->incRptr(sizeof(sdmaConstFill)); + + sdmaConstFillHeader fill_header; + fill_header.ordinal = header; + + DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n", + pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize, + fill_header.sw); + + // Count is number of elements - 1. Size is log2 of byte size. + int fill_bytes = (pkt->count + 1) * (1 << fill_header.fillsize); + uint8_t *fill_data = new uint8_t[fill_bytes]; + + memset(fill_data, pkt->srcData, fill_bytes); + + Addr device_addr = getDeviceAddress(pkt->addr); + if (device_addr) { + DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n", + fill_bytes, pkt->srcData, pkt->addr); + + auto cb = new EventFunctionWrapper( + [ = ]{ constFillDone(q, pkt, fill_data); }, name()); + + // Copy the minimum page size at a time in case the physical addresses + // are not contiguous. + ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE); + for (; !gen.done(); gen.next()) { + Addr chunk_addr = getDeviceAddress(gen.addr()); + assert(chunk_addr); + + DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n", + gen.size(), gen.addr(), chunk_addr); + + gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data, + gen.size(), 0, + gen.last() ? cb : nullptr); + fill_data += gen.size(); + } + } else { + DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n", + fill_bytes, pkt->srcData, pkt->addr); + + auto cb = new DmaVirtCallback( + [ = ] (const uint64_t &) + { constFillDone(q, pkt, fill_data); }); + dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data); + } +} + +void +SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data) +{ + DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr); + + delete fill_data; + delete pkt; + decodeNext(q); +} + AddrRangeList SDMAEngine::getAddrRanges() const { diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index bcbd497e8a..5abe63fcc6 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice uint64_t *dmaBuffer); void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt, uint64_t *dmaBuffer); + void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header); + void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data); /** * Methods for getting SDMA MMIO base address and size. These are set by diff --git a/src/dev/amdgpu/sdma_packets.hh b/src/dev/amdgpu/sdma_packets.hh index 52a47d3a2d..07d3f12600 100644 --- a/src/dev/amdgpu/sdma_packets.hh +++ b/src/dev/amdgpu/sdma_packets.hh @@ -37,7 +37,7 @@ namespace gem5 { /** - * SDMA packets + * SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime */ typedef struct GEM5_PACKED { @@ -80,6 +80,23 @@ typedef struct GEM5_PACKED } sdmaConstFill; static_assert(sizeof(sdmaConstFill) == 16); +typedef struct GEM5_PACKED +{ + union + { + struct + { + uint32_t op : 8; + uint32_t sub_op : 8; + uint32_t sw : 2; + uint32_t res0 : 12; + uint32_t fillsize : 2; + }; + uint32_t ordinal; + }; +} sdmaConstFillHeader; +static_assert(sizeof(sdmaConstFillHeader) == 4); + typedef struct GEM5_PACKED { uint32_t key0;