dev-amdgpu: Support for ROCm 5.4+ and MI200 (#141)

This commit is contained in:
Bobby R. Bruce
2023-07-31 10:24:46 -07:00
committed by GitHub
6 changed files with 126 additions and 14 deletions

View File

@@ -179,10 +179,15 @@ def runGpuFSSystem(args):
math.ceil(float(n_cu) / args.cu_per_scalar_cache)
)
# Verify MMIO trace is valid
mmio_md5 = hashlib.md5(open(args.gpu_mmio_trace, "rb").read()).hexdigest()
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
m5.util.panic("MMIO file does not match gem5 resources")
# Verify MMIO trace is valid. This is only needed for Vega10 simulations.
# The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
# the gem5-resources repository. By checking it here, we avoid potential
# errors that would cause the driver not to load and simulations to fail.
if args.gpu_device == "Vega10":
mmio_file = open(args.gpu_mmio_trace, "rb")
mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
m5.util.panic("MMIO file does not match gem5 resources")
system = makeGpuFSSystem(args)

View File

@@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
Addr part2 = 0;
PageDirectoryEntry pde = static_cast<PageDirectoryEntry>(pte);
// For a four level page table block fragment size should not be needed.
// For now issue a panic to prevent strange behavior if it is non-zero.
panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0");
// Block fragment size can change the size of the pages pointed to while
// moving to the next PDE. A value of 0 implies native page size. A
// non-zero value implies the next leaf in the page table is a PTE unless
// the F bit is set. If we see a non-zero value, set it here and print
// for debugging.
if (pde.blockFragmentSize) {
DPRINTF(GPUPTWalker,
"blockFragmentSize: %d, pde: %#016lx, state: %d\n",
pde.blockFragmentSize, pde, state);
blockFragmentSize = pde.blockFragmentSize;
// At this time, only a value of 9 is used in the driver:
// https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/
// amd/amdgpu/gmc_v9_0.c#L1165
assert(pde.blockFragmentSize == 9);
}
switch(state) {
case PDE2:
@@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
nextState = PDE0;
break;
case PDE0:
if (pde.p) {
if (pde.p || (blockFragmentSize && !pte.f)) {
DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n",
(uint64_t)pte, pte.fragment);
entry.pte = pte;
@@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
}
// Read the PteAddr
part1 = ((((uint64_t)pte) >> 6) << 3);
part2 = offsetFunc(vaddr, 9, 0);
if (pte.f) {
// For F bit we want to use the blockFragmentSize in the previous
// PDE and the blockFragmentSize in this PTE for offset function.
part2 = offsetFunc(vaddr,
blockFragmentSize,
pde.blockFragmentSize);
} else {
part2 = offsetFunc(vaddr, 9, 0);
}
nextRead = ((part1 + part2) << 3) & mask(48);
DPRINTF(GPUPTWalker,
"Got PDE0 entry %#016x. write:%s->%#016x va:%#016x\n",

View File

@@ -99,11 +99,13 @@ class Walker : public ClockedObject
bool started;
bool timing;
PacketPtr tlbPkt;
int blockFragmentSize;
public:
WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false)
: walker(_walker), state(Ready), nextState(Ready), dataSize(8),
enableNX(true), retrying(false), started(false), tlbPkt(pkt)
enableNX(true), retrying(false), started(false), tlbPkt(pkt),
blockFragmentSize(0)
{
DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n",
this, walker, state);

View File

@@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header)
dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer);
} break;
case SDMA_OP_CONST_FILL: {
q->incRptr(sizeof(sdmaConstFill));
warn("SDMA_OP_CONST_FILL not implemented");
decodeNext(q);
DPRINTF(SDMAEngine, "SDMA Constant fill packet\n");
dmaBuffer = new sdmaConstFill();
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ constFill(q, (sdmaConstFill *)dmaBuffer, header); });
dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer);
} break;
case SDMA_OP_PTEPDE: {
DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n");
@@ -1026,6 +1029,68 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
decodeNext(q);
}
void
SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
{
    q->incRptr(sizeof(sdmaConstFill));

    // The fill element size and swizzle bits live in the packet's header
    // ordinal, which was already consumed by decodeHeader.
    sdmaConstFillHeader fill_header;
    fill_header.ordinal = header;

    DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n",
            pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize,
            fill_header.sw);

    // Count is number of <size> elements - 1. Size is log2 of byte size.
    int elem_size = 1 << fill_header.fillsize;
    int fill_bytes = (pkt->count + 1) * elem_size;
    uint8_t *fill_data = new uint8_t[fill_bytes];

    // Replicate the fill pattern across the buffer. srcData carries up to
    // sizeof(srcData) pattern bytes; a plain memset would replicate only
    // the lowest byte, which is wrong whenever the pattern bytes differ.
    // Cap the pattern length at sizeof(srcData) so an element size larger
    // than the source field cannot read past it.
    int pattern_bytes = elem_size < int(sizeof(pkt->srcData)) ?
                        elem_size : int(sizeof(pkt->srcData));
    const uint8_t *pattern = reinterpret_cast<const uint8_t *>(&pkt->srcData);
    for (int i = 0; i < fill_bytes; ++i) {
        fill_data[i] = pattern[i % pattern_bytes];
    }

    Addr device_addr = getDeviceAddress(pkt->addr);
    if (device_addr) {
        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n",
                fill_bytes, pkt->srcData, pkt->addr);

        // Frees fill_data/pkt and decodes the next packet; attached only
        // to the final chunk below so it fires exactly once.
        auto cb = new EventFunctionWrapper(
            [ = ]{ constFillDone(q, pkt, fill_data); }, name());

        // Copy the minimum page size at a time in case the physical
        // addresses are not contiguous. Use a cursor so fill_data itself
        // (captured by the callback above) keeps pointing at the
        // allocation base for deletion.
        uint8_t *src = fill_data;
        ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
        for (; !gen.done(); gen.next()) {
            Addr chunk_addr = getDeviceAddress(gen.addr());
            assert(chunk_addr);

            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
                    gen.size(), gen.addr(), chunk_addr);

            gpuDevice->getMemMgr()->writeRequest(chunk_addr, src,
                                                 gen.size(), 0,
                                                 gen.last() ? cb : nullptr);
            src += gen.size();
        }
    } else {
        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
                fill_bytes, pkt->srcData, pkt->addr);

        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &)
                { constFillDone(q, pkt, fill_data); });
        dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data);
    }
}
void
SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
{
    DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);

    // fill_data was allocated with new[] in constFill, so it must be
    // released with delete[]; a scalar delete on an array allocation is
    // undefined behavior.
    delete [] fill_data;
    delete pkt;
    decodeNext(q);
}
AddrRangeList
SDMAEngine::getAddrRanges() const
{

View File

@@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice
uint64_t *dmaBuffer);
void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
uint64_t *dmaBuffer);
void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header);
void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data);
/**
* Methods for getting SDMA MMIO base address and size. These are set by

View File

@@ -37,7 +37,7 @@ namespace gem5
{
/**
* SDMA packets
* SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime
*/
typedef struct GEM5_PACKED
{
@@ -80,6 +80,23 @@ typedef struct GEM5_PACKED
} sdmaConstFill;
static_assert(sizeof(sdmaConstFill) == 16);
/**
 * Header DW (first 32-bit ordinal) of the SDMA constant fill packet.
 * The union overlays named bitfields on the raw header word so the fill
 * element size and swizzle can be extracted after assigning to ordinal.
 * Field layout presumably mirrors SDMA_PKT_CONSTANT_FILL in ROCR-Runtime's
 * sdma_registers.h — verify against that header if extending.
 */
typedef struct GEM5_PACKED
{
    union
    {
        struct
        {
            uint32_t op : 8;       // packet opcode — presumably matches
                                   // SDMA_OP_CONST_FILL; confirm vs. ROCR
            uint32_t sub_op : 8;   // sub-opcode (unused by gem5's decoder)
            uint32_t sw : 2;       // swizzle bits; only printed for debug
            uint32_t res0 : 12;    // reserved
            uint32_t fillsize : 2; // log2 of the fill element byte size
        };
        uint32_t ordinal;          // raw header DW as read from the queue
    };
} sdmaConstFillHeader;
static_assert(sizeof(sdmaConstFillHeader) == 4);
typedef struct GEM5_PACKED
{
uint32_t key0;