From 3b35e73eb8e52261750aa8613ec9b4c77852759e Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 19 Jul 2023 15:26:02 -0500 Subject: [PATCH 1/3] dev-amdgpu: Implement SDMA constant fill This SDMA packet is much more common starting around ROCm 5.4. Previously this was mostly used to clear page tables after an application ended and was therefore left unimplemented. It is now used for basic operation like device memsets. This patch implements constant fill as it is now necessary. Change-Id: I9b2cf076ec17f5ed07c20bb820e7db0c082bbfbc --- src/dev/amdgpu/sdma_engine.cc | 71 ++++++++++++++++++++++++++++++++-- src/dev/amdgpu/sdma_engine.hh | 2 + src/dev/amdgpu/sdma_packets.hh | 19 ++++++++- 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index e99d694634..0202f583e6 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header) dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer); } break; case SDMA_OP_CONST_FILL: { - q->incRptr(sizeof(sdmaConstFill)); - warn("SDMA_OP_CONST_FILL not implemented"); - decodeNext(q); + DPRINTF(SDMAEngine, "SDMA Constant fill packet\n"); + dmaBuffer = new sdmaConstFill(); + cb = new DmaVirtCallback( + [ = ] (const uint64_t &) + { constFill(q, (sdmaConstFill *)dmaBuffer, header); }); + dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer); } break; case SDMA_OP_PTEPDE: { DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n"); @@ -1026,6 +1029,68 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt, decodeNext(q); } +void +SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header) +{ + q->incRptr(sizeof(sdmaConstFill)); + + sdmaConstFillHeader fill_header; + fill_header.ordinal = header; + + DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n", + pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize, + 
fill_header.sw);
+
+    // Count is number of elements - 1. Size is log2 of byte size.
+    int fill_bytes = (pkt->count + 1) * (1 << fill_header.fillsize);
+    uint8_t *fill_data = new uint8_t[fill_bytes];
+
+    // Replicate the full (1 << fillsize)-byte fill pattern. A plain memset
+    // would only broadcast the low byte of srcData, which is wrong for
+    // halfword/dword fill sizes. srcData is four bytes wide, so larger
+    // fill sizes are not supported.
+    assert(fill_header.fillsize <= 2);
+    uint8_t *pattern = reinterpret_cast<uint8_t *>(&pkt->srcData);
+    for (int i = 0; i < fill_bytes; ++i) {
+        fill_data[i] = pattern[i % (1 << fill_header.fillsize)];
+    }
+
+    Addr device_addr = getDeviceAddress(pkt->addr);
+    if (device_addr) {
+        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n",
+                fill_bytes, pkt->srcData, pkt->addr);
+
+        auto cb = new EventFunctionWrapper(
+            [ = ]{ constFillDone(q, pkt, fill_data); }, name());
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data,
+                                                 gen.size(), 0,
+                                                 gen.last() ? cb : nullptr);
+            fill_data += gen.size();
+        }
+    } else {
+        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
+                fill_bytes, pkt->srcData, pkt->addr);
+
+        auto cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { constFillDone(q, pkt, fill_data); });
+        dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data);
+    }
+}
+
+void
+SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
+{
+    DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
+
+    // fill_data was allocated with new[], so it must be released with
+    // delete[]; plain delete on an array pointer is undefined behavior.
+    delete [] fill_data;
+    delete pkt;
+    decodeNext(q);
+}
+
 AddrRangeList
 SDMAEngine::getAddrRanges() const
 {
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index bcbd497e8a..5abe63fcc6 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice
                     uint64_t *dmaBuffer);
     void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                     uint64_t *dmaBuffer);
+    void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header);
+    void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data);
 
     /**
      * Methods for getting SDMA MMIO base address and size.
These are set by diff --git a/src/dev/amdgpu/sdma_packets.hh b/src/dev/amdgpu/sdma_packets.hh index 52a47d3a2d..07d3f12600 100644 --- a/src/dev/amdgpu/sdma_packets.hh +++ b/src/dev/amdgpu/sdma_packets.hh @@ -37,7 +37,7 @@ namespace gem5 { /** - * SDMA packets + * SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime */ typedef struct GEM5_PACKED { @@ -80,6 +80,23 @@ typedef struct GEM5_PACKED } sdmaConstFill; static_assert(sizeof(sdmaConstFill) == 16); +typedef struct GEM5_PACKED +{ + union + { + struct + { + uint32_t op : 8; + uint32_t sub_op : 8; + uint32_t sw : 2; + uint32_t res0 : 12; + uint32_t fillsize : 2; + }; + uint32_t ordinal; + }; +} sdmaConstFillHeader; +static_assert(sizeof(sdmaConstFillHeader) == 4); + typedef struct GEM5_PACKED { uint32_t key0; From 3589a4c11fb72ee4cb6c1707ed08d9d5eec98d5c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 19 Jul 2023 15:39:51 -0500 Subject: [PATCH 2/3] arch-vega: Implement translate further Starting with ROCm 5.4+, MI100 and MI200 make use of the translate further bit in the page table. This bit enables mixing 4kiB and 2MiB pages and is functionally equivalent to mixing page sizes using the PDE.P bit for which gem5 currently has support. With PDE.P bit set, we stop walking and the page size is equal to the level in the page table we stopped at. For example, stopping at level 2 would be a 1GiB page, stopping at level 3 would be a 2MiB page. This assumes most pages are 4kiB. When the F bit is used, it is assumed most pages are 2MiB and we will stop walking at the 3rd level of the page table unless the F bit is set. When the F bit is set, the 2nd level PDE contains a block fragment size representing the page size of the next PDE in the form of 2^(12+size). If the next page has the F bit set we continue walking to the 4th level. The block fragment size is hardcoded to 9 in the driver therefore we assert that the block fragment size must be 0 or 9. This enables MI200 with ROCm 5.4+ in gem5. 
This functionality was determined by examining the driver source code in
Linux and there is no public documentation about this feature or why the
change is made in or around ROCm 5.4.

Change-Id: I603c0208cd9e821f7ad6eeb1d94ae15eaa146fb9
---
 src/arch/amdgpu/vega/pagetable_walker.cc | 31 ++++++++++++++++++++----
 src/arch/amdgpu/vega/pagetable_walker.hh |  4 ++-
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/arch/amdgpu/vega/pagetable_walker.cc b/src/arch/amdgpu/vega/pagetable_walker.cc
index ea4d6b0de9..6a71b14838 100644
--- a/src/arch/amdgpu/vega/pagetable_walker.cc
+++ b/src/arch/amdgpu/vega/pagetable_walker.cc
@@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
     Addr part2 = 0;
     PageDirectoryEntry pde = static_cast<PageDirectoryEntry>(pte);
 
-    // For a four level page table block fragment size should not be needed.
-    // For now issue a panic to prevent strange behavior if it is non-zero.
-    panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0");
+    // Block fragment size can change the size of the pages pointed to while
+    // moving to the next PDE. A value of 0 implies native page size. A
+    // non-zero value implies the next leaf in the page table is a PTE unless
+    // the F bit is set. If we see a non-zero value, set it here and print
+    // for debugging.
+ if (pde.blockFragmentSize) { + DPRINTF(GPUPTWalker, + "blockFragmentSize: %d, pde: %#016lx, state: %d\n", + pde.blockFragmentSize, pde, state); + blockFragmentSize = pde.blockFragmentSize; + + // At this time, only a value of 9 is used in the driver: + // https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/ + // amd/amdgpu/gmc_v9_0.c#L1165 + assert(pde.blockFragmentSize == 9); + } switch(state) { case PDE2: @@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, nextState = PDE0; break; case PDE0: - if (pde.p) { + if (pde.p || (blockFragmentSize && !pte.f)) { DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n", (uint64_t)pte, pte.fragment); entry.pte = pte; @@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead, } // Read the PteAddr part1 = ((((uint64_t)pte) >> 6) << 3); - part2 = offsetFunc(vaddr, 9, 0); + if (pte.f) { + // For F bit we want to use the blockFragmentSize in the previous + // PDE and the blockFragmentSize in this PTE for offset function. + part2 = offsetFunc(vaddr, + blockFragmentSize, + pde.blockFragmentSize); + } else { + part2 = offsetFunc(vaddr, 9, 0); + } nextRead = ((part1 + part2) << 3) & mask(48); DPRINTF(GPUPTWalker, "Got PDE0 entry %#016x. 
write:%s->%#016x va:%#016x\n", diff --git a/src/arch/amdgpu/vega/pagetable_walker.hh b/src/arch/amdgpu/vega/pagetable_walker.hh index 2ad0748c14..232be5de70 100644 --- a/src/arch/amdgpu/vega/pagetable_walker.hh +++ b/src/arch/amdgpu/vega/pagetable_walker.hh @@ -99,11 +99,13 @@ class Walker : public ClockedObject bool started; bool timing; PacketPtr tlbPkt; + int blockFragmentSize; public: WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false) : walker(_walker), state(Ready), nextState(Ready), dataSize(8), - enableNX(true), retrying(false), started(false), tlbPkt(pkt) + enableNX(true), retrying(false), started(false), tlbPkt(pkt), + blockFragmentSize(0) { DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n", this, walker, state); From f8490e4681e0b38f3cc8c12e195a915ef288e29b Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 19 Jul 2023 15:37:14 -0500 Subject: [PATCH 3/3] configs: Only require MMIO trace for Vega10 The MMIO trace contains register values for parts of the GPU that are not modeled in gem5, such as registers related to the graphics core. Since MI100 and MI200 do not have anything that is not modeled, the MMIO trace is not needed, therefore it does not need to be used or checked and the command line option goes away entirely for MI100/200. Change-Id: I23839db32b1b072bd44c8c977899a99347fc9687 --- configs/example/gpufs/runfs.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 5346622155..0f090e2f89 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -179,10 +179,15 @@ def runGpuFSSystem(args): math.ceil(float(n_cu) / args.cu_per_scalar_cache) ) - # Verify MMIO trace is valid - mmio_md5 = hashlib.md5(open(args.gpu_mmio_trace, "rb").read()).hexdigest() - if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d": - m5.util.panic("MMIO file does not match gem5 resources") + # Verify MMIO trace is valid. 
This is only needed for Vega10 simulations.
+    # The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
+    # the gem5-resources repository. By checking it here, we avoid potential
+    # errors that would cause the driver not to load and simulations to fail.
+    if args.gpu_device == "Vega10":
+        with open(args.gpu_mmio_trace, "rb") as mmio_file:
+            mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
+        if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
+            m5.util.panic("MMIO file does not match gem5 resources")
 
     system = makeGpuFSSystem(args)