From 3b35e73eb8e52261750aa8613ec9b4c77852759e Mon Sep 17 00:00:00 2001
From: Matthew Poremba
Date: Wed, 19 Jul 2023 15:26:02 -0500
Subject: [PATCH] dev-amdgpu: Implement SDMA constant fill

This SDMA packet is much more common starting around ROCm 5.4. Previously
this was mostly used to clear page tables after an application ended and
was therefore left unimplemented. It is now used for basic operation like
device memsets. This patch implements constant fill as it is now
necessary.

Change-Id: I9b2cf076ec17f5ed07c20bb820e7db0c082bbfbc
---
 src/dev/amdgpu/sdma_engine.cc  | 92 ++++++++++++++++++++++++++++++++--
 src/dev/amdgpu/sdma_engine.hh  |  2 +
 src/dev/amdgpu/sdma_packets.hh | 23 ++++++++-
 3 files changed, 113 insertions(+), 4 deletions(-)

diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index e99d694634..0202f583e6 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header)
         dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer);
         } break;
       case SDMA_OP_CONST_FILL: {
-        q->incRptr(sizeof(sdmaConstFill));
-        warn("SDMA_OP_CONST_FILL not implemented");
-        decodeNext(q);
+        DPRINTF(SDMAEngine, "SDMA Constant fill packet\n");
+        dmaBuffer = new sdmaConstFill();
+        cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { constFill(q, (sdmaConstFill *)dmaBuffer, header); });
+        dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer);
         } break;
       case SDMA_OP_PTEPDE: {
         DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n");
@@ -1026,6 +1029,89 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
     decodeNext(q);
 }

+/**
+ * Execute an SDMA constant fill packet. Advances the queue read pointer past
+ * the packet body, builds a buffer holding the fill value, and writes it to
+ * the destination through a device memory write or a host DMA write. The
+ * completion callback attached to the last write calls constFillDone(),
+ * which frees pkt and the fill buffer.
+ *
+ * @param q Queue the packet was read from.
+ * @param pkt Packet body allocated by decodeHeader().
+ * @param header Raw header dword, decoded for the fillsize and sw fields.
+ */
+void
+SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
+{
+    q->incRptr(sizeof(sdmaConstFill));
+
+    sdmaConstFillHeader fill_header;
+    fill_header.ordinal = header;
+
+    DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n",
+            pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize,
+            fill_header.sw);
+
+    // Count is number of elements - 1. Size is log2 of byte size.
+    int fill_bytes = (pkt->count + 1) * (1 << fill_header.fillsize);
+    uint8_t *fill_data = new uint8_t[fill_bytes];
+
+    // NOTE(review): memset() stores only the low byte of srcData in every
+    // byte, so a multi-byte pattern with differing bytes is not reproduced.
+    // Confirm that is acceptable for fillsize > 0.
+    memset(fill_data, pkt->srcData, fill_bytes);
+
+    Addr device_addr = getDeviceAddress(pkt->addr);
+    if (device_addr) {
+        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n",
+                fill_bytes, pkt->srcData, pkt->addr);
+
+        // The callback copies fill_data now, before the loop below advances
+        // it, so constFillDone() receives the start of the allocation.
+        auto cb = new EventFunctionWrapper(
+            [ = ]{ constFillDone(q, pkt, fill_data); }, name());
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data,
+                                                 gen.size(), 0,
+                                                 gen.last() ? cb : nullptr);
+            fill_data += gen.size();
+        }
+    } else {
+        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
+                fill_bytes, pkt->srcData, pkt->addr);
+
+        auto cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { constFillDone(q, pkt, fill_data); });
+        dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data);
+    }
+}
+
+/**
+ * Completion handler for constFill(). Frees the fill buffer and the packet
+ * body, then decodes the next packet in the queue.
+ */
+void
+SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
+{
+    DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
+
+    // fill_data comes from new[] in constFill(), so it needs delete [].
+    delete [] fill_data;
+    delete pkt;
+    decodeNext(q);
+}
+
 AddrRangeList
 SDMAEngine::getAddrRanges() const
 {
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index bcbd497e8a..5abe63fcc6 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice
                     uint64_t *dmaBuffer);
     void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                     uint64_t *dmaBuffer);
+    void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header);
+    void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data);

     /**
      * Methods for getting SDMA MMIO base address and size. These are set by
diff --git a/src/dev/amdgpu/sdma_packets.hh b/src/dev/amdgpu/sdma_packets.hh
index 52a47d3a2d..07d3f12600 100644
--- a/src/dev/amdgpu/sdma_packets.hh
+++ b/src/dev/amdgpu/sdma_packets.hh
@@ -37,7 +37,7 @@ namespace gem5
 {

 /**
- * SDMA packets
+ * SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime
  */
 typedef struct GEM5_PACKED
 {
@@ -80,6 +80,27 @@ typedef struct GEM5_PACKED
 }  sdmaConstFill;
 static_assert(sizeof(sdmaConstFill) == 16);

+/**
+ * Header dword of a constant fill packet. SDMAEngine::constFill() treats
+ * fillsize as log2 of the fill element size in bytes.
+ */
+typedef struct GEM5_PACKED
+{
+    union
+    {
+        struct
+        {
+            uint32_t op : 8;
+            uint32_t sub_op : 8;
+            uint32_t sw : 2;
+            uint32_t res0 : 12;
+            uint32_t fillsize : 2;
+        };
+        uint32_t ordinal;
+    };
+}  sdmaConstFillHeader;
+static_assert(sizeof(sdmaConstFillHeader) == 4);
+
 typedef struct GEM5_PACKED
 {
     uint32_t key0;