From eee42275eeea15e4814b1f9df6709b3f69e87b22 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 18 Nov 2022 16:47:50 -0800 Subject: [PATCH] dev-amdgpu: Writeback RLC queue MQD when unmapped Currently, when RLC queues (user-mode queues) are mapped, the read/write pointers of the ring buffer are set to zero. However, these queues can be unmapped and then remapped later. In that situation the read/write pointers should be restored to the values they had before unmapping occurred. Since the read pointer gets reset to zero, the queue begins reading from the start of the ring, which usually contains older packets. There is a 99% chance those packets contain addresses which are no longer in the page tables, which will cause a page fault. To fix this, we update the MQD with the current read/write pointer values and then write the MQD back to memory when the queue is unmapped. This requires each queue to keep a pointer to its MQD and the host address to which the MQD should be written back. The interface for registering an RLC queue is also simplified: since the MQD must be passed anyway, the ring base, ring size, and rptr writeback address are now read from it instead of being passed as separate arguments. Fixes b+tree and streamcluster from Rodinia (when using RLC queues). 
Change-Id: Ie5dad4d7d90ea240c3e9f0cddf3e844a3cd34c4f Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65791 Tested-by: kokoro Maintainer: Matt Sinclair Reviewed-by: Matt Sinclair --- src/dev/amdgpu/pm4_packet_processor.cc | 4 +- src/dev/amdgpu/pm4_queues.hh | 24 +++++++++-- src/dev/amdgpu/sdma_engine.cc | 58 ++++++++++++++++++++++---- src/dev/amdgpu/sdma_engine.hh | 12 ++++-- 4 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index f78f8333a6..152fd4da73 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -458,9 +458,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, SDMAEngine *sdma_eng = gpuDevice->getSDMAById(pkt->engineSel - 2); // Register RLC queue with SDMA - sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, - mqd->rb_base << 8, rlc_size, - rptr_wb_addr); + sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, addr, mqd); // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); diff --git a/src/dev/amdgpu/pm4_queues.hh b/src/dev/amdgpu/pm4_queues.hh index 8b6626d176..ddadd6543b 100644 --- a/src/dev/amdgpu/pm4_queues.hh +++ b/src/dev/amdgpu/pm4_queues.hh @@ -33,6 +33,8 @@ #ifndef __DEV_AMDGPU_PM4_QUEUES_HH__ #define __DEV_AMDGPU_PM4_QUEUES_HH__ +#include "dev/amdgpu/pm4_defines.hh" + namespace gem5 { @@ -201,10 +203,24 @@ typedef struct GEM5_PACKED }; uint64_t rb_base; }; - uint32_t sdmax_rlcx_rb_rptr; - uint32_t sdmax_rlcx_rb_rptr_hi; - uint32_t sdmax_rlcx_rb_wptr; - uint32_t sdmax_rlcx_rb_wptr_hi; + union + { + struct + { + uint32_t sdmax_rlcx_rb_rptr; + uint32_t sdmax_rlcx_rb_rptr_hi; + }; + uint64_t rptr; + }; + union + { + struct + { + uint32_t sdmax_rlcx_rb_wptr; + uint32_t sdmax_rlcx_rb_wptr_hi; + }; + uint64_t wptr; + }; uint32_t sdmax_rlcx_rb_wptr_poll_cntl; uint32_t sdmax_rlcx_rb_rptr_addr_hi; uint32_t 
sdmax_rlcx_rb_rptr_addr_lo; diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 02203c8178..4c03bf57b2 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -165,30 +165,40 @@ SDMAEngine::translate(Addr vaddr, Addr size) } void -SDMAEngine::registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size, - Addr rptr_wb_addr) +SDMAEngine::registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd) { + uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1); + Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi; + rptr_wb_addr <<= 32; + rptr_wb_addr |= mqd->sdmax_rlcx_rb_rptr_addr_lo; + // Get first free RLC if (!rlc0.valid()) { DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC0\n", doorbell); rlcInfo[0] = doorbell; rlc0.valid(true); - rlc0.base(rb_base); + rlc0.base(mqd->rb_base << 8); + rlc0.size(rlc_size); rlc0.rptr(0); - rlc0.wptr(0); + rlc0.incRptr(mqd->rptr); + rlc0.setWptr(mqd->wptr); rlc0.rptrWbAddr(rptr_wb_addr); rlc0.processing(false); - rlc0.size(size); + rlc0.setMQD(mqd); + rlc0.setMQDAddr(mqdAddr); } else if (!rlc1.valid()) { DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC1\n", doorbell); rlcInfo[1] = doorbell; rlc1.valid(true); - rlc1.base(rb_base); + rlc1.base(mqd->rb_base << 8); + rlc1.size(rlc_size); rlc1.rptr(0); - rlc1.wptr(0); + rlc1.incRptr(mqd->rptr); + rlc1.setWptr(mqd->wptr); rlc1.rptrWbAddr(rptr_wb_addr); rlc1.processing(false); - rlc1.size(size); + rlc1.setMQD(mqd); + rlc1.setMQDAddr(mqdAddr); } else { panic("No free RLCs. 
Check they are properly unmapped."); } @@ -199,9 +209,37 @@ SDMAEngine::unregisterRLCQueue(Addr doorbell) { DPRINTF(SDMAEngine, "Unregistering RLC queue at %#lx\n", doorbell); if (rlcInfo[0] == doorbell) { + SDMAQueueDesc *mqd = rlc0.getMQD(); + if (mqd) { + DPRINTF(SDMAEngine, "Writing RLC0 SDMAMQD back to %#lx\n", + rlc0.getMQDAddr()); + + mqd->rptr = rlc0.globalRptr(); + mqd->wptr = rlc0.getWptr(); + + auto cb = new DmaVirtCallback( + [ = ] (const uint32_t &) { }); + dmaWriteVirt(rlc0.getMQDAddr(), sizeof(SDMAQueueDesc), cb, mqd); + } else { + warn("RLC0 SDMAMQD address invalid\n"); + } rlc0.valid(false); rlcInfo[0] = 0; } else if (rlcInfo[1] == doorbell) { + SDMAQueueDesc *mqd = rlc1.getMQD(); + if (mqd) { + DPRINTF(SDMAEngine, "Writing RLC1 SDMAMQD back to %#lx\n", + rlc1.getMQDAddr()); + + mqd->rptr = rlc1.globalRptr(); + mqd->wptr = rlc1.getWptr(); + + auto cb = new DmaVirtCallback( + [ = ] (const uint32_t &) { }); + dmaWriteVirt(rlc1.getMQDAddr(), sizeof(SDMAQueueDesc), cb, mqd); + } else { + warn("RLC1 SDMAMQD address invalid\n"); + } rlc1.valid(false); rlcInfo[1] = 0; } else { @@ -213,7 +251,9 @@ void SDMAEngine::deallocateRLCQueues() { for (auto doorbell: rlcInfo) { - unregisterRLCQueue(doorbell); + if (doorbell) { + unregisterRLCQueue(doorbell); + } } } diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index 0bfee126c9..27c169193b 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -34,6 +34,7 @@ #include "base/bitunion.hh" #include "dev/amdgpu/amdgpu_device.hh" +#include "dev/amdgpu/pm4_queues.hh" #include "dev/amdgpu/sdma_packets.hh" #include "dev/dma_virt_device.hh" #include "params/SDMAEngine.hh" @@ -65,9 +66,11 @@ class SDMAEngine : public DmaVirtDevice SDMAQueue *_parent; SDMAQueue *_ib; SDMAType _type; + SDMAQueueDesc *_mqd; + Addr _mqd_addr = 0; public: SDMAQueue() : _rptr(0), _wptr(0), _valid(false), _processing(false), - _parent(nullptr), _ib(nullptr), _type(SDMAGfx) {} + _parent(nullptr), 
_ib(nullptr), _type(SDMAGfx), _mqd(nullptr) {} Addr base() { return _base; } Addr rptr() { return _base + _rptr; } @@ -82,6 +85,8 @@ class SDMAEngine : public DmaVirtDevice SDMAQueue* parent() { return _parent; } SDMAQueue* ib() { return _ib; } SDMAType queueType() { return _type; } + SDMAQueueDesc* getMQD() { return _mqd; } + Addr getMQDAddr() { return _mqd_addr; } void base(Addr value) { _base = value; } @@ -114,6 +119,8 @@ class SDMAEngine : public DmaVirtDevice void parent(SDMAQueue* q) { _parent = q; } void ib(SDMAQueue* ib) { _ib = ib; } void queueType(SDMAType type) { _type = type; } + void setMQD(SDMAQueueDesc *mqd) { _mqd = mqd; } + void setMQDAddr(Addr mqdAddr) { _mqd_addr = mqdAddr; } }; /* SDMA Engine ID */ @@ -280,8 +287,7 @@ class SDMAEngine : public DmaVirtDevice /** * Methods for RLC queues */ - void registerRLCQueue(Addr doorbell, Addr rb_base, uint32_t size, - Addr rptr_wb_addr); + void registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd); void unregisterRLCQueue(Addr doorbell); void deallocateRLCQueues();