From 21f1e54ecd6bc6f22afee5a0d81f6f8b43751e7c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 19 Aug 2024 17:27:32 -0700 Subject: [PATCH] dev-amdgpu: Implement UNMAP_QUEUES queue_sel==2 Unmap queues with queue_sel of 2 unmaps all queues while queue_sel of 3 unmaps all non-static queues. The implementation of 3 was actually correct for 2. Static queues are queues which were mapped using a map queues packet with a queue_type of 2 or 3. This commit adds the ability to mark a queue as static. When unmap queues with queue_sel of 2 is sent, the existing code is now executed. With a value of 3, we now check if the queue was marked static and do not unmap it if marked. Change-Id: I87d7cf78a0600c7baa516c01f42c294d3c4e90c5 --- src/dev/amdgpu/amdgpu_device.cc | 4 +- src/dev/amdgpu/amdgpu_device.hh | 2 +- src/dev/amdgpu/pm4_packet_processor.cc | 91 ++++++++++++++++---------- src/dev/amdgpu/pm4_packet_processor.hh | 2 + src/dev/amdgpu/pm4_queues.hh | 6 +- src/dev/amdgpu/sdma_engine.cc | 24 +++++-- src/dev/amdgpu/sdma_engine.hh | 10 ++- 7 files changed, 94 insertions(+), 45 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index fc977a2de0..f8ecad3805 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -943,13 +943,13 @@ AMDGPUDevice::deallocatePasid(uint16_t pasid) } void -AMDGPUDevice::deallocateAllQueues() +AMDGPUDevice::deallocateAllQueues(bool unmap_static) { idMap.erase(idMap.begin(), idMap.end()); usedVMIDs.erase(usedVMIDs.begin(), usedVMIDs.end()); for (auto& it : sdmaEngs) { - it.second->deallocateRLCQueues(); + it.second->deallocateRLCQueues(unmap_static); } // "All" queues implicitly refers to all user queues.
User queues begin at diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index 83b79a1d05..cae48f0de0 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -217,7 +217,7 @@ class AMDGPUDevice : public PciDevice uint16_t allocateVMID(uint16_t pasid); void deallocateVmid(uint16_t vmid); void deallocatePasid(uint16_t pasid); - void deallocateAllQueues(); + void deallocateAllQueues(bool unmap_static); void mapDoorbellToVMID(Addr doorbell, uint16_t vmid); uint16_t getVMID(Addr doorbell) { return doorbellVMIDMap[doorbell]; } std::unordered_map>& getUsedVMIDs(); diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 9a8ba13914..a550c0f282 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -518,8 +518,11 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, assert(pkt->engineSel == 2 || pkt->engineSel == 3); SDMAEngine *sdma_eng = gpuDevice->getSDMAById(pkt->engineSel - 2); + // Queue types 2 and 3 are "static" queues + bool is_static = (pkt->queueType == 2) || (pkt->queueType == 3); + // Register RLC queue with SDMA - sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, addr, mqd); + sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, addr, mqd, is_static); // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); @@ -586,6 +589,47 @@ PM4PacketProcessor::updateReadIndex(Addr offset, uint64_t rd_idx) queuesMap[offset]->getMQD()->mqdReadIndex = rd_idx; } +void +PM4PacketProcessor::unmapAllQueues(bool unmap_static) +{ + auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); + for (auto iter : gpuDevice->getUsedVMIDs()) { + for (auto id : iter.second) { + assert(queues.count(id)); + + // Do not unmap KMD queues. + if (queues[id]->privileged()) { + continue; + } + + // Do not unmap static queues if requested.
+ if (!unmap_static && queues[id]->isStatic()) { + continue; + } + + QueueDesc *mqd = queues[id]->getMQD(); + DPRINTF(PM4PacketProcessor, "Unmapping queue %d with read " + "index %ld\n", id, mqd->mqdReadIndex); + + // Partially writing the mqd with an offset of 96 dwords as gem5 + // does not use the full MQD and begins 96 dwords from the start + // of the full MQD structure. See src/dev/amdgpu/pm4_queues.hh. + Addr addr = getGARTAddr(queues[id]->mqdBase() + + 96 * sizeof(uint32_t)); + Addr mqd_base = queues[id]->mqdBase(); + auto cb = new DmaVirtCallback( + [ = ] (const uint32_t &) { + doneMQDWrite(mqd_base, addr); + }); + mqd->base >>= 8; + dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd); + queues.erase(id); + hsa_pp.unsetDeviceQueueDesc(id, 8); + delete mqd; + } + } +} + void PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt) { @@ -634,38 +678,13 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt) gpuDevice->deallocatePasid(pkt->pasid); break; case 2: - panic("Unmapping queue selection 2 unimplemented\n"); + unmapAllQueues(true); + gpuDevice->deallocateAllQueues(true); break; - case 3: { - auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); - for (auto iter : gpuDevice->getUsedVMIDs()) { - for (auto id : iter.second) { - assert(queues.count(id)); - - // Do not unmap KMD queues - if (queues[id]->privileged()) { - continue; - } - QueueDesc *mqd = queues[id]->getMQD(); - DPRINTF(PM4PacketProcessor, "Unmapping queue %d with read " - "index %ld\n", id, mqd->mqdReadIndex); - // Partially writing the mqd with an offset of 96 dwords - Addr addr = getGARTAddr(queues[id]->mqdBase() + - 96 * sizeof(uint32_t)); - Addr mqd_base = queues[id]->mqdBase(); - auto cb = new DmaVirtCallback( - [ = ] (const uint32_t &) { - doneMQDWrite(mqd_base, addr); - }); - mqd->base >>= 8; - dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd); - queues.erase(id); - hsa_pp.unsetDeviceQueueDesc(id, 8); - delete mqd; - } - } - gpuDevice->deallocateAllQueues(); - } break; + 
case 3: + unmapAllQueues(false); + gpuDevice->deallocateAllQueues(false); + break; default: panic("Unrecognized options\n"); break; @@ -1127,6 +1146,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const uint32_t pipe[num_queues]; uint32_t queue[num_queues]; bool privileged[num_queues]; + uint32_t queue_type[num_queues]; uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; @@ -1157,6 +1177,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const pipe[i] = q->pipe(); queue[i] = q->queue(); privileged[i] = q->privileged(); + queue_type[i] = q->queueType(); hqd_active[i] = q->getMQD()->hqd_active; hqd_vmid[i] = q->getMQD()->hqd_vmid; aql_rptr[i] = q->getMQD()->aqlRptr; @@ -1183,6 +1204,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_ARRAY(pipe, num_queues); SERIALIZE_ARRAY(queue, num_queues); SERIALIZE_ARRAY(privileged, num_queues); + SERIALIZE_ARRAY(queue_type, num_queues); SERIALIZE_ARRAY(hqd_active, num_queues); SERIALIZE_ARRAY(hqd_vmid, num_queues); SERIALIZE_ARRAY(aql_rptr, num_queues); @@ -1216,6 +1238,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) uint32_t pipe[num_queues]; uint32_t queue[num_queues]; bool privileged[num_queues]; + uint32_t queue_type[num_queues]; uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; @@ -1239,6 +1262,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) UNSERIALIZE_ARRAY(pipe, num_queues); UNSERIALIZE_ARRAY(queue, num_queues); UNSERIALIZE_ARRAY(privileged, num_queues); + UNSERIALIZE_ARRAY(queue_type, num_queues); UNSERIALIZE_ARRAY(hqd_active, num_queues); UNSERIALIZE_ARRAY(hqd_vmid, num_queues); UNSERIALIZE_ARRAY(aql_rptr, num_queues); @@ -1269,7 +1293,8 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) queues[id[i]]->ib(ib[i]); queues[id[i]]->offset(offset[i]); queues[id[i]]->processing(processing[i]); - queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]); + queues[id[i]]->setPkt(me[i], pipe[i], 
queue[i], privileged[i], + queue_type[i]); queues[id[i]]->getMQD()->hqd_active = hqd_active[i]; queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i]; queues[id[i]]->getMQD()->aqlRptr = aql_rptr[i]; diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 71271415fd..96046861ec 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -67,6 +67,8 @@ class PM4PacketProcessor : public DmaVirtDevice int _ipId; AddrRange _mmioRange; + void unmapAllQueues(bool unmap_static); + public: PM4PacketProcessor(const PM4PacketProcessorParams &p); diff --git a/src/dev/amdgpu/pm4_queues.hh b/src/dev/amdgpu/pm4_queues.hh index 9c99e10ce3..f3887fa020 100644 --- a/src/dev/amdgpu/pm4_queues.hh +++ b/src/dev/amdgpu/pm4_queues.hh @@ -486,12 +486,16 @@ class PM4Queue uint32_t pipe() { return _pkt.pipe; } uint32_t queue() { return _pkt.queueSlot; } bool privileged() { return _pkt.queueSel == 0 ? 1 : 0; } + uint32_t queueType() { return _pkt.queueType; } + bool isStatic() { return (_pkt.queueType != 0); } PM4MapQueues* getPkt() { return &_pkt; } - void setPkt(uint32_t me, uint32_t pipe, uint32_t queue, bool privileged) { + void setPkt(uint32_t me, uint32_t pipe, uint32_t queue, bool privileged, + uint32_t queueType) { _pkt.me = me - 1; _pkt.pipe = pipe; _pkt.queueSlot = queue; _pkt.queueSel = (privileged == 0) ? 1 : 0; + _pkt.queueType = queueType; } // Same computation as processMQD. See comment there for details. 
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 6955837a09..a83b97d1a0 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -179,7 +179,8 @@ SDMAEngine::translate(Addr vaddr, Addr size) } void -SDMAEngine::registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd) +SDMAEngine::registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd, + bool isStatic) { uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1); Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi; @@ -202,6 +203,7 @@ SDMAEngine::registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd) rlc0.setMQD(mqd); rlc0.setMQDAddr(mqdAddr); rlc0.setPriv(priv); + rlc0.setStatic(isStatic); } else if (!rlc1.valid()) { DPRINTF(SDMAEngine, "Doorbell %lx mapped to RLC1\n", doorbell); rlcInfo[1] = doorbell; @@ -216,16 +218,22 @@ SDMAEngine::registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd) rlc1.setMQD(mqd); rlc1.setMQDAddr(mqdAddr); rlc1.setPriv(priv); + rlc1.setStatic(isStatic); } else { panic("No free RLCs. Check they are properly unmapped."); } } void -SDMAEngine::unregisterRLCQueue(Addr doorbell) +SDMAEngine::unregisterRLCQueue(Addr doorbell, bool unmap_static) { DPRINTF(SDMAEngine, "Unregistering RLC queue at %#lx\n", doorbell); if (rlcInfo[0] == doorbell) { + if (!unmap_static && rlc0.isStatic()) { + DPRINTF(SDMAEngine, "RLC0 is static. Will not unregister.\n"); + return; + } + SDMAQueueDesc *mqd = rlc0.getMQD(); if (mqd) { DPRINTF(SDMAEngine, "Writing RLC0 SDMAMQD back to %#lx\n", @@ -243,6 +251,11 @@ SDMAEngine::unregisterRLCQueue(Addr doorbell) rlc0.valid(false); rlcInfo[0] = 0; } else if (rlcInfo[1] == doorbell) { + if (!unmap_static && rlc1.isStatic()) { + DPRINTF(SDMAEngine, "RLC1 is static. 
Will not unregister.\n"); + return; + } + SDMAQueueDesc *mqd = rlc1.getMQD(); if (mqd) { DPRINTF(SDMAEngine, "Writing RLC1 SDMAMQD back to %#lx\n", @@ -262,15 +275,16 @@ SDMAEngine::unregisterRLCQueue(Addr doorbell) } else { panic("Cannot unregister: no RLC queue at %#lx\n", doorbell); } + + gpuDevice->unsetDoorbell(doorbell); } void -SDMAEngine::deallocateRLCQueues() +SDMAEngine::deallocateRLCQueues(bool unmap_static) { for (auto doorbell: rlcInfo) { if (doorbell) { - unregisterRLCQueue(doorbell); - gpuDevice->unsetDoorbell(doorbell); + unregisterRLCQueue(doorbell, unmap_static); } } } diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index d5fe646fac..b0391597ec 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -69,6 +69,7 @@ class SDMAEngine : public DmaVirtDevice SDMAQueueDesc *_mqd; Addr _mqd_addr = 0; bool _priv = true; // Only used for RLC queues. True otherwise. + bool _static = false; public: SDMAQueue() : _rptr(0), _wptr(0), _valid(false), _processing(false), _parent(nullptr), _ib(nullptr), _type(SDMAGfx), _mqd(nullptr) {} @@ -89,6 +90,7 @@ class SDMAEngine : public DmaVirtDevice SDMAQueueDesc* getMQD() { return _mqd; } Addr getMQDAddr() { return _mqd_addr; } bool priv() { return _priv; } + bool isStatic() { return _static; } void base(Addr value) { _base = value; } @@ -124,6 +126,7 @@ class SDMAEngine : public DmaVirtDevice void setMQD(SDMAQueueDesc *mqd) { _mqd = mqd; } void setMQDAddr(Addr mqdAddr) { _mqd_addr = mqdAddr; } void setPriv(bool priv) { _priv = priv; } + void setStatic(bool isStatic) { _static = isStatic; } }; /* SDMA Engine ID */ @@ -307,9 +310,10 @@ class SDMAEngine : public DmaVirtDevice /** * Methods for RLC queues */ - void registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd); - void unregisterRLCQueue(Addr doorbell); - void deallocateRLCQueues(); + void registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd, + bool isStatic); + void unregisterRLCQueue(Addr 
doorbell, bool unmap_static); + void deallocateRLCQueues(bool unmap_static); int cur_vmid = 0; };