From 37da1c45f328e45fc1e07ea55197742bae007d7d Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 13 Oct 2023 14:59:56 -0500 Subject: [PATCH] dev-amdgpu: Better handling for queue remapping The amdgpu driver can, at *any* time, tell the device to unmap a queue to force the queue descriptor to be written back to main memory in the form of a memory queue descriptor (MQD). It will then immediately remap the queue and continue writing the doorbell to the queue. It is possible that the doorbell write occurs after the queue is unmapped but before it is remapped. In this situation, we need to check the updated value of the doorbell for the queue and write that to the queue after it is mapped. To handle this, a pending doorbell packet map is created to hold a packet to replay when the queue is mapped. Because PCI in gem5 implements only the atomic protocol port, we cannot use the original packet as it must respond in the same Tick. This patch fixes issues with the doorbell maps not being cleared on unmapping, to ensure the doorbell is not found in writeDoorbell and is instead placed in the pending doorbell map. This includes fixing the doorbell offset value in the doorbell to VMID map, which is now multiplied by four as it is a dword address. This was tested using tensorflow 2.0's MNIST example which was seeing this issue consistently. With this patch it now makes progress and does issue pending doorbell writes. 
Change-Id: Ic6b401d3fe7fc46b7bcbf19a769cdea6814e7d1e --- src/dev/amdgpu/amdgpu_device.cc | 31 +++++++++++++++++++++++++- src/dev/amdgpu/amdgpu_device.hh | 2 ++ src/dev/amdgpu/pm4_packet_processor.cc | 10 ++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 1b81c4d0b2..b25ffbf79f 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -466,7 +466,17 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) panic("Write to unkown queue type!"); } } else { - warn("Unknown doorbell offset: %lx\n", offset); + warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n", + offset); + + // We have to ACK the PCI packet immediately, so create a copy of the + // packet here to send again. + RequestPtr pending_req(pkt->req); + PacketPtr pending_pkt = Packet::createWrite(pending_req); + uint8_t *pending_data = new uint8_t[pkt->getSize()]; + pending_pkt->dataDynamic(pending_data); + + pendingDoorbellPkts.emplace(offset, pending_pkt); } } @@ -589,6 +599,17 @@ AMDGPUDevice::write(PacketPtr pkt) return pioDelay; } +void +AMDGPUDevice::processPendingDoorbells(uint32_t offset) +{ + if (pendingDoorbellPkts.count(offset)) { + DPRINTF(AMDGPUDevice, "Sending pending doorbell %x\n", offset); + writeDoorbell(pendingDoorbellPkts[offset], offset); + delete pendingDoorbellPkts[offset]; + pendingDoorbellPkts.erase(offset); + } +} + bool AMDGPUDevice::haveRegVal(uint32_t addr) { @@ -812,6 +833,14 @@ AMDGPUDevice::deallocateAllQueues() for (auto& it : sdmaEngs) { it.second->deallocateRLCQueues(); } + + // "All" queues implicitly refers to all user queues. User queues begin at + // doorbell address 0x4000, so unmap any queue at or above that address. 
+ for (auto [offset, vmid] : doorbellVMIDMap) { + if (offset >= 0x4000) { + doorbells.erase(offset); + } + } } void diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index 7f69ec19f6..b6b6e2a81a 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -90,6 +90,7 @@ class AMDGPUDevice : public PciDevice using GPURegMap = std::unordered_map; GPURegMap regs; std::unordered_map doorbells; + std::unordered_map pendingDoorbellPkts; /** * VGA ROM methods @@ -187,6 +188,7 @@ class AMDGPUDevice : public PciDevice * Set handles to GPU blocks. */ void setDoorbellType(uint32_t offset, QueueType qt); + void processPendingDoorbells(uint32_t offset); void setSDMAEngine(Addr offset, SDMAEngine *eng); /** diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index fdb6f9d7ce..352af400b0 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -384,7 +384,10 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt) "Mapping mqd from %p %p (vmid %d - last vmid %d).\n", addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID()); - gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset, + // The doorbellOffset is a dword address. We shift by two / multiply + // by four to get the byte address to match doorbell addresses in + // the GPU device. 
+ gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2, gpuDevice->lastVMID()); QueueDesc *mqd = new QueueDesc(); @@ -444,6 +447,8 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, " "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql); + + gpuDevice->processPendingDoorbells(offset); } void @@ -472,6 +477,8 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC); + + gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); } void @@ -576,6 +583,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt) gpuDevice->deallocatePasid(pkt->pasid); break; case 2: + panic("Unmapping queue selection 2 unimplemented\n"); break; case 3: { auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();