diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 1b81c4d0b2..b25ffbf79f 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -466,7 +466,17 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) panic("Write to unkown queue type!"); } } else { - warn("Unknown doorbell offset: %lx\n", offset); + warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n", + offset); + + // We have to ACK the PCI packet immediately, so create a copy of the + // packet here to send again. + RequestPtr pending_req(pkt->req); + PacketPtr pending_pkt = Packet::createWrite(pending_req); + uint8_t *pending_data = new uint8_t[pkt->getSize()]; + pending_pkt->dataDynamic(pending_data); + + pendingDoorbellPkts.emplace(offset, pending_pkt); } } @@ -589,6 +599,17 @@ AMDGPUDevice::write(PacketPtr pkt) return pioDelay; } +void +AMDGPUDevice::processPendingDoorbells(uint32_t offset) +{ + if (pendingDoorbellPkts.count(offset)) { + DPRINTF(AMDGPUDevice, "Sending pending doorbell %x\n", offset); + writeDoorbell(pendingDoorbellPkts[offset], offset); + delete pendingDoorbellPkts[offset]; + pendingDoorbellPkts.erase(offset); + } +} + bool AMDGPUDevice::haveRegVal(uint32_t addr) { @@ -812,6 +833,14 @@ AMDGPUDevice::deallocateAllQueues() for (auto& it : sdmaEngs) { it.second->deallocateRLCQueues(); } + + // "All" queues implicitly refers to all user queues. User queues begin at + // doorbell address 0x4000, so unmap any queue at or above that address. + for (auto [offset, vmid] : doorbellVMIDMap) { + if (offset >= 0x4000) { + doorbells.erase(offset); + } + } } void diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index 7f69ec19f6..b6b6e2a81a 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -90,6 +90,7 @@ class AMDGPUDevice : public PciDevice using GPURegMap = std::unordered_map; GPURegMap regs; std::unordered_map doorbells; + std::unordered_map pendingDoorbellPkts; /** * VGA ROM methods @@ -187,6 +188,7 @@ class AMDGPUDevice : public PciDevice * Set handles to GPU blocks. */ void setDoorbellType(uint32_t offset, QueueType qt); + void processPendingDoorbells(uint32_t offset); void setSDMAEngine(Addr offset, SDMAEngine *eng); /** diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index fdb6f9d7ce..352af400b0 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -384,7 +384,10 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt) "Mapping mqd from %p %p (vmid %d - last vmid %d).\n", addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID()); - gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset, + // The doorbellOffset is a dword address. We shift by two / multiply + // by four to get the byte address to match doorbell addresses in + // the GPU device. + gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2, gpuDevice->lastVMID()); QueueDesc *mqd = new QueueDesc(); @@ -444,6 +447,8 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, " "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql); + + gpuDevice->processPendingDoorbells(offset); } void @@ -472,6 +477,8 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC); + + gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); } void @@ -576,6 +583,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt) gpuDevice->deallocatePasid(pkt->pasid); break; case 2: + panic("Unmapping queue selection 2 unimplemented\n"); break; case 3: { auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); diff --git a/src/dev/hsa/hsa_packet.hh b/src/dev/hsa/hsa_packet.hh index 8c7d694431..8eab8385a6 100644 --- a/src/dev/hsa/hsa_packet.hh +++ b/src/dev/hsa/hsa_packet.hh @@ -100,6 +100,14 @@ struct _hsa_barrier_or_packet_t uint64_t completion_signal; }; +struct _hsa_generic_vendor_pkt +{ + uint32_t padding[14]; + Addr completion_signal; +}; +// All HSA AQL packets are 64 bytes. Confirm that here. +static_assert(sizeof(_hsa_generic_vendor_pkt) == 64); + } // namespace gem5 #endif // __DEV_HSA_HSA_PACKET_HH__ diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index ecc5f1d98b..05c9a95eed 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -116,28 +116,52 @@ void GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) { - static int dynamic_task_id = 0; _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1))); /** - * we need to read a pointer in the application's address - * space to pull out the kernel code descriptor. + * Need to use a raw pointer for DmaVirtDevice API. This is deleted + * in the dispatchKernelObject method. */ - auto *tc = sys->threads[0]; - - TranslatingPortProxy fs_proxy(tc); - SETranslatingPortProxy se_proxy(tc); - PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy; + AMDKernelCode *akc = new AMDKernelCode; /** - * In full system mode, the page table entry may point to a system page - * or a device page. System pages use the proxy as normal, but a device - * page needs to be read from device memory. Check what type it is here. + * The kernel_object is a pointer to the machine code, whose entry + * point is an 'amd_kernel_code_t' type, which is included in the + * kernel binary, and describes various aspects of the kernel. The + * desired entry is the 'kernel_code_entry_byte_offset' field, + * which provides the byte offset (positive or negative) from the + * address of the amd_kernel_code_t to the start of the machine + * instructions. + * + * For SE mode we can read from the port proxy. In FS mode, we may need + * to wait for the guest OS to setup translations, especially when using + * the KVM CPU, so it is preferred to read the code object using a timing + * DMA request. */ - bool is_system_page = true; - Addr phys_addr = disp_pkt->kernel_object; - if (FullSystem) { + if (!FullSystem) { + /** + * we need to read a pointer in the application's address + * space to pull out the kernel code descriptor. + */ + auto *tc = sys->threads[0]; + SETranslatingPortProxy virt_proxy(tc); + + DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n"); + virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc, + sizeof(AMDKernelCode)); + + dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr); + } else { + /** + * In full system mode, the page table entry may point to a system + * page or a device page. System pages use the proxy as normal, but + * a device page needs to be read from device memory. Check what type + * it is here. + */ + bool is_system_page = true; + Addr phys_addr = disp_pkt->kernel_object; + /** * Full system currently only supports running on single VMID (one * virtual memory space), i.e., one application running on GPU at a @@ -149,61 +173,68 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), phys_addr, tmp_bytes, BaseMMU::Mode::Read, is_system_page); - } - DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n", - disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode), - is_system_page); + DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d" + " s:%d\n", disp_pkt->kernel_object, phys_addr, + sizeof(AMDKernelCode), is_system_page); - /** - * The kernel_object is a pointer to the machine code, whose entry - * point is an 'amd_kernel_code_t' type, which is included in the - * kernel binary, and describes various aspects of the kernel. The - * desired entry is the 'kernel_code_entry_byte_offset' field, - * which provides the byte offset (positive or negative) from the - * address of the amd_kernel_code_t to the start of the machine - * instructions. - */ - AMDKernelCode akc; - if (is_system_page) { - DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n"); - virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc, - sizeof(AMDKernelCode)); - } else { - assert(FullSystem); - DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n"); + /** + * System objects use DMA device. Device objects need to use device + * memory. + */ + if (is_system_page) { + DPRINTF(GPUCommandProc, + "sending system DMA read for kernel_object\n"); - // Read from GPU memory manager one cache line at a time to prevent - // rare cases where the AKC spans two memory pages. - ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), - system()->cacheLineSize()); - for (; !gen.done(); gen.next()) { - Addr chunk_addr = gen.addr(); - int vmid = 1; - unsigned dummy; - walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), - chunk_addr, dummy, BaseMMU::Mode::Read, - is_system_page); + auto dma_callback = new DmaVirtCallback( + [=](const uint32_t&) { + dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr); + }); - Request::Flags flags = Request::PHYSICAL; - RequestPtr request = std::make_shared(chunk_addr, - system()->cacheLineSize(), flags, walker->getDevRequestor()); - Packet *readPkt = new Packet(request, MemCmd::ReadReq); - readPkt->dataStatic((uint8_t *)&akc + gen.complete()); - system()->getDeviceMemory(readPkt)->access(readPkt); - delete readPkt; + dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode), + dma_callback, (void *)akc); + } else { + DPRINTF(GPUCommandProc, + "kernel_object in device, using device mem\n"); + + // Read from GPU memory manager one cache line at a time to prevent + // rare cases where the AKC spans two memory pages. + ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), + system()->cacheLineSize()); + for (; !gen.done(); gen.next()) { + Addr chunk_addr = gen.addr(); + int vmid = 1; + unsigned dummy; + walker->startFunctional( + gpuDevice->getVM().getPageTableBase(vmid), chunk_addr, + dummy, BaseMMU::Mode::Read, is_system_page); + + Request::Flags flags = Request::PHYSICAL; + RequestPtr request = std::make_shared(chunk_addr, + system()->cacheLineSize(), flags, + walker->getDevRequestor()); + Packet *readPkt = new Packet(request, MemCmd::ReadReq); + readPkt->dataStatic((uint8_t *)akc + gen.complete()); + system()->getDeviceMemory(readPkt)->access(readPkt); + delete readPkt; + } + + dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr); } } +} + +void +GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, + uint32_t queue_id, Addr host_pkt_addr) +{ + _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " - "kernel object\n", akc.kernel_code_entry_byte_offset); - - DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n", - (uint64_t)tc->cpuId()); - + "kernel object\n", akc->kernel_code_entry_byte_offset); Addr machine_code_addr = (Addr)disp_pkt->kernel_object - + akc.kernel_code_entry_byte_offset; + + akc->kernel_code_entry_byte_offset; DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", machine_code_addr); @@ -219,7 +250,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, * APUs to implement asynchronous memcopy operations from 2 pointers in * host memory. I have no idea what BLIT stands for. * */ - if (akc.runtime_loader_kernel_symbol) { + if (akc->runtime_loader_kernel_symbol) { kernel_name = "Some kernel"; } else { kernel_name = "Blit kernel"; @@ -230,7 +261,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion() : driver()->getGfxVersion(); HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, - dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr, + dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr, gfxVersion); DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " @@ -252,6 +283,8 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, // The driver expects the start time to be in ns Tick start_ts = curTick() / sim_clock::as_int::ns; dispatchStartTime.insert({disp_pkt->completion_signal, start_ts}); + + delete akc; } void @@ -473,18 +506,27 @@ GPUCommandProcessor::driver() */ /** - * TODO: For now we simply tell the HSAPP to finish the packet, - * however a future patch will update this method to provide - * the proper handling of any required vendor-specific packets. - * In the version of ROCm that is currently supported (1.6) - * the runtime will send packets that direct the CP to - * invalidate the GPUs caches. We do this automatically on - * each kernel launch in the CU, so this is safe for now. + * TODO: For now we simply tell the HSAPP to finish the packet and write a + * completion signal, if any. However, in the future proper handing may be + * required for vendor specific packets. + * + * In the version of ROCm that is currently supported the runtime will send + * packets that direct the CP to invalidate the GPU caches. We do this + * automatically on each kernel launch in the CU, so that situation is safe + * for now. */ void GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) { + auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt; + + if (vendor_pkt->completion_signal) { + sendCompletionSignal(vendor_pkt->completion_signal); + } + + warn("Ignoring vendor packet\n"); + hsaPP->finishPkt(raw_pkt, queue_id); } diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index f6783834eb..85b2a44494 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -99,6 +99,8 @@ class GPUCommandProcessor : public DmaVirtDevice Addr host_pkt_addr); void attachDriver(GPUComputeDriver *driver); + void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, + uint32_t queue_id, Addr host_pkt_addr); void dispatchPkt(HSAQueueEntry *task); void signalWakeupEvent(uint32_t event_id); @@ -149,6 +151,9 @@ class GPUCommandProcessor : public DmaVirtDevice HSAPacketProcessor *hsaPP; TranslationGenPtr translate(Addr vaddr, Addr size) override; + // Running counter of dispatched tasks + int dynamic_task_id = 0; + // Keep track of start times for task dispatches. std::unordered_map dispatchStartTime; diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index 4083c1c85a..84ae139127 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -70,8 +70,6 @@ class HSAQueueEntry _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x, (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y, (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}}, - numVgprs(akc->workitem_vgpr_count), - numSgprs(akc->wavefront_sgpr_count), _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt), _hostDispPktAddr(host_pkt_addr), _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt) @@ -88,40 +86,36 @@ class HSAQueueEntry _globalWgId(0), dispatchComplete(false) { - // Precompiled BLIT kernels actually violate the spec a bit - // and don't set many of the required akc fields. For these kernels, - // we need to rip register usage from the resource registers. - // - // We can't get an exact number of registers from the resource - // registers because they round, but we can get an upper bound on it. - // We determine the number of registers by solving for "vgprs_used" - // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html + // Use the resource descriptors to determine number of GPRs. This will + // round up in some cases, however the exact number field in the AMD + // kernel code struct is not backwards compatible and that field is + // not populated in newer compiles. The resource descriptor dword must + // be backwards compatible, so use that always. + // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html // #code-object-v3-kernel-descriptor + // // Currently, the only supported gfx version in gem5 that computes - // this differently is gfx90a. - if (!numVgprs) { - if (gfx_version == GfxVersion::gfx90a) { - numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; - } else { - numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; - } + // VGPR count differently is gfx90a. + if (gfx_version == GfxVersion::gfx90a) { + numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; + } else { + numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; } - if (!numSgprs || numSgprs == - std::numeric_limitswavefront_sgpr_count)>::max()) { - // Supported major generation numbers: 0 (BLIT kernels), 8, and 9 - uint16_t version = akc->amd_machine_version_major; - assert((version == 0) || (version == 8) || (version == 9)); - // SGPR allocation granularies: - // - GFX8: 8 - // - GFX9: 16 - // Source: https://llvm.org/docs/AMDGPUUsage.html - if ((version == 0) || (version == 8)) { - // We assume that BLIT kernels use the same granularity as GFX8 - numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; - } else if (version == 9) { - numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2; - } + // SGPR allocation granularies: + // - GFX8: 8 + // - GFX9: 16 + // Source: https://llvm.org/docs/.html + if (gfx_version == GfxVersion::gfx801 || + gfx_version == GfxVersion::gfx803) { + numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; + } else if (gfx_version == GfxVersion::gfx900 || + gfx_version == GfxVersion::gfx902 || + gfx_version == GfxVersion::gfx908 || + gfx_version == GfxVersion::gfx90a) { + numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2; + } else { + panic("Saw unknown gfx version setting up GPR counts\n"); } initialVgprState.reset();