From 96a86780ee5746675add3d167b7928ace8c697d6 Mon Sep 17 00:00:00 2001 From: Matt Sinclair Date: Fri, 8 Oct 2021 00:51:39 -0500 Subject: [PATCH] dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues GFX7 (not supported in gem5) and GFX8 have a bug with how virtual addresses are calculated for their HSA queues. The ROCr component of ROCm solves this problem by doubling the HSA queue size that is requested, then mapping all virtual addresses in the second half of the queue to the same virtual addresses as the first half of the queue. This commit fixes gem5's support to mimic this behavior. Note that this change does not affect Vega's HSA queue support, because according to the ROCm documentation, Vega does not have the same problem as GCN3. Change-Id: I133cf1acc3a00a0baded0c4c3c2a25f39effdb51 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/51371 Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matthew Poremba --- src/dev/hsa/hsa_packet_processor.cc | 7 ++-- src/dev/hsa/hsa_packet_processor.hh | 50 +++++++++++++++++++++------ src/dev/hsa/hw_scheduler.cc | 5 +-- src/dev/hsa/hw_scheduler.hh | 4 ++- src/gpu-compute/gpu_compute_driver.cc | 2 +- 5 files changed, 51 insertions(+), 17 deletions(-) diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index 0427def4aa..22124b1a27 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -44,6 +44,7 @@ #include "dev/dma_device.hh" #include "dev/hsa/hsa_packet.hh" #include "dev/hsa/hw_scheduler.hh" +#include "enums/GfxVersion.hh" #include "gpu-compute/gpu_command_processor.hh" #include "mem/packet_access.hh" #include "mem/page_table.hh" @@ -100,13 +101,15 @@ void HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize) + uint32_t size, int doorbellSize, + GfxVersion gfxVersion) { DPRINTF(HSAPacketProcessor, "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__, (void *)basePointer, queue_id, size); hwSchdlr->registerNewQueue(hostReadIndexPointer, - basePointer, queue_id, size, doorbellSize); + basePointer, queue_id, size, doorbellSize, + gfxVersion); } AddrRangeList diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index 9545006cb5..aabe24e1a9 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -39,9 +39,11 @@ #include #include "base/types.hh" +#include "debug/HSAPacketProcessor.hh" #include "dev/dma_virt_device.hh" #include "dev/hsa/hsa.h" #include "dev/hsa/hsa_queue.hh" +#include "enums/GfxVersion.hh" #include "params/HSAPacketProcessor.hh" #include "sim/eventq.hh" @@ -84,14 +86,16 @@ class HSAQueueDescriptor uint64_t hostReadIndexPtr; bool stalledOnDmaBufAvailability; bool dmaInProgress; + GfxVersion gfxVersion; HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr, - uint64_t hri_ptr, uint32_t size) + uint64_t hri_ptr, uint32_t size, + GfxVersion gfxVersion) : basePointer(base_ptr), doorbellPointer(db_ptr), writeIndex(0), readIndex(0), numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr), stalledOnDmaBufAvailability(false), - dmaInProgress(false) + dmaInProgress(false), gfxVersion(gfxVersion) { } uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); } uint64_t spaceUsed() { return writeIndex - readIndex; } @@ -102,15 +106,38 @@ class HSAQueueDescriptor uint64_t ptr(uint64_t ix) { - /** - * Sometimes queues report that their size is 512k, which would - * indicate numElts of 0x2000. However, they only have 256k - * mapped which means any index over 0x1000 will fail an - * address translation. + /* + * Based on ROCm Documentation: + * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/ + 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ + rocr/src/core/runtime/amd_aql_queue.cpp#L99 + * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/ + 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ + rocr/src/core/runtime/amd_aql_queue.cpp#L624 + * + * GFX7 and GFX8 will allocate twice as much space for their HSA + * queues as they actually access (using mod operations to map the + * virtual addresses from the upper half of the queue to the same + * virtual addresses as the lower half). Thus, we need to check if + * the ISA is GFX8 and mod the address by half of the queue size if + * so. */ - assert(ix % numElts < 0x1000); - return basePointer + - ((ix % numElts) * objSize()); + uint64_t retAddr = 0ll; + if ((gfxVersion == GfxVersion::gfx801) || + (gfxVersion == GfxVersion::gfx803)) { + retAddr = basePointer + ((ix % (numElts/2)) * objSize()); + DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, " + "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, " + "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix, + numElts, numElts/2, objSize(), retAddr); + } else { + retAddr = basePointer + ((ix % numElts) * objSize()); + DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " + "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " + "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), + retAddr); + } + return retAddr; } }; @@ -325,7 +352,8 @@ class HSAPacketProcessor: public DmaVirtDevice void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize); + uint32_t size, int doorbellSize, + GfxVersion gfxVersion); void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize); void setDevice(GPUCommandProcessor * dev); void updateReadIndex(int, uint32_t); diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc index 48f4e694eb..f42dedef1c 100644 --- a/src/dev/hsa/hw_scheduler.cc +++ b/src/dev/hsa/hw_scheduler.cc @@ -87,7 +87,8 @@ void HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize) + uint32_t size, int doorbellSize, + GfxVersion gfxVersion) { assert(queue_id < MAX_ACTIVE_QUEUES); // Map queue ID to doorbell. @@ -108,7 +109,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer, HSAQueueDescriptor* q_desc = new HSAQueueDescriptor(basePointer, db_offset, - hostReadIndexPointer, size); + hostReadIndexPointer, size, gfxVersion); AQLRingBuffer* aql_buf = new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name()); QCntxt q_cntxt(q_desc, aql_buf); diff --git a/src/dev/hsa/hw_scheduler.hh b/src/dev/hsa/hw_scheduler.hh index b50273d4ce..8c043ec8ef 100644 --- a/src/dev/hsa/hw_scheduler.hh +++ b/src/dev/hsa/hw_scheduler.hh @@ -39,6 +39,7 @@ #include "base/types.hh" #include "dev/hsa/hsa_packet_processor.hh" +#include "enums/GfxVersion.hh" #include "sim/eventq.hh" // We allocate one PIO page for doorbells and each @@ -59,7 +60,8 @@ class HWScheduler void registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize); + uint32_t size, int doorbellSize, + GfxVersion gfxVersion); void unregisterQueue(uint64_t queue_id, int doorbellSize); void wakeup(); void schedWakeup(); diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index 3adfadd0ea..dd8e140ad8 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -173,7 +173,7 @@ GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf) auto &hsa_pp = device->hsaPacketProc(); hsa_pp.setDeviceQueueDesc(args->read_pointer_address, args->ring_base_address, args->queue_id, - args->ring_size, doorbellSize()); + args->ring_size, doorbellSize(), gfxVersion); args.copyOut(mem_proxy); }