diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index 0427def4aa..22124b1a27 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -44,6 +44,7 @@ #include "dev/dma_device.hh" #include "dev/hsa/hsa_packet.hh" #include "dev/hsa/hw_scheduler.hh" +#include "enums/GfxVersion.hh" #include "gpu-compute/gpu_command_processor.hh" #include "mem/packet_access.hh" #include "mem/page_table.hh" @@ -100,13 +101,15 @@ void HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize) + uint32_t size, int doorbellSize, + GfxVersion gfxVersion) { DPRINTF(HSAPacketProcessor, "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__, (void *)basePointer, queue_id, size); hwSchdlr->registerNewQueue(hostReadIndexPointer, - basePointer, queue_id, size, doorbellSize); + basePointer, queue_id, size, doorbellSize, + gfxVersion); } AddrRangeList diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index 9545006cb5..aabe24e1a9 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -39,9 +39,11 @@ #include #include "base/types.hh" +#include "debug/HSAPacketProcessor.hh" #include "dev/dma_virt_device.hh" #include "dev/hsa/hsa.h" #include "dev/hsa/hsa_queue.hh" +#include "enums/GfxVersion.hh" #include "params/HSAPacketProcessor.hh" #include "sim/eventq.hh" @@ -84,14 +86,16 @@ class HSAQueueDescriptor uint64_t hostReadIndexPtr; bool stalledOnDmaBufAvailability; bool dmaInProgress; + GfxVersion gfxVersion; HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr, - uint64_t hri_ptr, uint32_t size) + uint64_t hri_ptr, uint32_t size, + GfxVersion gfxVersion) : basePointer(base_ptr), doorbellPointer(db_ptr), writeIndex(0), readIndex(0), numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr), stalledOnDmaBufAvailability(false), - dmaInProgress(false) + dmaInProgress(false), gfxVersion(gfxVersion) { } uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); } uint64_t spaceUsed() { return writeIndex - readIndex; } @@ -102,15 +106,38 @@ class HSAQueueDescriptor uint64_t ptr(uint64_t ix) { - /** - * Sometimes queues report that their size is 512k, which would - * indicate numElts of 0x2000. However, they only have 256k - * mapped which means any index over 0x1000 will fail an - * address translation. + /* + * Based on ROCm Documentation: + * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/ + 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ + rocr/src/core/runtime/amd_aql_queue.cpp#L99 + * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/ + 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ + rocr/src/core/runtime/amd_aql_queue.cpp#L624 + * + * GFX7 and GFX8 will allocate twice as much space for their HSA + * queues as they actually access (using mod operations to map the + * virtual addresses from the upper half of the queue to the same + * virtual addresses as the lower half). Thus, we need to check if + * the ISA is GFX8 and mod the address by half of the queue size if + * so. */ - assert(ix % numElts < 0x1000); - return basePointer + - ((ix % numElts) * objSize()); + uint64_t retAddr = 0ll; + if ((gfxVersion == GfxVersion::gfx801) || + (gfxVersion == GfxVersion::gfx803)) { + retAddr = basePointer + ((ix % (numElts/2)) * objSize()); + DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, " + "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, " + "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix, + numElts, numElts/2, objSize(), retAddr); + } else { + retAddr = basePointer + ((ix % numElts) * objSize()); + DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " + "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " + "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), + retAddr); + } + return retAddr; } }; @@ -325,7 +352,8 @@ class HSAPacketProcessor: public DmaVirtDevice void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize); + uint32_t size, int doorbellSize, + GfxVersion gfxVersion); void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize); void setDevice(GPUCommandProcessor * dev); void updateReadIndex(int, uint32_t); diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc index 48f4e694eb..f42dedef1c 100644 --- a/src/dev/hsa/hw_scheduler.cc +++ b/src/dev/hsa/hw_scheduler.cc @@ -87,7 +87,8 @@ void HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize) + uint32_t size, int doorbellSize, + GfxVersion gfxVersion) { assert(queue_id < MAX_ACTIVE_QUEUES); // Map queue ID to doorbell. @@ -108,7 +109,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer, HSAQueueDescriptor* q_desc = new HSAQueueDescriptor(basePointer, db_offset, - hostReadIndexPointer, size); + hostReadIndexPointer, size, gfxVersion); AQLRingBuffer* aql_buf = new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name()); QCntxt q_cntxt(q_desc, aql_buf); diff --git a/src/dev/hsa/hw_scheduler.hh b/src/dev/hsa/hw_scheduler.hh index b50273d4ce..8c043ec8ef 100644 --- a/src/dev/hsa/hw_scheduler.hh +++ b/src/dev/hsa/hw_scheduler.hh @@ -39,6 +39,7 @@ #include "base/types.hh" #include "dev/hsa/hsa_packet_processor.hh" +#include "enums/GfxVersion.hh" #include "sim/eventq.hh" // We allocate one PIO page for doorbells and each @@ -59,7 +60,8 @@ class HWScheduler void registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, - uint32_t size, int doorbellSize); + uint32_t size, int doorbellSize, + GfxVersion gfxVersion); void unregisterQueue(uint64_t queue_id, int doorbellSize); void wakeup(); void schedWakeup(); diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index 3adfadd0ea..dd8e140ad8 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -173,7 +173,7 @@ GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf) auto &hsa_pp = device->hsaPacketProc(); hsa_pp.setDeviceQueueDesc(args->read_pointer_address, args->ring_base_address, args->queue_id, - args->ring_size, doorbellSize()); + args->ring_size, doorbellSize(), gfxVersion); args.copyOut(mem_proxy); }