dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues

GFX7 (not supported in gem5) and GFX8 have a bug in how virtual
addresses are calculated for their HSA queues. The ROCr component of
ROCm works around this problem by doubling the HSA queue size that is
requested, then mapping all virtual addresses in the second half of the
queue to the same virtual addresses as the first half of the queue.
This commit updates gem5's HSA queue support to mimic this behavior.

Note that this change does not affect Vega's HSA queue support because,
according to the ROCm documentation, Vega does not have the same problem
as GCN3 (GFX8).

Change-Id: I133cf1acc3a00a0baded0c4c3c2a25f39effdb51
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/51371
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
This commit is contained in:
Matt Sinclair
2021-10-08 00:51:39 -05:00
parent ac63b7e294
commit 96a86780ee
5 changed files with 51 additions and 17 deletions

View File

@@ -44,6 +44,7 @@
#include "dev/dma_device.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hw_scheduler.hh"
#include "enums/GfxVersion.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "mem/packet_access.hh"
#include "mem/page_table.hh"
@@ -100,13 +101,15 @@ void
HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize)
uint32_t size, int doorbellSize,
GfxVersion gfxVersion)
{
DPRINTF(HSAPacketProcessor,
"%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
(void *)basePointer, queue_id, size);
hwSchdlr->registerNewQueue(hostReadIndexPointer,
basePointer, queue_id, size, doorbellSize);
basePointer, queue_id, size, doorbellSize,
gfxVersion);
}
AddrRangeList

View File

@@ -39,9 +39,11 @@
#include <vector>
#include "base/types.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_virt_device.hh"
#include "dev/hsa/hsa.h"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "params/HSAPacketProcessor.hh"
#include "sim/eventq.hh"
@@ -84,14 +86,16 @@ class HSAQueueDescriptor
uint64_t hostReadIndexPtr;
bool stalledOnDmaBufAvailability;
bool dmaInProgress;
GfxVersion gfxVersion;
HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
uint64_t hri_ptr, uint32_t size)
uint64_t hri_ptr, uint32_t size,
GfxVersion gfxVersion)
: basePointer(base_ptr), doorbellPointer(db_ptr),
writeIndex(0), readIndex(0),
numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
stalledOnDmaBufAvailability(false),
dmaInProgress(false)
dmaInProgress(false), gfxVersion(gfxVersion)
{ }
uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
uint64_t spaceUsed() { return writeIndex - readIndex; }
@@ -102,15 +106,38 @@ class HSAQueueDescriptor
uint64_t ptr(uint64_t ix)
{
/**
* Sometimes queues report that their size is 512k, which would
* indicate numElts of 0x2000. However, they only have 256k
* mapped which means any index over 0x1000 will fail an
* address translation.
/*
* Based on ROCm Documentation:
* - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
rocr/src/core/runtime/amd_aql_queue.cpp#L99
* - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
rocr/src/core/runtime/amd_aql_queue.cpp#L624
*
* GFX7 and GFX8 will allocate twice as much space for their HSA
* queues as they actually access (using mod operations to map the
* virtual addresses from the upper half of the queue to the same
* virtual addresses as the lower half). Thus, we need to check if
* the ISA is GFX8 and, if so, wrap the index at half of the queue's
* element count before computing the address.
*/
assert(ix % numElts < 0x1000);
return basePointer +
((ix % numElts) * objSize());
uint64_t retAddr = 0ll;
if ((gfxVersion == GfxVersion::gfx801) ||
(gfxVersion == GfxVersion::gfx803)) {
retAddr = basePointer + ((ix % (numElts/2)) * objSize());
DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
"index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
"objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
numElts, numElts/2, objSize(), retAddr);
} else {
retAddr = basePointer + ((ix % numElts) * objSize());
DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
"index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
"retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
retAddr);
}
return retAddr;
}
};
@@ -325,7 +352,8 @@ class HSAPacketProcessor: public DmaVirtDevice
void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize);
uint32_t size, int doorbellSize,
GfxVersion gfxVersion);
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
void setDevice(GPUCommandProcessor * dev);
void updateReadIndex(int, uint32_t);

View File

@@ -87,7 +87,8 @@ void
HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize)
uint32_t size, int doorbellSize,
GfxVersion gfxVersion)
{
assert(queue_id < MAX_ACTIVE_QUEUES);
// Map queue ID to doorbell.
@@ -108,7 +109,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
HSAQueueDescriptor* q_desc =
new HSAQueueDescriptor(basePointer, db_offset,
hostReadIndexPointer, size);
hostReadIndexPointer, size, gfxVersion);
AQLRingBuffer* aql_buf =
new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name());
QCntxt q_cntxt(q_desc, aql_buf);

View File

@@ -39,6 +39,7 @@
#include "base/types.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "enums/GfxVersion.hh"
#include "sim/eventq.hh"
// We allocate one PIO page for doorbells and each
@@ -59,7 +60,8 @@ class HWScheduler
void registerNewQueue(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize);
uint32_t size, int doorbellSize,
GfxVersion gfxVersion);
void unregisterQueue(uint64_t queue_id, int doorbellSize);
void wakeup();
void schedWakeup();

View File

@@ -173,7 +173,7 @@ GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
auto &hsa_pp = device->hsaPacketProc();
hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
args->ring_base_address, args->queue_id,
args->ring_size, doorbellSize());
args->ring_size, doorbellSize(), gfxVersion);
args.copyOut(mem_proxy);
}