gpu-compute: Support dynamic scratch allocations

dGPUs in all versions of ROCm and APUs starting with ROCm 2.2 can
under-allocate scratch resources.  This patch adds support for
the CP to trigger a recoverable error so that the host can attempt to
re-allocate scratch to satisfy the currently stalled kernel.

Note that this patch does not include a mechanism to handle dynamic
scratch allocation for queues with in-flight kernels, as these queues
would first need to be drained and descheduled, which would require some
additional effort in the hsaPP and HW queue scheduler.  If the CP
encounters this scenario, it will assert.  I suspect this is not a
particularly common occurrence in most of our applications, so it is left
as a TODO.

This patch also fixes a few memory leaks and updates the old DMA callback
object interface to use a much cleaner C++11 lambda interface.

Change-Id: Ica8a5fc88888283415507544d6cc49fa748fe84d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42201
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
Michael LeBeane
2019-04-26 15:02:38 -04:00
committed by Matt Sinclair
parent cc1bb34827
commit 25e8a14a6b
2 changed files with 146 additions and 60 deletions

View File

@@ -157,7 +157,8 @@ GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
}
void
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
HsaSignalCallbackFunction function)
{
// The signal value is aligned 8 bytes from
// the actual handle in the runtime
@@ -166,10 +167,9 @@ GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
Addr event_addr = getHsaSignalEventAddr(signal_handle);
DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
Addr *new_signal = new Addr;
*new_signal = signal_value;
auto cb = new CPDmaCallback<uint64_t>(function, signal_value);
dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0);
dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
auto tc = system()->threads[0];
ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
@@ -297,14 +297,15 @@ GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
auto cb = new CPDmaCallback<uint32_t>(
[ = ] (const uint32_t &readDispIdOffset)
{ ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);
Addr hostReadIdxPtr
= hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
&readDispIdOffEvent->readDispIdOffset);
sizeof(uint32_t), cb, &cb->dmaBuffer);
}
System*