gpu-compute: Support dynamic scratch allocations
dGPUs in all versions of ROCm and APUs starting with ROCm 2.2 can under-allocate scratch resources. This patch adds support for the CP to trigger a recoverable error so that the host can attempt to re-allocate scratch to satisfy the currently stalled kernel. Note that this patch does not include a mechanism to handle dynamic scratch allocation for queues with in-flight kernels, as these queues would first need to be drained and descheduled, which would require some additional effort in the hsaPP and HW queue scheduler. If the CP encounters this scenario, it will assert. I suspect this is not a particularly common occurrence in most of our applications, so it is left as a TODO. This patch also fixes a few memory leaks and updates the old DMA callback object interface to use a much cleaner C++11 lambda interface. Change-Id: Ica8a5fc88888283415507544d6cc49fa748fe84d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42201 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
committed by
Matt Sinclair
parent
cc1bb34827
commit
25e8a14a6b
@@ -157,7 +157,8 @@ GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
|
||||
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
|
||||
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
|
||||
HsaSignalCallbackFunction function)
|
||||
{
|
||||
// The signal value is aligned 8 bytes from
|
||||
// the actual handle in the runtime
|
||||
@@ -166,10 +167,9 @@ GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
|
||||
Addr event_addr = getHsaSignalEventAddr(signal_handle);
|
||||
DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
|
||||
|
||||
Addr *new_signal = new Addr;
|
||||
*new_signal = signal_value;
|
||||
auto cb = new CPDmaCallback<uint64_t>(function, signal_value);
|
||||
|
||||
dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0);
|
||||
dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
|
||||
|
||||
auto tc = system()->threads[0];
|
||||
ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
|
||||
@@ -297,14 +297,15 @@ GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
|
||||
void
|
||||
GPUCommandProcessor::initABI(HSAQueueEntry *task)
|
||||
{
|
||||
auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
|
||||
auto cb = new CPDmaCallback<uint32_t>(
|
||||
[ = ] (const uint32_t &readDispIdOffset)
|
||||
{ ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);
|
||||
|
||||
Addr hostReadIdxPtr
|
||||
= hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
|
||||
|
||||
dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
|
||||
sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
|
||||
&readDispIdOffEvent->readDispIdOffset);
|
||||
sizeof(uint32_t), cb, &cb->dmaBuffer);
|
||||
}
|
||||
|
||||
System*
|
||||
|
||||
Reference in New Issue
Block a user