gpu-compute: Support dynamic scratch allocations

dGPUs in all versions of ROCm and APUs starting with ROCm 2.2 can under-allocate scratch resources. This patch adds support for the CP to trigger a recoverable error so that the host can attempt to re-allocate scratch to satisfy the currently stalled kernel. Note that this patch does not include a mechanism to handle dynamic scratch allocation for queues with in-flight kernels, as these queues would first need to be drained and descheduled, which would require some additional effort in the hsaPP and HW queue scheduler. If the CP encounters this scenario, it will assert. I suspect this is not a particularly common occurrence in most of our applications, so it is left as a TODO. This patch also fixes a few memory leaks and updates the old DMA callback object interface to use a much cleaner C++11 lambda interface. Change-Id: Ica8a5fc88888283415507544d6cc49fa748fe84d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42201 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
2019-04-26 15:02:38 -04:00
parent cc1bb34827
commit 25e8a14a6b
2 changed files with 146 additions and 60 deletions
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -157,7 +157,8 @@ GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
 }

 void
-GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
+GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
+                                     HsaSignalCallbackFunction function)
 {
    // The signal value is aligned 8 bytes from
    // the actual handle in the runtime
@@ -166,10 +167,9 @@ GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value)
    Addr event_addr = getHsaSignalEventAddr(signal_handle);
    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

-    Addr *new_signal = new Addr;
-    *new_signal = signal_value;
+    auto cb = new CPDmaCallback<uint64_t>(function, signal_value);

-    dmaWriteVirt(value_addr, sizeof(Addr), nullptr, new_signal, 0);
+    dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);

    auto tc = system()->threads[0];
    ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
@@ -297,14 +297,15 @@ GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
 void
 GPUCommandProcessor::initABI(HSAQueueEntry *task)
 {
-    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
+    auto cb = new CPDmaCallback<uint32_t>(
+        [ = ] (const uint32_t &readDispIdOffset)
+            { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
-        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
-            &readDispIdOffEvent->readDispIdOffset);
+        sizeof(uint32_t), cb, &cb->dmaBuffer);
 }

 System*