gpu-compute: Fix dynamic scratch allocation on GPUFS
When the GPU needs more scratch space, it requests it from the runtime. In the method to wait for the response, a dmaReadVirt is called with the same method as the callback with zero delay. This means that effectively there is an infinite loop in the event queue if the scratch setup is not successful on the first attempt. In the case of GPUFS, it is never successful instantly, so a delay must be added. Without the added delay, the host CPU is never scheduled to make progress setting up more scratch space. The value 1e9 is chosen to match the KVM quantum and hopefully give KVM a chance to schedule an event. For reference, the driver timeout is 200ms so this is still fairly aggressive checking of the signal response. This value is also balanced around the GPUCommandProc DPRINTF to prevent the print in this method from overwhelming debug output. Change-Id: I0e0e1d75cd66f7c47815b13a4bfc3c0188e16220 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/61651 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
@@ -220,10 +220,6 @@ class GPUCommandProcessor : public DmaVirtDevice
|
||||
task->amdQueue.compute_tmpring_size_wavesize * 1024,
|
||||
task->privMemPerItem());
|
||||
|
||||
// Currently this is not supported in GPU full system
|
||||
fatal_if(FullSystem,
|
||||
"Runtime dynamic scratch allocation not supported");
|
||||
|
||||
updateHsaSignal(task->amdQueue.queue_inactive_signal.handle, 1,
|
||||
[ = ] (const uint64_t &dma_buffer)
|
||||
{ WaitScratchDmaEvent(task, dma_buffer); });
|
||||
@@ -273,7 +269,15 @@ class GPUCommandProcessor : public DmaVirtDevice
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &dma_buffer)
|
||||
{ WaitScratchDmaEvent(task, dma_buffer); } );
|
||||
dmaReadVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer);
|
||||
|
||||
/**
|
||||
* Delay for a large amount of ticks to give the CPU time to
|
||||
* set up the scratch space. The delay should be non-zero since
|
||||
* this method calls back itself and can cause an infinite loop
|
||||
* in the event queue if the allocation is not completed by the
|
||||
* first time this is called.
|
||||
*/
|
||||
dmaReadVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 1e9);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user