From 2b97f17fe1714ef5429e59de12232b0c39599704 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 4 Oct 2023 09:09:56 -0500 Subject: [PATCH] gpu-compute: Fix dynamic scratch size test ROCm supports dynamically allocating scratch space, which resides in framebuffer memory, to reduce the amount of memory allocated for kernels that have not yet launched. The size of the scratch space allocated is located in task->amdQueue.compute_tmpring_size_wavesize. This size is in kilobytes. The AQL task contains the number of bytes requested *per work item*; however, we currently check whether there is enough tmpring space by comparing against the size for a single work item. This should instead check the size *per wavefront*. This causes problems in applications where multiple kernels use dynamic scratch allocation and a later kernel requires more space than the earlier kernel. The only application being tested that does this is LULESH. This resulted in the scratch space being too small, so workgroups clobbered each other's private memory, leading to some nasty bugs. This patch fixes it, as task->amdQueue will be re-read from the host and will contain the updated tmpring size. After this there is enough scratch space and LULESH makes forward progress. Change-Id: Ie9e0f92bb98fd3c3d6c2da3db9ee65352f9ae070 --- src/gpu-compute/gpu_command_processor.hh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index d2ddf5c78f..10407b9f93 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -46,6 +46,7 @@ #include #include +#include "arch/amdgpu/vega/gpu_registers.hh" #include "base/logging.hh" #include "base/trace.hh" #include "base/types.hh" @@ -206,7 +207,7 @@ class GPUCommandProcessor : public DmaVirtDevice * the signal is reset we should check that the runtime was * successful and then proceed to launch the kernel. 
*/ - if (task->privMemPerItem() > + if ((task->privMemPerItem() * VegaISA::NumVecElemPerVecReg) > task->amdQueue.compute_tmpring_size_wavesize * 1024) { // TODO: Raising this signal will potentially nuke scratch // space for in-flight kernels that were launched from this