From 2b97f17fe1714ef5429e59de12232b0c39599704 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 4 Oct 2023 09:09:56 -0500
Subject: [PATCH] gpu-compute: Fix dynamic scratch size test

ROCm supports dynamically allocating scratch space, which resides in
framebuffer memory, to reduce the amount of memory allocated for kernels
that have not yet launched. The size of the scratch space allocated is
located in task->amdQueue.compute_tmpring_size_wavesize. This size is in
kilobytes. The AQL task contains the number of bytes requested *per work
item*; however, we currently check whether there is enough tmpring space
by comparing against the requirement of a single work item. The check
should instead use the size required *per wavefront*.

This causes problems in applications where multiple kernels use dynamic
scratch allocation and a later kernel requires more space than the
earlier kernel. The only application being tested that does this is
LULESH. The scratch space ended up being too small, so workgroups
clobbered each other's private memory, leading to some nasty bugs. This
patch fixes the problem: task->amdQueue will be re-read from the host
and will contain the updated tmpring size. After this there is enough
scratch space and LULESH makes forward progress.

Change-Id: Ie9e0f92bb98fd3c3d6c2da3db9ee65352f9ae070
---
 src/gpu-compute/gpu_command_processor.hh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index d2ddf5c78f..10407b9f93 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -46,6 +46,7 @@
 #include <cstdint>
 #include <functional>
 
+#include "arch/amdgpu/vega/gpu_registers.hh"
 #include "base/logging.hh"
 #include "base/trace.hh"
 #include "base/types.hh"
@@ -206,7 +207,7 @@ class GPUCommandProcessor : public DmaVirtDevice
          *  the signal is reset we should check that the runtime was
          *  successful and then proceed to launch the kernel.
          */
-        if (task->privMemPerItem() >
+        if ((task->privMemPerItem() * VegaISA::NumVecElemPerVecReg) >
             task->amdQueue.compute_tmpring_size_wavesize * 1024) {
             // TODO: Raising this signal will potentially nuke scratch
             // space for in-flight kernels that were launched from this