From 90a518e885dba14f6e6f0101ae5d354caa8e8741 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 23 Aug 2023 19:21:55 -0500
Subject: [PATCH] gpu-compute,arch-vega: Fix ALU-only LDS counters

There are a few LDS instructions that perform local ALU operations and
writeback which are marked as loads. These are marked as loads because
they fit in the pipeline logic better, according to a several-year-old
comment. In the VEGA ISA these instructions (swizzle, permute, bpermute)
do not decrement the LDS load counter. As a result, the counter
gradually increases over time. Since wavefront slots are persistent, this
can cause applications with a few thousand kernels to eventually hang,
thinking there are not enough resources.

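This changeset fixes this by decrementing the LDS load counter for these
instructions. This fix was already integrated in the GCN3 ISA in the
exact same way. This changeset moves it near a similar comment about
scheduling register file writes.

This changeset also adds panic checks to ComputeUnit::startWavefront
that fire if any of the wavefront's global/local memory read/write
in-pipe counters or its outstanding request counter is non-zero when
the wavefront is started.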

Change-Id: Ife5237a2cae7213948c32ef266f4f8f22917351c
---
 src/arch/amdgpu/vega/insts/instructions.cc | 15 +++++++++++++++
 src/gpu-compute/compute_unit.cc            |  7 +++++++
 2 files changed, 22 insertions(+)

diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc
index ab9c1cecf2..b0f8c908ed 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -35971,6 +35971,11 @@ namespace VegaISA
          */
         wf->computeUnit->vrf[wf->simdId]->
             scheduleWriteOperandsFromLoad(wf, gpuDynInst);
+        /**
+         * Similarly, this counter could build up over time, even across
+         * multiple wavefronts, and cause a deadlock.
+         */
+        wf->rdLmReqsInPipe--;
     } // execute
     // --- Inst_DS__DS_PERMUTE_B32 class methods ---
 
@@ -36054,6 +36059,11 @@ namespace VegaISA
          */
         wf->computeUnit->vrf[wf->simdId]->
             scheduleWriteOperandsFromLoad(wf, gpuDynInst);
+        /**
+         * Similarly, this counter could build up over time, even across
+         * multiple wavefronts, and cause a deadlock.
+         */
+        wf->rdLmReqsInPipe--;
     } // execute
     // --- Inst_DS__DS_BPERMUTE_B32 class methods ---
 
@@ -36137,6 +36147,11 @@ namespace VegaISA
          */
         wf->computeUnit->vrf[wf->simdId]->
             scheduleWriteOperandsFromLoad(wf, gpuDynInst);
+        /**
+         * Similarly, this counter could build up over time, even across
+         * multiple wavefronts, and cause a deadlock.
+         */
+        wf->rdLmReqsInPipe--;
     } // execute
 
     // --- Inst_DS__DS_ADD_U64 class methods ---
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 06fe28f5b8..ea903455d5 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -383,6 +383,13 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
 
     stats.waveLevelParallelism.sample(activeWaves);
     activeWaves++;
+
+    panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n");
+    panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n");
+    panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n");
+    panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n");
+    panic_if(w->outstandingReqs,
+             "Outstanding reqs counter for wavefront non-zero\n");
 }
 
 /**