From 9f4d334644ea8eb552e25dc12252bd9eb8d605ed Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 6 Oct 2023 10:30:45 -0500 Subject: [PATCH] gpu-compute: Update tokens for flat global/scratch Memory instructions acquire coalescer tokens in the schedule stage. Currently this is only done for buffer and flat instructions, but not flat global or flat scratch. This change now acquires tokens for flat global and flat scratch instructions. This provides back-pressure to the CUs and helps to avoid deadlocks in Ruby. The change also handles returning tokens for buffer, flat global, and flat scratch instructions. This was previously only being done for normal flat instructions, leading to deadlocks in some applications when the tokens were exhausted. To simplify the logic, added a needsToken() method to GPUDynInst which returns true if the instruction is a buffer instruction or any flat segment instruction. The waitcnts were also incorrect for flat global and flat scratch. We should always decrement vmem and exp count for stores and only normal flat instructions should decrement lgkm. Currently vmem/exp are not decremented for flat global and flat scratch, which can lead to deadlock. This change set fixes this by always decrementing vmem/exp and lgkm only for normal flat instructions. 
Change-Id: I673f4ac6121e4b5a5e8491bc9130c6d825d95fc5 --- src/arch/amdgpu/vega/insts/instructions.cc | 102 ++++++++++++++------- src/gpu-compute/gpu_dyn_inst.cc | 6 ++ src/gpu-compute/gpu_dyn_inst.hh | 1 + src/gpu-compute/schedule_stage.cc | 2 +- src/gpu-compute/wavefront.cc | 2 +- 5 files changed, 77 insertions(+), 36 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index b0f8c908ed..bb6a2233cd 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -43894,9 +43894,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -43978,9 +43980,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44063,9 +44067,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44118,9 +44124,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44173,9 +44181,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } 
return; } @@ -44237,9 +44247,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44304,9 +44316,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44361,9 +44375,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44418,9 +44434,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44476,9 +44494,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44534,9 +44554,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44600,9 +44622,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { 
+ if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44677,9 +44701,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44757,9 +44783,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44837,9 +44865,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -45370,9 +45400,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -45451,9 +45483,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 0b394e7e36..c59317d2c4 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -582,6 +582,12 @@ GPUDynInst::readsFlatScratch() const return false; } +bool +GPUDynInst::needsToken() const +{ + return isGlobalMem() || isFlat() || isFlatGlobal() || isFlatScratch(); +} 
+ bool GPUDynInst::isAtomicAnd() const { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 558cce8431..6551fa417a 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -257,6 +257,7 @@ class GPUDynInst : public GPUExecContext bool writesFlatScratch() const; bool readsExecMask() const; bool writesExecMask() const; + bool needsToken() const; bool isAtomicAnd() const; bool isAtomicOr() const; diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 4c4028b152..0d475c577e 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -579,7 +579,7 @@ ScheduleStage::fillDispatchList() // operation. GPUDynInstPtr mp = schIter->first; if (!mp->isMemSync() && !mp->isScalar() && - (mp->isGlobalMem() || mp->isFlat())) { + mp->needsToken()) { computeUnit.globalMemoryPipe.acqCoalescerToken(mp); } diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 8a1adfe802..0bca152e08 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -1082,7 +1082,7 @@ Wavefront::exec() * we return here to avoid spurious errors related to flat insts * and their address segment resolution. */ - if (execMask().none() && ii->isFlat()) { + if (execMask().none() && ii->needsToken()) { computeUnit->getTokenManager()->recvTokens(1); return; }