From 90a518e885dba14f6e6f0101ae5d354caa8e8741 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 23 Aug 2023 19:21:55 -0500 Subject: [PATCH] gpu-compute,arch-vega: Fix ALU-only LDS counters There are a few LDS instructions that perform local ALU operations and writeback which are marked as loads. These are marked as loads because they fit in the pipeline logic better, according to a several-year-old comment. In the VEGA ISA these instructions (swizzle, permute, bpermute) are not decrementing the LDS load counter. As a result, the counter will gradually increase over time. Since wavefront slots are persistent, this can cause applications with a few thousand kernels to eventually hang thinking there are not enough resources. This changeset fixes this by decrementing the LDS load counter for these instructions. This fix was already integrated in the GCN3 ISA in the exact same way. This changeset moves it near a similar comment about scheduling register file writes. Change-Id: Ife5237a2cae7213948c32ef266f4f8f22917351c --- src/arch/amdgpu/vega/insts/instructions.cc | 15 +++++++++++++++ src/gpu-compute/compute_unit.cc | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index ab9c1cecf2..b0f8c908ed 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -35971,6 +35971,11 @@ namespace VegaISA */ wf->computeUnit->vrf[wf->simdId]-> scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. 
+ */ + wf->rdLmReqsInPipe--; } // execute // --- Inst_DS__DS_PERMUTE_B32 class methods --- @@ -36054,6 +36059,11 @@ namespace VegaISA */ wf->computeUnit->vrf[wf->simdId]-> scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; } // execute // --- Inst_DS__DS_BPERMUTE_B32 class methods --- @@ -36137,6 +36147,11 @@ namespace VegaISA */ wf->computeUnit->vrf[wf->simdId]-> scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; } // execute // --- Inst_DS__DS_ADD_U64 class methods --- diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 06fe28f5b8..ea903455d5 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -383,6 +383,13 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, stats.waveLevelParallelism.sample(activeWaves); activeWaves++; + + panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n"); + panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n"); + panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n"); + panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n"); + panic_if(w->outstandingReqs, + "Outstanding reqs counter for wavefront non-zero\n"); } /**