From 9f4d334644ea8eb552e25dc12252bd9eb8d605ed Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 6 Oct 2023 10:30:45 -0500 Subject: [PATCH] gpu-compute: Update tokens for flat global/scratch Memory instructions acquire coalescer tokens in the schedule stage. Currently this is only done for buffer and flat instructions, but not flat global or flat scratch. This change now acquires tokens for flat global and flat scratch instructions. This provides back-pressure to the CUs and helps to avoid deadlocks in Ruby. The change also handles returning tokens for buffer, flat global, and flat scratch instructions. This was previously only being done for normal flat instructions, leading to deadlocks in some applications when the tokens were exhausted. To simplify the logic, added a needsToken() method to GPUDynInst which returns true if the instruction is a buffer instruction or any flat segment instruction. The waitcnts were also incorrect for flat global and flat scratch. We should always decrement vmem and exp count for stores and only normal flat instructions should decrement lgkm. Currently vmem/exp are not decremented for flat global and flat scratch, which can lead to deadlock. This change set fixes this by always decrementing vmem/exp and lgkm only for normal flat instructions. 
Change-Id: I673f4ac6121e4b5a5e8491bc9130c6d825d95fc5 --- src/arch/amdgpu/vega/insts/instructions.cc | 102 ++++++++++++++------- src/gpu-compute/gpu_dyn_inst.cc | 6 ++ src/gpu-compute/gpu_dyn_inst.hh | 1 + src/gpu-compute/schedule_stage.cc | 2 +- src/gpu-compute/wavefront.cc | 2 +- 5 files changed, 77 insertions(+), 36 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index b0f8c908ed..bb6a2233cd 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -43894,9 +43894,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -43978,9 +43980,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44063,9 +44067,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44118,9 +44124,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44173,9 +44181,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } 
return; } @@ -44237,9 +44247,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44304,9 +44316,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44361,9 +44375,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44418,9 +44434,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44476,9 +44494,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44534,9 +44554,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44600,9 +44622,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { 
+ if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } wf->decExpInstsIssued(); return; } @@ -44677,9 +44701,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44757,9 +44783,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -44837,9 +44865,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -45370,9 +45400,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } @@ -45451,9 +45483,11 @@ namespace VegaISA { Wavefront *wf = gpuDynInst->wavefront(); - if (gpuDynInst->exec_mask.none() && isFlat()) { + if (gpuDynInst->exec_mask.none()) { wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } return; } diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 0b394e7e36..c59317d2c4 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -582,6 +582,12 @@ GPUDynInst::readsFlatScratch() const return false; } +bool +GPUDynInst::needsToken() const +{ + return isGlobalMem() || isFlat() || isFlatGlobal() || isFlatScratch(); +} 
+ bool GPUDynInst::isAtomicAnd() const { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 558cce8431..6551fa417a 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -257,6 +257,7 @@ class GPUDynInst : public GPUExecContext bool writesFlatScratch() const; bool readsExecMask() const; bool writesExecMask() const; + bool needsToken() const; bool isAtomicAnd() const; bool isAtomicOr() const; diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 4c4028b152..0d475c577e 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -579,7 +579,7 @@ ScheduleStage::fillDispatchList() // operation. GPUDynInstPtr mp = schIter->first; if (!mp->isMemSync() && !mp->isScalar() && - (mp->isGlobalMem() || mp->isFlat())) { + mp->needsToken()) { computeUnit.globalMemoryPipe.acqCoalescerToken(mp); } diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 8a1adfe802..0bca152e08 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -1082,7 +1082,7 @@ Wavefront::exec() * we return here to avoid spurious errors related to flat insts * and their address segment resolution. */ - if (execMask().none() && ii->isFlat()) { + if (execMask().none() && ii->needsToken()) { computeUnit->getTokenManager()->recvTokens(1); return; }