gpu-compute: Update tokens for flat global/scratch

Memory instructions acquire coalescer tokens in the schedule stage.
Currently this is only done for buffer and flat instructions, but not
flat global or flat scratch. This change now acquires tokens for flat
global and flat scratch instructions. This provides back-pressure to the
CUs and helps to avoid deadlocks in Ruby.

The change also handles returning tokens for buffer, flat global, and
flat scratch instructions. This was previously only being done for
normal flat instructions leading to deadlocks in some applications when
the tokens were exhausted.

To simplify the logic, added a needsToken() method to GPUDynInst which
returns true if the instruction is a buffer or any flat-segment access.

The waitcnts were also incorrect for flat global and flat scratch. We
should always decrement vmem and exp count for stores and only normal
flat instructions should decrement lgkm. Currently vmem/exp are not
decremented for flat global and flat scratch which can lead to deadlock.
This change set fixes this by always decrementing vmem/exp and lgkm only
for normal flat instructions.

Change-Id: I673f4ac6121e4b5a5e8491bc9130c6d825d95fc5
This commit is contained in:
Matthew Poremba
2023-10-06 10:30:45 -05:00
parent ae104cc431
commit 9f4d334644
5 changed files with 77 additions and 36 deletions

View File

@@ -43894,9 +43894,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -43978,9 +43980,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44063,9 +44067,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44118,9 +44124,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44173,9 +44181,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44237,9 +44247,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44304,9 +44316,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
wf->decExpInstsIssued();
return;
}
@@ -44361,9 +44375,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
wf->decExpInstsIssued();
return;
}
@@ -44418,9 +44434,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
wf->decExpInstsIssued();
return;
}
@@ -44476,9 +44494,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
wf->decExpInstsIssued();
return;
}
@@ -44534,9 +44554,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
wf->decExpInstsIssued();
return;
}
@@ -44600,9 +44622,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
wf->decExpInstsIssued();
return;
}
@@ -44677,9 +44701,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44757,9 +44783,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -44837,9 +44865,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -45370,9 +45400,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}
@@ -45451,9 +45483,11 @@ namespace VegaISA
{
Wavefront *wf = gpuDynInst->wavefront();
if (gpuDynInst->exec_mask.none() && isFlat()) {
if (gpuDynInst->exec_mask.none()) {
wf->decVMemInstsIssued();
wf->decLGKMInstsIssued();
if (isFlat()) {
wf->decLGKMInstsIssued();
}
return;
}

View File

@@ -582,6 +582,12 @@ GPUDynInst::readsFlatScratch() const
return false;
}
bool
GPUDynInst::needsToken() const
{
return isGlobalMem() || isFlat() || isFlatGlobal() || isFlatScratch();
}
bool
GPUDynInst::isAtomicAnd() const
{

View File

@@ -257,6 +257,7 @@ class GPUDynInst : public GPUExecContext
bool writesFlatScratch() const;
bool readsExecMask() const;
bool writesExecMask() const;
bool needsToken() const;
bool isAtomicAnd() const;
bool isAtomicOr() const;

View File

@@ -579,7 +579,7 @@ ScheduleStage::fillDispatchList()
// operation.
GPUDynInstPtr mp = schIter->first;
if (!mp->isMemSync() && !mp->isScalar() &&
(mp->isGlobalMem() || mp->isFlat())) {
mp->needsToken()) {
computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
}

View File

@@ -1082,7 +1082,7 @@ Wavefront::exec()
* we return here to avoid spurious errors related to flat insts
* and their address segment resolution.
*/
if (execMask().none() && ii->isFlat()) {
if (execMask().none() && ii->needsToken()) {
computeUnit->getTokenManager()->recvTokens(1);
return;
}