arch-gcn3: Implement LDS accesses in Flat instructions

Add support for LDS accesses by allowing Flat instructions to dispatch
into the local memory pipeline if the requested address is in the group
aperture.

This requires implementing LDS accesses in the Flat initMemRead/Write
functions, in a similar fashion to the DS functions of the same name.

Because flat instructions can now potentially dispatch to the local
memory pipeline, this change also adds a check to reclaim any tokens
that were requested when the instruction was issued as a flat access.

Change-Id: Id26191f7ee43291a5e5ca5f39af06af981ec23ab
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48343
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Kyle Roarty
2021-07-20 14:41:17 -05:00
committed by Matt Sinclair
parent 523a92f7f0
commit 9a7fc4ff69
4 changed files with 184 additions and 32 deletions

View File

@@ -36314,7 +36314,7 @@ namespace Gcn3ISA
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -36363,7 +36363,7 @@ namespace Gcn3ISA
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
void
@@ -39384,8 +39384,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
} // execute
@@ -39448,8 +39451,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39511,8 +39517,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39603,8 +39612,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39667,8 +39679,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39731,8 +39746,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39804,8 +39822,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -39889,8 +39910,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
} // execute
@@ -39952,8 +39976,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40015,8 +40042,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40079,8 +40109,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40151,8 +40184,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40227,8 +40263,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe
.issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40294,8 +40333,11 @@ namespace Gcn3ISA
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
ConstVecOperandU32 data(gpuDynInst, extData.DATA);
@@ -40408,8 +40450,11 @@ namespace Gcn3ISA
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40492,8 +40537,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40576,8 +40624,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
void
@@ -40834,8 +40885,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -40918,8 +40972,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41044,8 +41101,11 @@ namespace Gcn3ISA
"Flats to private aperture not tested yet\n");
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41129,8 +41189,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41215,8 +41278,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41483,8 +41549,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}
@@ -41570,8 +41639,11 @@ namespace Gcn3ISA
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
gpuDynInst->computeUnit()->globalMemoryPipe.
issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
gpuDynInst->computeUnit()->localMemoryPipe
.issueRequest(gpuDynInst);
} else {
fatal("Non global flat instructions not implemented yet.\n");
fatal("Unsupported scope for flat instruction.\n");
}
}

View File

@@ -799,35 +799,107 @@ namespace Gcn3ISA
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
= wf->ldsChunk->read<T>(vaddr);
}
}
}
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
for (int i = 0; i < N; ++i) {
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]
= wf->ldsChunk->read<VecElemU32>(
vaddr + i*sizeof(VecElemU32));
}
}
}
}
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
wf->ldsChunk->write<T>(vaddr,
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
}
}
}
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
for (int i = 0; i < N; ++i) {
wf->ldsChunk->write<VecElemU32>(
vaddr + i*sizeof(VecElemU32),
(reinterpret_cast<VecElemU32*>(
gpuDynInst->d_data))[lane * N + i]);
}
}
}
}
}
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
Wavefront *wf = gpuDynInst->wavefront();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
AtomicOpFunctor* amo_op =
gpuDynInst->makeAtomicOpFunctor<T>(
&(reinterpret_cast<T*>(
gpuDynInst->a_data))[lane],
&(reinterpret_cast<T*>(
gpuDynInst->x_data))[lane]).get();
T tmp = wf->ldsChunk->read<T>(vaddr);
(*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
wf->ldsChunk->write<T>(vaddr, tmp);
(reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
}
}
}
}
void

View File

@@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
if (mask[lane]) {
// flat address calculation goes here.
// addr[lane] = segmented address
panic("Flat group memory operation is unimplemented!\n");
addr[lane] = addr[lane] -
wavefront()->computeUnit->shader->ldsApe().base;
assert(addr[lane] <
wavefront()->computeUnit->getLds().getAddrRange().size());
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;

View File

@@ -76,6 +76,11 @@ LocalMemPipeline::exec()
lmReturnedRequests.pop();
w = m->wavefront();
if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel()
&& m->allLanesZero()) {
computeUnit.getTokenManager()->recvTokens(1);
}
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);