From 9a7fc4ff6911ebecb303928491b63b0e7a04f7f9 Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Tue, 20 Jul 2021 14:41:17 -0500 Subject: [PATCH] arch-gcn3: Implement LDS accesses in Flat instructions Add support for LDS accesses by allowing Flat instructions to dispatch into the local memory pipeline if the requested address is in the group aperture. This requires implementing LDS accesses in the Flat initMemRead/Write functions, in a similar fashion to the DS functions of the same name. Because we now can potentially dispatch to the local memory pipeline, this change also adds a check to regain any tokens we requested as a flat instruction. Change-Id: Id26191f7ee43291a5e5ca5f39af06af981ec23ab Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48343 Reviewed-by: Matt Sinclair Reviewed-by: Matthew Poremba Maintainer: Matt Sinclair Tested-by: kokoro --- src/arch/amdgpu/gcn3/insts/instructions.cc | 124 ++++++++++++++++----- src/arch/amdgpu/gcn3/insts/op_encodings.hh | 82 +++++++++++++- src/gpu-compute/gpu_dyn_inst.cc | 5 +- src/gpu-compute/local_memory_pipeline.cc | 5 + 4 files changed, 184 insertions(+), 32 deletions(-) diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index 79af7ac156..65d008bbc7 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -36314,7 +36314,7 @@ namespace Gcn3ISA gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -36363,7 +36363,7 @@ namespace Gcn3ISA gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } void @@ -39384,8 +39384,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } // execute @@ -39448,8 +39451,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39511,8 +39517,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39603,8 +39612,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39667,8 +39679,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { 
gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39731,8 +39746,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39804,8 +39822,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39889,8 +39910,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } // execute @@ -39952,8 +39976,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not 
implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40015,8 +40042,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40079,8 +40109,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40151,8 +40184,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40227,8 +40263,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40294,8 +40333,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } ConstVecOperandU32 data(gpuDynInst, extData.DATA); @@ -40408,8 +40450,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40492,8 +40537,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40576,8 +40624,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } void @@ -40834,8 +40885,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40918,8 +40972,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41044,8 +41101,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41129,8 +41189,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41215,8 +41278,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41483,8 +41549,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41570,8 +41639,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh index a0612858db..27b9b99aa6 100644 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh +++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh @@ -799,35 +799,107 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane] + = wf->ldsChunk->read(vaddr); + } + } + } } template void initMemRead(GPUDynInstPtr 
gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + for (int i = 0; i < N; ++i) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i] + = wf->ldsChunk->read( + vaddr + i*sizeof(VecElemU32)); + } + } + } + } } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + wf->ldsChunk->write(vaddr, + (reinterpret_cast(gpuDynInst->d_data))[lane]); + } + } + } } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + for (int i = 0; i < N; ++i) { + wf->ldsChunk->write( + vaddr + i*sizeof(VecElemU32), + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i]); + } + } + } + } } template void initAtomicAccess(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); + } else if (gpuDynInst->executedAs() == 
enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + auto amo_op = + gpuDynInst->makeAtomicOpFunctor( + &(reinterpret_cast( + gpuDynInst->a_data))[lane], + &(reinterpret_cast( + gpuDynInst->x_data))[lane]); + // Hold the returned functor in a local; a raw .get() on the temporary would dangle. + T tmp = wf->ldsChunk->read(vaddr); + (*amo_op)(reinterpret_cast(&tmp)); + wf->ldsChunk->write(vaddr, tmp); + (reinterpret_cast(gpuDynInst->d_data))[lane] = tmp; + } + } + } } void diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index fb9bf07844..937e572f03 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) if (mask[lane]) { // flat address calculation goes here. // addr[lane] = segmented address - panic("Flat group memory operation is unimplemented!\n"); + addr[lane] = addr[lane] - + wavefront()->computeUnit->shader->ldsApe().base; + assert(addr[lane] < + wavefront()->computeUnit->getLds().getAddrRange().size()); } } wavefront()->execUnitId = wavefront()->flatLmUnitId; diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 995ea75090..c99be00468 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -76,6 +76,11 @@ LocalMemPipeline::exec() lmReturnedRequests.pop(); w = m->wavefront(); + if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel() + && m->allLanesZero()) { + computeUnit.getTokenManager()->recvTokens(1); + } + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n", m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m);