diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index 79af7ac156..65d008bbc7 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -36314,7 +36314,7 @@ namespace Gcn3ISA gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -36363,7 +36363,7 @@ namespace Gcn3ISA gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } void @@ -39384,8 +39384,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } // execute @@ -39448,8 +39451,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39511,8 +39517,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39603,8 +39612,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39667,8 +39679,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39731,8 +39746,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39804,8 +39822,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39889,8 +39910,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } // execute @@ -39952,8 +39976,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40015,8 +40042,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40079,8 +40109,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40151,8 +40184,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40227,8 +40263,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40294,8 +40333,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } ConstVecOperandU32 data(gpuDynInst, extData.DATA); @@ -40408,8 +40450,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40492,8 +40537,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40576,8 +40624,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } void @@ -40834,8 +40885,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40918,8 +40972,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41044,8 +41101,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41129,8 +41189,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41215,8 +41278,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41483,8 +41549,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41570,8 +41639,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh index a0612858db..27b9b99aa6 100644 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh +++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh @@ -799,35 +799,107 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane] + = wf->ldsChunk->read(vaddr); + } + } + } } template void initMemRead(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::ReadReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + for (int i = 0; i < N; ++i) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i] + = wf->ldsChunk->read( + vaddr + i*sizeof(VecElemU32)); + } + } + } + } } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + wf->ldsChunk->write(vaddr, + (reinterpret_cast(gpuDynInst->d_data))[lane]); + } + } + } } template void initMemWrite(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::WriteReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + for (int i = 0; i < N; ++i) { + wf->ldsChunk->write( + vaddr + i*sizeof(VecElemU32), + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i]); + } + } + } + } } template void initAtomicAccess(GPUDynInstPtr gpuDynInst) { - initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + AtomicOpFunctor* amo_op = + gpuDynInst->makeAtomicOpFunctor( + &(reinterpret_cast( + gpuDynInst->a_data))[lane], + &(reinterpret_cast( + gpuDynInst->x_data))[lane]).get(); + + T tmp = wf->ldsChunk->read(vaddr); + (*amo_op)(reinterpret_cast(&tmp)); + wf->ldsChunk->write(vaddr, tmp); + (reinterpret_cast(gpuDynInst->d_data))[lane] = tmp; + } + } + } } void diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index fb9bf07844..937e572f03 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) if (mask[lane]) { // flat address calculation goes here. // addr[lane] = segmented address - panic("Flat group memory operation is unimplemented!\n"); + addr[lane] = addr[lane] - + wavefront()->computeUnit->shader->ldsApe().base; + assert(addr[lane] < + wavefront()->computeUnit->getLds().getAddrRange().size()); } } wavefront()->execUnitId = wavefront()->flatLmUnitId; diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 995ea75090..c99be00468 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -76,6 +76,11 @@ LocalMemPipeline::exec() lmReturnedRequests.pop(); w = m->wavefront(); + if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel() + && m->allLanesZero()) { + computeUnit.getTokenManager()->recvTokens(1); + } + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n", m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m);