From 3112a7f0d0034cab48e9577f1e9dbbb547390b2d Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 1 Sep 2021 14:30:35 -0500 Subject: [PATCH] arch-gcn3,gpu-compute: Move GCN3 specific TLB to arch Move GpuTLB and TLBCoalescer to GCN3 as the TLB format is specific to GCN3 and SE mode / APU simulation. Vega will have its own TLB, coalescer, and walker suitable for a dGPU. This also adds a using alias for the TLB translation state to reduce the number of references to TheISA and X86ISA. X86 specific includes are also removed. Change-Id: I34448bb4e5ddb9980b34a55bc717bbcea0e03db5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/49847 Reviewed-by: Jason Lowe-Power Reviewed-by: Matt Sinclair Maintainer: Matt Sinclair Tested-by: kokoro --- src/arch/amdgpu/gcn3/SConscript | 5 +++ .../amdgpu/gcn3}/X86GPUTLB.py | 5 +-- src/arch/amdgpu/gcn3/gpu_isa.hh | 1 + .../gpu_tlb.cc => arch/amdgpu/gcn3/tlb.cc} | 11 +++--- .../gpu_tlb.hh => arch/amdgpu/gcn3/tlb.hh} | 2 ++ .../amdgpu/gcn3}/tlb_coalescer.cc | 30 ++++++++-------- .../amdgpu/gcn3}/tlb_coalescer.hh | 2 +- src/gpu-compute/SConscript | 3 -- src/gpu-compute/compute_unit.cc | 35 +++++++++---------- src/gpu-compute/fetch_unit.cc | 8 ++--- src/gpu-compute/shader.cc | 8 ++--- src/gpu-compute/shader.hh | 2 +- 12 files changed, 57 insertions(+), 55 deletions(-) rename src/{gpu-compute => arch/amdgpu/gcn3}/X86GPUTLB.py (97%) rename src/{gpu-compute/gpu_tlb.cc => arch/amdgpu/gcn3/tlb.cc} (99%) rename src/{gpu-compute/gpu_tlb.hh => arch/amdgpu/gcn3/tlb.hh} (99%) rename src/{gpu-compute => arch/amdgpu/gcn3}/tlb_coalescer.cc (95%) rename src/{gpu-compute => arch/amdgpu/gcn3}/tlb_coalescer.hh (99%) diff --git a/src/arch/amdgpu/gcn3/SConscript b/src/arch/amdgpu/gcn3/SConscript index 61c93c3391..dc4660f7c9 100644 --- a/src/arch/amdgpu/gcn3/SConscript +++ b/src/arch/amdgpu/gcn3/SConscript @@ -39,10 +39,15 @@ if not env['BUILD_GPU']: Return() if env['TARGET_GPU_ISA'] == 'gcn3': + SimObject('X86GPUTLB.py') + Source('decoder.cc') Source('insts/gpu_static_inst.cc') Source('insts/instructions.cc') Source('insts/op_encodings.cc') Source('isa.cc') Source('registers.cc') + Source('tlb.cc') + Source('tlb_coalescer.cc') + DebugFlag('GCN3', 'Debug flag for GCN3 GPU ISA') diff --git a/src/gpu-compute/X86GPUTLB.py b/src/arch/amdgpu/gcn3/X86GPUTLB.py similarity index 97% rename from src/gpu-compute/X86GPUTLB.py rename to src/arch/amdgpu/gcn3/X86GPUTLB.py index ab14bf8881..1c7f1d0247 100644 --- a/src/gpu-compute/X86GPUTLB.py +++ b/src/arch/amdgpu/gcn3/X86GPUTLB.py @@ -39,7 +39,7 @@ from m5.SimObject import SimObject class X86GPUTLB(ClockedObject): type = 'X86GPUTLB' cxx_class = 'gem5::X86ISA::GpuTLB' - cxx_header = 'gpu-compute/gpu_tlb.hh' + cxx_header = 'arch/amdgpu/gcn3/tlb.hh' size = Param.Int(64, "TLB size (number of entries)") assoc = Param.Int(64, "TLB associativity") @@ -63,7 +63,8 @@ class X86GPUTLB(ClockedObject): class TLBCoalescer(ClockedObject): type = 'TLBCoalescer' cxx_class = 'gem5::TLBCoalescer' - cxx_header = 'gpu-compute/tlb_coalescer.hh' + cxx_header = 'arch/amdgpu/gcn3/tlb_coalescer.hh' + probesPerCycle = Param.Int(2, "Number of TLB probes per cycle") coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks") cpu_side_ports = VectorResponsePort("Port on side closer to CPU/CU") diff --git a/src/arch/amdgpu/gcn3/gpu_isa.hh b/src/arch/amdgpu/gcn3/gpu_isa.hh index 65136bb3ad..205f097c4b 100644 --- a/src/arch/amdgpu/gcn3/gpu_isa.hh +++ b/src/arch/amdgpu/gcn3/gpu_isa.hh @@ -38,6 +38,7 @@ #include #include "arch/amdgpu/gcn3/gpu_registers.hh" +#include "arch/amdgpu/gcn3/tlb.hh" #include "gpu-compute/dispatcher.hh" #include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/misc.hh" diff --git a/src/gpu-compute/gpu_tlb.cc b/src/arch/amdgpu/gcn3/tlb.cc similarity index 99% rename from src/gpu-compute/gpu_tlb.cc rename to src/arch/amdgpu/gcn3/tlb.cc index e2225a0ffd..4a59c32b63 100644 --- a/src/gpu-compute/gpu_tlb.cc +++ b/src/arch/amdgpu/gcn3/tlb.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * Copyright (c) 2011-2021 Advanced Micro Devices, Inc. * All rights reserved. * * For use for simulation and test purposes only @@ -14,9 +14,9 @@ * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -30,10 +30,9 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * Author: Lisa Hsu */ -#include "gpu-compute/gpu_tlb.hh" +#include "arch/amdgpu/gcn3/tlb.hh" #include #include diff --git a/src/gpu-compute/gpu_tlb.hh b/src/arch/amdgpu/gcn3/tlb.hh similarity index 99% rename from src/gpu-compute/gpu_tlb.hh rename to src/arch/amdgpu/gcn3/tlb.hh index 4652a73d04..944c0ac59c 100644 --- a/src/gpu-compute/gpu_tlb.hh +++ b/src/arch/amdgpu/gcn3/tlb.hh @@ -438,6 +438,8 @@ namespace X86ISA }; } +using GpuTranslationState = X86ISA::GpuTLB::TranslationState; + } // namespace gem5 #endif // __GPU_TLB_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/arch/amdgpu/gcn3/tlb_coalescer.cc similarity index 95% rename from src/gpu-compute/tlb_coalescer.cc rename to src/arch/amdgpu/gcn3/tlb_coalescer.cc index d82fa7ea85..9b53db8688 100644 --- a/src/gpu-compute/tlb_coalescer.cc +++ b/src/arch/amdgpu/gcn3/tlb_coalescer.cc @@ -31,7 +31,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "gpu-compute/tlb_coalescer.hh" +#include "arch/amdgpu/gcn3/tlb_coalescer.hh" #include @@ -101,11 +101,11 @@ TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) if (disableCoalescing) return false; - TheISA::GpuTLB::TranslationState *incoming_state = - safe_cast(incoming_pkt->senderState); + GpuTranslationState *incoming_state = + safe_cast(incoming_pkt->senderState); - TheISA::GpuTLB::TranslationState *coalesced_state = - safe_cast(coalesced_pkt->senderState); + GpuTranslationState *coalesced_state = + safe_cast(coalesced_pkt->senderState); // Rule 1: Coalesce requests only if they // fall within the same virtual page @@ -148,8 +148,8 @@ TLBCoalescer::updatePhysAddresses(PacketPtr pkt) DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n", issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); - TheISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry; assert(tlb_entry); @@ -167,8 +167,8 @@ TLBCoalescer::updatePhysAddresses(PacketPtr pkt) for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; - TheISA::GpuTLB::TranslationState *sender_state = - safe_cast( + GpuTranslationState *sender_state = + safe_cast( local_pkt->senderState); // we are sending the packet back, so pop the reqCnt associated @@ -238,8 +238,8 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) // number of coalesced reqs for a given window int coalescedReq_cnt = 0; - TheISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); // push back the port to remember the path back sender_state->ports.push_back(this); @@ -337,8 +337,8 @@ void TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) { - TheISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); bool update_stats = !sender_state->isPrefetch; @@ -460,8 +460,8 @@ TLBCoalescer::processProbeTLBEvent() rejected = true; ++vector_index; } else { - TheISA::GpuTLB::TranslationState *tmp_sender_state = - safe_cast + GpuTranslationState *tmp_sender_state = + safe_cast (first_packet->senderState); bool update_stats = !tmp_sender_state->isPrefetch; diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/arch/amdgpu/gcn3/tlb_coalescer.hh similarity index 99% rename from src/gpu-compute/tlb_coalescer.hh rename to src/arch/amdgpu/gcn3/tlb_coalescer.hh index fce87406b2..afe12c942a 100644 --- a/src/gpu-compute/tlb_coalescer.hh +++ b/src/arch/amdgpu/gcn3/tlb_coalescer.hh @@ -39,13 +39,13 @@ #include #include +#include "arch/amdgpu/gcn3/tlb.hh" #include "arch/generic/tlb.hh" #include "arch/x86/isa.hh" #include "arch/x86/pagetable.hh" #include "arch/x86/regs/segment.hh" #include "base/logging.hh" #include "base/statistics.hh" -#include "gpu-compute/gpu_tlb.hh" #include "mem/port.hh" #include "mem/request.hh" #include "params/TLBCoalescer.hh" diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index ae0bfab441..2ccf1b7c07 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -39,7 +39,6 @@ if not env['BUILD_GPU']: SimObject('GPU.py') SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') -SimObject('X86GPUTLB.py') Source('comm.cc') Source('compute_unit.cc') @@ -54,7 +53,6 @@ Source('gpu_dyn_inst.cc') Source('gpu_exec_context.cc') Source('gpu_render_driver.cc') Source('gpu_static_inst.cc') -Source('gpu_tlb.cc') Source('lds_state.cc') Source('local_memory_pipeline.cc') Source('pool_manager.cc') @@ -69,7 +67,6 @@ Source('shader.cc') Source('dyn_pool_manager.cc') Source('simple_pool_manager.cc') Source('static_register_manager_policy.cc') -Source('tlb_coalescer.cc') Source('vector_register_file.cc') Source('wavefront.cc') diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 5e3b8d2f1d..feef552bf2 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -35,7 +35,6 @@ #include -#include "arch/x86/page_size.hh" #include "base/output.hh" #include "debug/GPUDisp.hh" #include "debug/GPUExec.hh" @@ -1076,8 +1075,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); // This is the senderState needed by the TLB hierarchy to function - X86ISA::GpuTLB::TranslationState *translation_state = - new X86ISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, + GpuTranslationState *translation_state = + new GpuTranslationState(TLB_mode, shader->gpuTc, false, pkt->senderState); pkt->senderState = translation_state; @@ -1091,8 +1090,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) stats.hitsPerTLBLevel[hit_level]++; // New SenderState for the memory access - X86ISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete sender_state->saved; @@ -1169,7 +1168,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) delete pkt->senderState; // Because it's atomic operation, only need TLB translation state - pkt->senderState = new X86ISA::GpuTLB::TranslationState(TLB_mode, + pkt->senderState = new GpuTranslationState(TLB_mode, shader->gpuTc); tlbPort[tlbPort_index].sendFunctional(pkt); @@ -1190,8 +1189,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) new_pkt->req->getPaddr()); // safe_cast the senderState - X86ISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete new_pkt; @@ -1211,7 +1210,7 @@ ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt) new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst); pkt->senderState = - new X86ISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false, + new GpuTranslationState(tlb_mode, shader->gpuTc, false, pkt->senderState); if (scalarDTLBPort.isStalled()) { @@ -1397,8 +1396,8 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) computeUnit->stats.tlbCycles += curTick(); // pop off the TLB translation state - X86ISA::GpuTLB::TranslationState *translation_state = - safe_cast(pkt->senderState); + GpuTranslationState *translation_state = + safe_cast(pkt->senderState); // no PageFaults are permitted for data accesses if (!translation_state->tlbEntry) { @@ -1508,15 +1507,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) // Because it's atomic operation, only need TLB translation state prefetch_pkt->senderState = - new X86ISA::GpuTLB::TranslationState(TLB_mode, + new GpuTranslationState(TLB_mode, computeUnit->shader->gpuTc, true); // Currently prefetches are zero-latency, hence the sendFunctional sendFunctional(prefetch_pkt); /* safe_cast the senderState */ - X86ISA::GpuTLB::TranslationState *tlb_state = - safe_cast( + GpuTranslationState *tlb_state = + safe_cast( prefetch_pkt->senderState); @@ -1663,8 +1662,8 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt) { assert(pkt->senderState); - X86ISA::GpuTLB::TranslationState *translation_state = - safe_cast(pkt->senderState); + GpuTranslationState *translation_state = + safe_cast(pkt->senderState); // Page faults are not allowed fatal_if(!translation_state->tlbEntry, @@ -1728,8 +1727,8 @@ ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) assert(pkt->senderState); // pop off the TLB translation state - X86ISA::GpuTLB::TranslationState *translation_state - = safe_cast(pkt->senderState); + GpuTranslationState *translation_state + = safe_cast(pkt->senderState); bool success = translation_state->tlbEntry != nullptr; delete translation_state->tlbEntry; diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 16ea7cc0c6..437a48daac 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -174,7 +174,7 @@ FetchUnit::initiateFetch(Wavefront *wavefront) // Sender State needed by TLB hierarchy pkt->senderState = - new TheISA::GpuTLB::TranslationState(BaseMMU::Execute, + new GpuTranslationState(BaseMMU::Execute, computeUnit.shader->gpuTc, false, pkt->senderState); @@ -201,13 +201,13 @@ FetchUnit::initiateFetch(Wavefront *wavefront) } } else { pkt->senderState = - new TheISA::GpuTLB::TranslationState(BaseMMU::Execute, + new GpuTranslationState(BaseMMU::Execute, computeUnit.shader->gpuTc); computeUnit.sqcTLBPort.sendFunctional(pkt); - TheISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete sender_state; diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index df0295facb..ad18d01734 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -35,8 +35,6 @@ #include -#include "arch/x86/linux/linux.hh" -#include "arch/x86/page_size.hh" #include "base/chunk_generator.hh" #include "debug/GPUAgentDisp.hh" #include "debug/GPUDisp.hh" @@ -430,7 +428,7 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode) { // update senderState. Need to know the gpuTc and the TLB mode pkt->senderState = - new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); + new GpuTranslationState(mode, gpuTc, false); // even when the perLaneTLB flag is turned on // it's ok tp send all accesses through lane 0 @@ -439,8 +437,8 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode) cuList[cu_id]->tlbPort[0].sendFunctional(pkt); /* safe_cast the senderState */ - TheISA::GpuTLB::TranslationState *sender_state = - safe_cast(pkt->senderState); + GpuTranslationState *sender_state = + safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete pkt->senderState; diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 5a891c3786..6108bdfd01 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -37,6 +37,7 @@ #include #include +#include "arch/gpu_isa.hh" #include "base/statistics.hh" #include "base/stats/group.hh" #include "base/types.hh" @@ -47,7 +48,6 @@ #include "cpu/thread_state.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" -#include "gpu-compute/gpu_tlb.hh" #include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" #include "mem/page_table.hh"