From 3112a7f0d0034cab48e9577f1e9dbbb547390b2d Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 1 Sep 2021 14:30:35 -0500
Subject: [PATCH] arch-gcn3,gpu-compute: Move GCN3 specific TLB to arch

Move GpuTLB and TLBCoalescer to GCN3, as the TLB format is specific to
GCN3 and to SE mode / APU simulation. Vega will have its own TLB,
coalescer, and walker suitable for a dGPU. This also adds a using alias
(GpuTranslationState) for the TLB translation state to reduce the number
of references to TheISA and X86ISA. X86-specific includes are also
removed.

Change-Id: I34448bb4e5ddb9980b34a55bc717bbcea0e03db5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/49847
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/amdgpu/gcn3/SConscript               |  5 +++
 .../amdgpu/gcn3}/X86GPUTLB.py                 |  5 +--
 src/arch/amdgpu/gcn3/gpu_isa.hh               |  1 +
 .../gpu_tlb.cc => arch/amdgpu/gcn3/tlb.cc}    | 11 +++---
 .../gpu_tlb.hh => arch/amdgpu/gcn3/tlb.hh}    |  2 ++
 .../amdgpu/gcn3}/tlb_coalescer.cc             | 30 ++++++++--------
 .../amdgpu/gcn3}/tlb_coalescer.hh             |  2 +-
 src/gpu-compute/SConscript                    |  3 --
 src/gpu-compute/compute_unit.cc               | 35 +++++++++----------
 src/gpu-compute/fetch_unit.cc                 |  8 ++---
 src/gpu-compute/shader.cc                     |  8 ++---
 src/gpu-compute/shader.hh                     |  2 +-
 12 files changed, 57 insertions(+), 55 deletions(-)
 rename src/{gpu-compute => arch/amdgpu/gcn3}/X86GPUTLB.py (97%)
 rename src/{gpu-compute/gpu_tlb.cc => arch/amdgpu/gcn3/tlb.cc} (99%)
 rename src/{gpu-compute/gpu_tlb.hh => arch/amdgpu/gcn3/tlb.hh} (99%)
 rename src/{gpu-compute => arch/amdgpu/gcn3}/tlb_coalescer.cc (95%)
 rename src/{gpu-compute => arch/amdgpu/gcn3}/tlb_coalescer.hh (99%)

diff --git a/src/arch/amdgpu/gcn3/SConscript b/src/arch/amdgpu/gcn3/SConscript
index 61c93c3391..dc4660f7c9 100644
--- a/src/arch/amdgpu/gcn3/SConscript
+++ b/src/arch/amdgpu/gcn3/SConscript
@@ -39,10 +39,15 @@ if not env['BUILD_GPU']:
     Return()
 
 if env['TARGET_GPU_ISA'] == 'gcn3':
+    SimObject('X86GPUTLB.py')
+
     Source('decoder.cc')
     Source('insts/gpu_static_inst.cc')
     Source('insts/instructions.cc')
     Source('insts/op_encodings.cc')
     Source('isa.cc')
     Source('registers.cc')
+    Source('tlb.cc')
+    Source('tlb_coalescer.cc')
+
     DebugFlag('GCN3', 'Debug flag for GCN3 GPU ISA')
diff --git a/src/gpu-compute/X86GPUTLB.py b/src/arch/amdgpu/gcn3/X86GPUTLB.py
similarity index 97%
rename from src/gpu-compute/X86GPUTLB.py
rename to src/arch/amdgpu/gcn3/X86GPUTLB.py
index ab14bf8881..1c7f1d0247 100644
--- a/src/gpu-compute/X86GPUTLB.py
+++ b/src/arch/amdgpu/gcn3/X86GPUTLB.py
@@ -39,7 +39,7 @@ from m5.SimObject import SimObject
 class X86GPUTLB(ClockedObject):
     type = 'X86GPUTLB'
     cxx_class = 'gem5::X86ISA::GpuTLB'
-    cxx_header = 'gpu-compute/gpu_tlb.hh'
+    cxx_header = 'arch/amdgpu/gcn3/tlb.hh'
     size = Param.Int(64, "TLB size (number of entries)")
     assoc = Param.Int(64, "TLB associativity")
 
@@ -63,7 +63,8 @@ class X86GPUTLB(ClockedObject):
 class TLBCoalescer(ClockedObject):
     type = 'TLBCoalescer'
     cxx_class = 'gem5::TLBCoalescer'
-    cxx_header = 'gpu-compute/tlb_coalescer.hh'
+    cxx_header = 'arch/amdgpu/gcn3/tlb_coalescer.hh'
+
     probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
     coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
     cpu_side_ports = VectorResponsePort("Port on side closer to CPU/CU")
diff --git a/src/arch/amdgpu/gcn3/gpu_isa.hh b/src/arch/amdgpu/gcn3/gpu_isa.hh
index 65136bb3ad..205f097c4b 100644
--- a/src/arch/amdgpu/gcn3/gpu_isa.hh
+++ b/src/arch/amdgpu/gcn3/gpu_isa.hh
@@ -38,6 +38,7 @@
 #include <type_traits>
 
 #include "arch/amdgpu/gcn3/gpu_registers.hh"
+#include "arch/amdgpu/gcn3/tlb.hh"
 #include "gpu-compute/dispatcher.hh"
 #include "gpu-compute/hsa_queue_entry.hh"
 #include "gpu-compute/misc.hh"
diff --git a/src/gpu-compute/gpu_tlb.cc b/src/arch/amdgpu/gcn3/tlb.cc
similarity index 99%
rename from src/gpu-compute/gpu_tlb.cc
rename to src/arch/amdgpu/gcn3/tlb.cc
index e2225a0ffd..4a59c32b63 100644
--- a/src/gpu-compute/gpu_tlb.cc
+++ b/src/arch/amdgpu/gcn3/tlb.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * Copyright (c) 2011-2021 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
@@ -14,9 +14,9 @@
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -30,10 +30,9 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * Author: Lisa Hsu
  */
 
-#include "gpu-compute/gpu_tlb.hh"
+#include "arch/amdgpu/gcn3/tlb.hh"
 
 #include <cmath>
 #include <cstring>
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/arch/amdgpu/gcn3/tlb.hh
similarity index 99%
rename from src/gpu-compute/gpu_tlb.hh
rename to src/arch/amdgpu/gcn3/tlb.hh
index 4652a73d04..944c0ac59c 100644
--- a/src/gpu-compute/gpu_tlb.hh
+++ b/src/arch/amdgpu/gcn3/tlb.hh
@@ -438,6 +438,8 @@ namespace X86ISA
     };
 }
 
+using GpuTranslationState = X86ISA::GpuTLB::TranslationState;
+
 } // namespace gem5
 
 #endif // __GPU_TLB_HH__
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/arch/amdgpu/gcn3/tlb_coalescer.cc
similarity index 95%
rename from src/gpu-compute/tlb_coalescer.cc
rename to src/arch/amdgpu/gcn3/tlb_coalescer.cc
index d82fa7ea85..9b53db8688 100644
--- a/src/gpu-compute/tlb_coalescer.cc
+++ b/src/arch/amdgpu/gcn3/tlb_coalescer.cc
@@ -31,7 +31,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "gpu-compute/tlb_coalescer.hh"
+#include "arch/amdgpu/gcn3/tlb_coalescer.hh"
 
 #include <cstring>
 
@@ -101,11 +101,11 @@ TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
     if (disableCoalescing)
         return false;
 
-    TheISA::GpuTLB::TranslationState *incoming_state =
-      safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
+    GpuTranslationState *incoming_state =
+      safe_cast<GpuTranslationState*>(incoming_pkt->senderState);
 
-    TheISA::GpuTLB::TranslationState *coalesced_state =
-     safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
+    GpuTranslationState *coalesced_state =
+     safe_cast<GpuTranslationState*>(coalesced_pkt->senderState);
 
     // Rule 1: Coalesce requests only if they
     // fall within the same virtual page
@@ -148,8 +148,8 @@ TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
     DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
             issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
 
-    TheISA::GpuTLB::TranslationState *sender_state =
-        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *sender_state =
+        safe_cast<GpuTranslationState*>(pkt->senderState);
 
     TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
     assert(tlb_entry);
@@ -167,8 +167,8 @@ TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
 
     for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
         PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
-        TheISA::GpuTLB::TranslationState *sender_state =
-            safe_cast<TheISA::GpuTLB::TranslationState*>(
+        GpuTranslationState *sender_state =
+            safe_cast<GpuTranslationState*>(
                     local_pkt->senderState);
 
         // we are sending the packet back, so pop the reqCnt associated
@@ -238,8 +238,8 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
     // number of coalesced reqs for a given window
     int coalescedReq_cnt = 0;
 
-    TheISA::GpuTLB::TranslationState *sender_state =
-        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *sender_state =
+        safe_cast<GpuTranslationState*>(pkt->senderState);
 
     // push back the port to remember the path back
     sender_state->ports.push_back(this);
@@ -337,8 +337,8 @@ void
 TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
 {
 
-    TheISA::GpuTLB::TranslationState *sender_state =
-        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *sender_state =
+        safe_cast<GpuTranslationState*>(pkt->senderState);
 
     bool update_stats = !sender_state->isPrefetch;
 
@@ -460,8 +460,8 @@ TLBCoalescer::processProbeTLBEvent()
                 rejected = true;
                 ++vector_index;
             } else {
-                TheISA::GpuTLB::TranslationState *tmp_sender_state =
-                    safe_cast<TheISA::GpuTLB::TranslationState*>
+                GpuTranslationState *tmp_sender_state =
+                    safe_cast<GpuTranslationState*>
                     (first_packet->senderState);
 
                 bool update_stats = !tmp_sender_state->isPrefetch;
diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/arch/amdgpu/gcn3/tlb_coalescer.hh
similarity index 99%
rename from src/gpu-compute/tlb_coalescer.hh
rename to src/arch/amdgpu/gcn3/tlb_coalescer.hh
index fce87406b2..afe12c942a 100644
--- a/src/gpu-compute/tlb_coalescer.hh
+++ b/src/arch/amdgpu/gcn3/tlb_coalescer.hh
@@ -39,13 +39,13 @@
 #include <string>
 #include <vector>
 
+#include "arch/amdgpu/gcn3/tlb.hh"
 #include "arch/generic/tlb.hh"
 #include "arch/x86/isa.hh"
 #include "arch/x86/pagetable.hh"
 #include "arch/x86/regs/segment.hh"
 #include "base/logging.hh"
 #include "base/statistics.hh"
-#include "gpu-compute/gpu_tlb.hh"
 #include "mem/port.hh"
 #include "mem/request.hh"
 #include "params/TLBCoalescer.hh"
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
index ae0bfab441..2ccf1b7c07 100644
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -39,7 +39,6 @@ if not env['BUILD_GPU']:
 SimObject('GPU.py')
 SimObject('GPUStaticInstFlags.py')
 SimObject('LdsState.py')
-SimObject('X86GPUTLB.py')
 
 Source('comm.cc')
 Source('compute_unit.cc')
@@ -54,7 +53,6 @@ Source('gpu_dyn_inst.cc')
 Source('gpu_exec_context.cc')
 Source('gpu_render_driver.cc')
 Source('gpu_static_inst.cc')
-Source('gpu_tlb.cc')
 Source('lds_state.cc')
 Source('local_memory_pipeline.cc')
 Source('pool_manager.cc')
@@ -69,7 +67,6 @@ Source('shader.cc')
 Source('dyn_pool_manager.cc')
 Source('simple_pool_manager.cc')
 Source('static_register_manager_policy.cc')
-Source('tlb_coalescer.cc')
 Source('vector_register_file.cc')
 Source('wavefront.cc')
 
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 5e3b8d2f1d..feef552bf2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -35,7 +35,6 @@
 
 #include <limits>
 
-#include "arch/x86/page_size.hh"
 #include "base/output.hh"
 #include "debug/GPUDisp.hh"
 #include "debug/GPUExec.hh"
@@ -1076,8 +1075,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
         pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
 
         // This is the senderState needed by the TLB hierarchy to function
-        X86ISA::GpuTLB::TranslationState *translation_state =
-          new X86ISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
+        GpuTranslationState *translation_state =
+          new GpuTranslationState(TLB_mode, shader->gpuTc, false,
                                                pkt->senderState);
 
         pkt->senderState = translation_state;
@@ -1091,8 +1090,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
             stats.hitsPerTLBLevel[hit_level]++;
 
             // New SenderState for the memory access
-            X86ISA::GpuTLB::TranslationState *sender_state =
-                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+            GpuTranslationState *sender_state =
+                safe_cast<GpuTranslationState*>(pkt->senderState);
 
             delete sender_state->tlbEntry;
             delete sender_state->saved;
@@ -1169,7 +1168,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
         delete pkt->senderState;
 
         // Because it's atomic operation, only need TLB translation state
-        pkt->senderState = new X86ISA::GpuTLB::TranslationState(TLB_mode,
+        pkt->senderState = new GpuTranslationState(TLB_mode,
                                                                 shader->gpuTc);
 
         tlbPort[tlbPort_index].sendFunctional(pkt);
@@ -1190,8 +1189,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
                 new_pkt->req->getPaddr());
 
         // safe_cast the senderState
-        X86ISA::GpuTLB::TranslationState *sender_state =
-             safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+        GpuTranslationState *sender_state =
+             safe_cast<GpuTranslationState*>(pkt->senderState);
 
         delete sender_state->tlbEntry;
         delete new_pkt;
@@ -1211,7 +1210,7 @@ ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
         new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
 
     pkt->senderState =
-        new X86ISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
+        new GpuTranslationState(tlb_mode, shader->gpuTc, false,
                                              pkt->senderState);
 
     if (scalarDTLBPort.isStalled()) {
@@ -1397,8 +1396,8 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
     computeUnit->stats.tlbCycles += curTick();
 
     // pop off the TLB translation state
-    X86ISA::GpuTLB::TranslationState *translation_state =
-               safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *translation_state =
+               safe_cast<GpuTranslationState*>(pkt->senderState);
 
     // no PageFaults are permitted for data accesses
     if (!translation_state->tlbEntry) {
@@ -1508,15 +1507,15 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
 
             // Because it's atomic operation, only need TLB translation state
             prefetch_pkt->senderState =
-                new X86ISA::GpuTLB::TranslationState(TLB_mode,
+                new GpuTranslationState(TLB_mode,
                     computeUnit->shader->gpuTc, true);
 
             // Currently prefetches are zero-latency, hence the sendFunctional
             sendFunctional(prefetch_pkt);
 
             /* safe_cast the senderState */
-            X86ISA::GpuTLB::TranslationState *tlb_state =
-                 safe_cast<X86ISA::GpuTLB::TranslationState*>(
+            GpuTranslationState *tlb_state =
+                 safe_cast<GpuTranslationState*>(
                          prefetch_pkt->senderState);
 
 
@@ -1663,8 +1662,8 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
 {
     assert(pkt->senderState);
 
-    X86ISA::GpuTLB::TranslationState *translation_state =
-        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *translation_state =
+        safe_cast<GpuTranslationState*>(pkt->senderState);
 
     // Page faults are not allowed
     fatal_if(!translation_state->tlbEntry,
@@ -1728,8 +1727,8 @@ ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
     assert(pkt->senderState);
 
     // pop off the TLB translation state
-    X86ISA::GpuTLB::TranslationState *translation_state
-        = safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *translation_state
+        = safe_cast<GpuTranslationState*>(pkt->senderState);
 
     bool success = translation_state->tlbEntry != nullptr;
     delete translation_state->tlbEntry;
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
index 16ea7cc0c6..437a48daac 100644
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -174,7 +174,7 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
 
         // Sender State needed by TLB hierarchy
         pkt->senderState =
-            new TheISA::GpuTLB::TranslationState(BaseMMU::Execute,
+            new GpuTranslationState(BaseMMU::Execute,
                                                  computeUnit.shader->gpuTc,
                                                  false, pkt->senderState);
 
@@ -201,13 +201,13 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
         }
     } else {
         pkt->senderState =
-            new TheISA::GpuTLB::TranslationState(BaseMMU::Execute,
+            new GpuTranslationState(BaseMMU::Execute,
                                                  computeUnit.shader->gpuTc);
 
         computeUnit.sqcTLBPort.sendFunctional(pkt);
 
-        TheISA::GpuTLB::TranslationState *sender_state =
-             safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+        GpuTranslationState *sender_state =
+             safe_cast<GpuTranslationState*>(pkt->senderState);
 
         delete sender_state->tlbEntry;
         delete sender_state;
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index df0295facb..ad18d01734 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -35,8 +35,6 @@
 
 #include <limits>
 
-#include "arch/x86/linux/linux.hh"
-#include "arch/x86/page_size.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUAgentDisp.hh"
 #include "debug/GPUDisp.hh"
@@ -430,7 +428,7 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
 {
     // update senderState. Need to know the gpuTc and the TLB mode
     pkt->senderState =
-        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
+        new GpuTranslationState(mode, gpuTc, false);
 
     // even when the perLaneTLB flag is turned on
     // it's ok tp send all accesses through lane 0
@@ -439,8 +437,8 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
     cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
 
     /* safe_cast the senderState */
-    TheISA::GpuTLB::TranslationState *sender_state =
-               safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+    GpuTranslationState *sender_state =
+               safe_cast<GpuTranslationState*>(pkt->senderState);
 
     delete sender_state->tlbEntry;
     delete pkt->senderState;
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 5a891c3786..6108bdfd01 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -37,6 +37,7 @@
 #include <functional>
 #include <string>
 
+#include "arch/gpu_isa.hh"
 #include "base/statistics.hh"
 #include "base/stats/group.hh"
 #include "base/types.hh"
@@ -47,7 +48,6 @@
 #include "cpu/thread_state.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
-#include "gpu-compute/gpu_tlb.hh"
 #include "gpu-compute/hsa_queue_entry.hh"
 #include "gpu-compute/lds_state.hh"
 #include "mem/page_table.hh"