From dff879cf21ee609cca3662073cd89cb9322146be Mon Sep 17 00:00:00 2001
From: vramadas95 <vramadas@wisc.edu>
Date: Thu, 10 Nov 2022 20:42:25 -0600
Subject: [PATCH] configs, gpu-compute: Add configurable L1 scalar latencies

Previously the scalar cache path used the same latency parameter as the
vector cache path for memory requests. This commit adds new parameters
for the scalar cache path latencies. This commit also modifies the model
to use the new latency parameter to set the memory request latency in
the scalar cache. The new parameters are '--scalar-mem-req-latency' and
'--scalar-mem-resp-latency' and are set to default values of 50 and 0,
respectively.

Change-Id: I7483f780f2fc0cfbc320ed1fd0c2ee3e2dfc7af2
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65511
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
---
 configs/example/apu_se.py       | 17 +++++++++++++++++
 src/gpu-compute/GPU.py          | 13 +++++++++++++
 src/gpu-compute/compute_unit.cc |  6 +++++-
 src/gpu-compute/compute_unit.hh |  2 ++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index b33daa5b39..39def024fc 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -275,6 +275,21 @@ parser.add_argument(
     default=50,
     help="Latency for responses from ruby to the cu.",
 )
+parser.add_argument(
+    "--scalar-mem-req-latency",
+    type=int,
+    default=50,
+    help="Latency for scalar requests from the cu to ruby.",
+)
+parser.add_argument(
+    "--scalar-mem-resp-latency",
+    type=int,
+    # Set to 0 as the scalar cache response path does not model
+    # response latency yet and this parameter is currently not used
+    default=0,
+    help="Latency for scalar responses from ruby to the cu.",
+)
+
 parser.add_argument(
     "--TLB-prefetch", type=int, help="prefetch depth for" "TLBs"
 )
@@ -463,6 +478,8 @@ for i in range(n_cu):
             vrf_lm_bus_latency=args.vrf_lm_bus_latency,
             mem_req_latency=args.mem_req_latency,
             mem_resp_latency=args.mem_resp_latency,
+            scalar_mem_req_latency=args.scalar_mem_req_latency,
+            scalar_mem_resp_latency=args.scalar_mem_resp_latency,
             localDataStore=LdsState(
                 banks=args.numLdsBanks,
                 bankConflictPenalty=args.ldsBankConflictPenalty,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 517d1801c0..0fdc0b75a7 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -178,6 +178,19 @@ class ComputeUnit(ClockedObject):
         "TCP and cu as well as TCP data array "
         "access. Specified in GPU clock cycles",
     )
+    scalar_mem_req_latency = Param.Int(
+        50,
+        "Latency for scalar requests from the cu to ruby. "
+        "Represents the pipeline to reach the TCP "
+        "and specified in GPU clock cycles",
+    )
+    scalar_mem_resp_latency = Param.Int(
+        0,
+        "Latency for scalar responses from ruby to the "
+        "cu. Represents the pipeline between the "
+        "TCP and cu as well as TCP data array "
+        "access. Specified in GPU clock cycles",
+    )
     system = Param.System(Parent.any, "system object")
     cu_id = Param.Int("CU id")
     vrf_to_coalescer_bus_width = Param.Int(
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 8498ea475e..62cfbf94cf 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -98,6 +98,10 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
     countPages(p.countPages),
     req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
     resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
+    scalar_req_tick_latency(
+            p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
+    scalar_resp_tick_latency(
+            p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
     _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
     lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
     ldsPort(csprintf("%s-port", name()), this),
@@ -1786,7 +1790,7 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
             = new ComputeUnit::ScalarDataPort::MemReqEvent
                 (computeUnit->scalarDataPort, req_pkt);
     computeUnit->schedule(scalar_mem_req_event, curTick() +
-                          computeUnit->req_tick_latency);
+                          computeUnit->scalar_req_tick_latency);
 
     return true;
 }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a080e3dc1a..fcc4468ec1 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -354,6 +354,8 @@ class ComputeUnit : public ClockedObject
 
     Tick req_tick_latency;
     Tick resp_tick_latency;
+    Tick scalar_req_tick_latency;
+    Tick scalar_resp_tick_latency;
 
     /**
      * Number of WFs to schedule to each SIMD. This vector is populated