configs, gpu-compute: Add configurable L1 scalar latencies

Previously the scalar cache path used the same latency parameter as the vector cache path for memory requests. This commit adds new parameters for the scalar cache path latencies. This commit also modifies the model to use the new latency parameter to set the memory request latency in the scalar cache. The new parameters are '--scalar-mem-req-latency' and '--scalar-mem-resp-latency' and are set to default values of 50 and 0 respectively. Change-Id: I7483f780f2fc0cfbc320ed1fd0c2ee3e2dfc7af2 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65511 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Jason Lowe-Power <power.jg@gmail.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
2022-11-10 20:42:25 -06:00
parent 78b978686c
commit dff879cf21
4 changed files with 37 additions and 1 deletions
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -275,6 +275,21 @@ parser.add_argument(
    default=50,
    help="Latency for responses from ruby to the cu.",
 )
+parser.add_argument(
+    "--scalar-mem-req-latency",
+    type=int,
+    default=50,
+    help="Latency for scalar requests from the cu to ruby.",
+)
+parser.add_argument(
+    "--scalar-mem-resp-latency",
+    type=int,
+    # Set to 0 as the scalar cache response path does not model
+    # response latency yet and this parameter is currently not used
+    default=0,
+    help="Latency for scalar responses from ruby to the cu.",
+)
+
 parser.add_argument(
    "--TLB-prefetch", type=int, help="prefetch depth for" "TLBs"
 )
@@ -463,6 +478,8 @@ for i in range(n_cu):
            vrf_lm_bus_latency=args.vrf_lm_bus_latency,
            mem_req_latency=args.mem_req_latency,
            mem_resp_latency=args.mem_resp_latency,
+            scalar_mem_req_latency=args.scalar_mem_req_latency,
+            scalar_mem_resp_latency=args.scalar_mem_resp_latency,
            localDataStore=LdsState(
                banks=args.numLdsBanks,
                bankConflictPenalty=args.ldsBankConflictPenalty,
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -178,6 +178,19 @@ class ComputeUnit(ClockedObject):
        "TCP and cu as well as TCP data array "
        "access. Specified in GPU clock cycles",
    )
+    scalar_mem_req_latency = Param.Int(
+        50,
+        "Latency for scalar requests from the cu to ruby. "
+        "Represents the pipeline to reach the TCP "
+        "and specified in GPU clock cycles",
+    )
+    scalar_mem_resp_latency = Param.Int(
+        0,
+        "Latency for scalar responses from ruby to the "
+        "cu. Represents the pipeline between the "
+        "TCP and cu as well as TCP data array "
+        "access. Specified in GPU clock cycles",
+    )
    system = Param.System(Parent.any, "system object")
    cu_id = Param.Int("CU id")
    vrf_to_coalescer_bus_width = Param.Int(
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -98,6 +98,10 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
+    scalar_req_tick_latency(
+            p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
+    scalar_resp_tick_latency(
+            p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    ldsPort(csprintf("%s-port", name()), this),
@@ -1786,7 +1790,7 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
            = new ComputeUnit::ScalarDataPort::MemReqEvent
                (computeUnit->scalarDataPort, req_pkt);
    computeUnit->schedule(scalar_mem_req_event, curTick() +
-                          computeUnit->req_tick_latency);
+                          computeUnit->scalar_req_tick_latency);

    return true;
 }
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -354,6 +354,8 @@ class ComputeUnit : public ClockedObject

    Tick req_tick_latency;
    Tick resp_tick_latency;
+    Tick scalar_req_tick_latency;
+    Tick scalar_resp_tick_latency;

    /**
     * Number of WFs to schedule to each SIMD. This vector is populated