diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index b33daa5b39..39def024fc 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -275,6 +275,21 @@ parser.add_argument(
     default=50,
     help="Latency for responses from ruby to the cu.",
 )
+parser.add_argument(
+    "--scalar-mem-req-latency",
+    type=int,
+    default=50,
+    help="Latency for scalar requests from the cu to ruby.",
+)
+parser.add_argument(
+    "--scalar-mem-resp-latency",
+    type=int,
+    # Defaults to 0: the scalar cache response path does not model
+    # response latency yet, so this option currently has no effect.
+    default=0,
+    help="Latency for scalar responses from ruby to the cu.",
+)
+
 parser.add_argument(
     "--TLB-prefetch", type=int, help="prefetch depth for" "TLBs"
 )
@@ -463,6 +478,8 @@ for i in range(n_cu):
             vrf_lm_bus_latency=args.vrf_lm_bus_latency,
             mem_req_latency=args.mem_req_latency,
             mem_resp_latency=args.mem_resp_latency,
+            scalar_mem_req_latency=args.scalar_mem_req_latency,
+            scalar_mem_resp_latency=args.scalar_mem_resp_latency,
             localDataStore=LdsState(
                 banks=args.numLdsBanks,
                 bankConflictPenalty=args.ldsBankConflictPenalty,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 517d1801c0..0fdc0b75a7 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -178,6 +178,19 @@ class ComputeUnit(ClockedObject):
         "TCP and cu as well as TCP data array "
         "access. Specified in GPU clock cycles",
     )
+    scalar_mem_req_latency = Param.Int(
+        50,
+        "Latency for scalar requests from the cu to ruby. "
+        "Represents the pipeline to reach the TCP "
+        "and specified in GPU clock cycles",
+    )
+    scalar_mem_resp_latency = Param.Int(
+        0,
+        "Latency for scalar responses from ruby to the cu, "
+        "in GPU clock cycles. Defaults to 0 because the "
+        "scalar cache response path does not model response "
+        "latency yet, so this parameter is currently unused.",
+    )
     system = Param.System(Parent.any, "system object")
     cu_id = Param.Int("CU id")
     vrf_to_coalescer_bus_width = Param.Int(
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 8498ea475e..62cfbf94cf 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -98,6 +98,10 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
     countPages(p.countPages),
     req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
     resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
+    scalar_req_tick_latency(
+            p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
+    scalar_resp_tick_latency(
+            p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
     _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
     lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
     ldsPort(csprintf("%s-port", name()), this),
@@ -1786,7 +1790,7 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
             = new ComputeUnit::ScalarDataPort::MemReqEvent
                 (computeUnit->scalarDataPort, req_pkt);
     computeUnit->schedule(scalar_mem_req_event, curTick() +
-                          computeUnit->req_tick_latency);
+                          computeUnit->scalar_req_tick_latency);
 
     return true;
 }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index a080e3dc1a..fcc4468ec1 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -354,6 +354,8 @@ class ComputeUnit : public ClockedObject
 
     Tick req_tick_latency;
     Tick resp_tick_latency;
+    Tick scalar_req_tick_latency;  // scalar request latency, in ticks
+    Tick scalar_resp_tick_latency; // scalar response latency; not used yet
 
     /**
      * Number of WFs to schedule to each SIMD. This vector is populated