From 23cb3a9fa151108e0701d05b83697391cc4128bf Mon Sep 17 00:00:00 2001
From: Tom Rollet <tom.rollet@huawei.com>
Date: Tue, 1 Jun 2021 14:45:48 +0200
Subject: [PATCH] cpu-o3: Add loadToUse stat

Add a stat to the o3 model that tracks the latency of load instructions
(no SWP) between issue and the wake-up of dependent instructions.

The max latency tracked in the stat histogram is currently
fixed to 299 cycles and should be changed if someone wants to
track high-latency memory accesses more precisely.

Change-Id: I5973a4aa279bcc388d1a32b706c2e4f5e3f25e75
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/46679
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/cpu/o3/dyn_inst.hh   |  4 ++++
 src/cpu/o3/inst_queue.cc |  5 +++++
 src/cpu/o3/lsq_unit.cc   | 20 ++++++++++++++++++--
 src/cpu/o3/lsq_unit.hh   |  4 ++++
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 24665a553e..31a0a110d4 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -1094,6 +1094,10 @@ class DynInst : public ExecContext, public RefCounted
     int32_t storeTick = -1;
 #endif
 
+    /* Values used by LoadToUse stat */
+    Tick firstIssue = -1;
+    Tick lastWakeDependents = -1;
+
     /** Reads a misc. register, including any side-effects the read
      * might have as defined by the architecture.
      */
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index f59d3e62c0..d736410644 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -875,6 +875,9 @@ InstructionQueue::scheduleReadyInsts()
             issuing_inst->issueTick = curTick() - issuing_inst->fetchTick;
 #endif
 
+            if (issuing_inst->firstIssue == -1)
+                issuing_inst->firstIssue = curTick();
+
             if (!issuing_inst->isMemRef()) {
                 // Memory instructions can not be freed from the IQ until they
                 // complete.
@@ -966,6 +969,8 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst)
         iqIOStats.intInstQueueWakeupAccesses++;
     }
 
+    completed_inst->lastWakeDependents = curTick();
+
     DPRINTF(IQ, "Waking dependents of completed instruction.\n");
 
     assert(!completed_inst->isSquashed());
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index eb0e1e90e3..a2b1c6d67f 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -275,8 +275,13 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
                "Number of loads that were rescheduled"),
       ADD_STAT(blockedByCache, statistics::units::Count::get(),
                "Number of times an access to memory failed due to the cache "
-               "being blocked")
+               "being blocked"),
+      ADD_STAT(loadToUse, "Distribution of cycle latency between the "
+                "first time a load is issued and its completion")
 {
+    loadToUse
+        .init(0, 299, 10)
+        .flags(statistics::nozero);
 }
 
 void
@@ -713,8 +718,19 @@ LSQUnit::commitLoad()
 {
     assert(loadQueue.front().valid());
 
+    DynInstPtr inst = loadQueue.front().instruction();
+
     DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
-            loadQueue.front().instruction()->pcState());
+            inst->pcState());
+
+    // Update histogram with memory latency from load
+    // Only take latency from demand loads that were issued and did not fault
+    if (!inst->isInstPrefetch() && !inst->isDataPrefetch()
+            && inst->firstIssue != -1
+            && inst->lastWakeDependents != -1) {
+        stats.loadToUse.sample(cpu->ticksToCycles(
+                    inst->lastWakeDependents - inst->firstIssue));
+    }
 
     loadQueue.front().clear();
     loadQueue.pop_front();
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 0681b1377c..790054f6b6 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -571,6 +571,10 @@ class LSQUnit
 
         /** Number of times the LSQ is blocked due to the cache. */
         statistics::Scalar blockedByCache;
+
+        /** Distribution of cycle latency between the first time a load
+         * is issued and its completion */
+        statistics::Distribution loadToUse;
     } stats;
 
   public: