cpu-o3: Add loadToUse stat

Add stat in o3 model to track the latency of load instructions (no SWP) between issue and waking up of dependent instructions. The max latency tracked in the stat histogram is currently fixed to 299 and should be changed if someone wants to track high latency memory accesses more precisely. Change-Id: I5973a4aa279bcc388d1a32b706c2e4f5e3f25e75 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/46679 Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Jason Lowe-Power <power.jg@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
2021-06-01 14:45:48 +02:00
parent a83c2f30df
commit 23cb3a9fa1
4 changed files with 31 additions and 2 deletions
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -1094,6 +1094,10 @@ class DynInst : public ExecContext, public RefCounted
    int32_t storeTick = -1;
 #endif

+    /* Values used by LoadToUse stat */
+    Tick firstIssue = -1;
+    Tick lastWakeDependents = -1;
+
    /** Reads a misc. register, including any side-effects the read
     * might have as defined by the architecture.
     */
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -875,6 +875,9 @@ InstructionQueue::scheduleReadyInsts()
            issuing_inst->issueTick = curTick() - issuing_inst->fetchTick;
 #endif

+            if (issuing_inst->firstIssue == -1)
+                issuing_inst->firstIssue = curTick();
+
            if (!issuing_inst->isMemRef()) {
                // Memory instructions can not be freed from the IQ until they
                // complete.
@@ -966,6 +969,8 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst)
        iqIOStats.intInstQueueWakeupAccesses++;
    }

+    completed_inst->lastWakeDependents = curTick();
+
    DPRINTF(IQ, "Waking dependents of completed instruction.\n");

    assert(!completed_inst->isSquashed());
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -275,8 +275,13 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
               "Number of loads that were rescheduled"),
      ADD_STAT(blockedByCache, statistics::units::Count::get(),
               "Number of times an access to memory failed due to the cache "
-               "being blocked")
+               "being blocked"),
+      ADD_STAT(loadToUse, "Distribution of cycle latency between the "
+                "first time a load is issued and its completion")
 {
+    loadToUse
+        .init(0, 299, 10)
+        .flags(statistics::nozero);
 }

 void
@@ -713,8 +718,19 @@ LSQUnit::commitLoad()
 {
    assert(loadQueue.front().valid());

+    DynInstPtr inst = loadQueue.front().instruction();
+
    DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
-            loadQueue.front().instruction()->pcState());
+            inst->pcState());
+
+    // Update histogram with memory latency from load
+    // Only take latency from demand loads that were issued and did not fault
+    if (!inst->isInstPrefetch() && !inst->isDataPrefetch()
+            && inst->firstIssue != -1
+            && inst->lastWakeDependents != -1) {
+        stats.loadToUse.sample(cpu->ticksToCycles(
+                    inst->lastWakeDependents - inst->firstIssue));
+    }

    loadQueue.front().clear();
    loadQueue.pop_front();
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -571,6 +571,10 @@ class LSQUnit

        /** Number of times the LSQ is blocked due to the cache. */
        statistics::Scalar blockedByCache;
+
+        /** Distribution of cycle latency between the first time a load
+         * is issued and its completion */
+        statistics::Distribution loadToUse;
    } stats;

  public: