From 23cb3a9fa151108e0701d05b83697391cc4128bf Mon Sep 17 00:00:00 2001 From: Tom Rollet Date: Tue, 1 Jun 2021 14:45:48 +0200 Subject: [PATCH] cpu-o3: Add loadToUse stat Add stat in o3 model to track the latency of load instructions (no SWP) between issue and waking up of dependent instructions. The max latency tracked in the stat histogram is currently fixed to 299 and should be changed if someone wants to track more precisely high latency memory access. Change-Id: I5973a4aa279bcc388d1a32b706c2e4f5e3f25e75 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/46679 Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: kokoro --- src/cpu/o3/dyn_inst.hh | 4 ++++ src/cpu/o3/inst_queue.cc | 5 +++++ src/cpu/o3/lsq_unit.cc | 20 ++++++++++++++++++-- src/cpu/o3/lsq_unit.hh | 4 ++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index 24665a553e..31a0a110d4 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -1094,6 +1094,10 @@ class DynInst : public ExecContext, public RefCounted int32_t storeTick = -1; #endif + /* Values used by LoadToUse stat */ + Tick firstIssue = -1; + Tick lastWakeDependents = -1; + /** Reads a misc. register, including any side-effects the read * might have as defined by the architecture. */ diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index f59d3e62c0..d736410644 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -875,6 +875,9 @@ InstructionQueue::scheduleReadyInsts() issuing_inst->issueTick = curTick() - issuing_inst->fetchTick; #endif + if (issuing_inst->firstIssue == -1) + issuing_inst->firstIssue = curTick(); + if (!issuing_inst->isMemRef()) { // Memory instructions can not be freed from the IQ until they // complete. 
@@ -966,6 +969,8 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst) iqIOStats.intInstQueueWakeupAccesses++; } + completed_inst->lastWakeDependents = curTick(); + DPRINTF(IQ, "Waking dependents of completed instruction.\n"); assert(!completed_inst->isSquashed()); diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index eb0e1e90e3..a2b1c6d67f 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -275,8 +275,13 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent) "Number of loads that were rescheduled"), ADD_STAT(blockedByCache, statistics::units::Count::get(), "Number of times an access to memory failed due to the cache " - "being blocked") + "being blocked"), + ADD_STAT(loadToUse, "Distribution of cycle latency between the " + "first time a load is issued and its completion") { + loadToUse + .init(0, 299, 10) + .flags(statistics::nozero); } void @@ -713,8 +718,19 @@ LSQUnit::commitLoad() { assert(loadQueue.front().valid()); + DynInstPtr inst = loadQueue.front().instruction(); + DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n", - loadQueue.front().instruction()->pcState()); + inst->pcState()); + + // Update histogram with memory latency from load + // Only take latency from load demand that were issued and did not fault + if (!inst->isInstPrefetch() && !inst->isDataPrefetch() + && inst->firstIssue != -1 + && inst->lastWakeDependents != -1) { + stats.loadToUse.sample(cpu->ticksToCycles( + inst->lastWakeDependents - inst->firstIssue)); + } loadQueue.front().clear(); loadQueue.pop_front(); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 0681b1377c..790054f6b6 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -571,6 +571,10 @@ class LSQUnit /** Number of times the LSQ is blocked due to the cache. 
*/ statistics::Scalar blockedByCache; + + /** Distribution of cycle latency between the first time a load + * is issued and its completion */ + statistics::Distribution loadToUse; } stats; public: