diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 24665a553e..31a0a110d4 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -1094,6 +1094,10 @@ class DynInst : public ExecContext, public RefCounted
     int32_t storeTick = -1;
 #endif

+    /* Values used by the LoadToUse stat; -1 means "not recorded yet". */
+    Tick firstIssue = -1;
+    Tick lastWakeDependents = -1;
+
     /** Reads a misc. register, including any side-effects the read
      * might have as defined by the architecture.
      */
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index f59d3e62c0..d736410644 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -875,6 +875,9 @@ InstructionQueue::scheduleReadyInsts()
                 issuing_inst->issueTick = curTick() - issuing_inst->fetchTick;
 #endif

+                if (issuing_inst->firstIssue == -1)
+                    issuing_inst->firstIssue = curTick();
+
                 if (!issuing_inst->isMemRef()) {
                     // Memory instructions can not be freed from the IQ until they
                     // complete.
@@ -966,6 +969,8 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst)
         iqIOStats.intInstQueueWakeupAccesses++;
     }

+    completed_inst->lastWakeDependents = curTick();
+
     DPRINTF(IQ, "Waking dependents of completed instruction.\n");

     assert(!completed_inst->isSquashed());
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index eb0e1e90e3..a2b1c6d67f 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -275,8 +275,14 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
                "Number of loads that were rescheduled"),
       ADD_STAT(blockedByCache, statistics::units::Count::get(),
                "Number of times an access to memory failed due to the cache "
-               "being blocked")
+               "being blocked"),
+      ADD_STAT(loadToUse, statistics::units::Cycle::get(),
+               "Distribution of cycle latency between the first time a load "
+               "is issued and its completion")
 {
+    loadToUse
+        .init(0, 299, 10)
+        .flags(statistics::nozero);
 }

 void
@@ -713,8 +719,19 @@ LSQUnit::commitLoad()
 {
     assert(loadQueue.front().valid());

+    DynInstPtr inst = loadQueue.front().instruction();
+
     DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
-            loadQueue.front().instruction()->pcState());
+            inst->pcState());
+
+    // Update the load-to-use histogram with this load's latency.
+    // Only take latency from demand loads that were issued and did not fault
+    if (!inst->isInstPrefetch() && !inst->isDataPrefetch()
+            && inst->firstIssue != -1
+            && inst->lastWakeDependents != -1) {
+        stats.loadToUse.sample(cpu->ticksToCycles(
+                    inst->lastWakeDependents - inst->firstIssue));
+    }

     loadQueue.front().clear();
     loadQueue.pop_front();
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 0681b1377c..790054f6b6 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -571,6 +571,10 @@ class LSQUnit

         /** Number of times the LSQ is blocked due to the cache. */
         statistics::Scalar blockedByCache;
+
+        /** Distribution of cycle latency between the first time a load
+         * is issued and its completion */
+        statistics::Distribution loadToUse;
     } stats;

   public: