diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 24665a553e..31a0a110d4 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -1094,6 +1094,10 @@ class DynInst : public ExecContext, public RefCounted
     int32_t storeTick = -1;
 #endif
 
+    /* Ticks recorded for the loadToUse stat; -1 means not yet set */
+    Tick firstIssue = -1;
+    Tick lastWakeDependents = -1;
+
     /** Reads a misc. register, including any side-effects the read
      * might have as defined by the architecture.
      */
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index f59d3e62c0..d736410644 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -875,6 +875,9 @@ InstructionQueue::scheduleReadyInsts()
             issuing_inst->issueTick = curTick() - issuing_inst->fetchTick;
 #endif
 
+            if (issuing_inst->firstIssue == -1)
+                issuing_inst->firstIssue = curTick();
+
             if (!issuing_inst->isMemRef()) {
                 // Memory instructions can not be freed from the IQ until they
                 // complete.
@@ -966,6 +969,8 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst)
         iqIOStats.intInstQueueWakeupAccesses++;
     }
 
+    completed_inst->lastWakeDependents = curTick();
+
     DPRINTF(IQ, "Waking dependents of completed instruction.\n");
 
     assert(!completed_inst->isSquashed());
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index eb0e1e90e3..a2b1c6d67f 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -275,8 +275,13 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
                "Number of loads that were rescheduled"),
       ADD_STAT(blockedByCache, statistics::units::Count::get(),
                "Number of times an access to memory failed due to the cache "
-               "being blocked")
+               "being blocked"),
+      ADD_STAT(loadToUse, "Distribution of cycle latency between the "
+                "first time a load is issued and its completion")
 {
+    loadToUse
+        .init(0, 299, 10)
+        .flags(statistics::nozero);
 }
 
 void
@@ -713,8 +718,19 @@ LSQUnit::commitLoad()
 {
     assert(loadQueue.front().valid());
 
+    DynInstPtr inst = loadQueue.front().instruction();
+
     DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n",
-            loadQueue.front().instruction()->pcState());
+            inst->pcState());
+
+    // Update histogram with memory latency from load
+    // Only take latency from demand loads that were issued and did not fault
+    if (!inst->isInstPrefetch() && !inst->isDataPrefetch()
+            && inst->firstIssue != -1
+            && inst->lastWakeDependents != -1) {
+        stats.loadToUse.sample(cpu->ticksToCycles(
+                    inst->lastWakeDependents - inst->firstIssue));
+    }
 
     loadQueue.front().clear();
     loadQueue.pop_front();
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 0681b1377c..790054f6b6 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -571,6 +571,10 @@ class LSQUnit
 
         /** Number of times the LSQ is blocked due to the cache. */
         statistics::Scalar blockedByCache;
+
+        /** Distribution of cycle latency between the first time a load
+         * is issued and its completion */
+        statistics::Distribution loadToUse;
     } stats;
 
   public: