base-stats,misc: Rename Stats namespace as statistics

As part of recent decisions regarding namespace naming conventions,
all namespaces will be changed to snake case.

::Stats became ::statistics.

"statistics" was chosen over "stats" to avoid generating conflicts with
the already existing variables (there are way too many "stats" in the
codebase), which would make this patch even more disturbing for the
users.

Change-Id: If877b12d7dac356f86e3b3d941bf7558a4fd8719
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/45421
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2021-05-06 20:00:51 -03:00
parent fa505f1c23
commit 98ac080ec4
228 changed files with 3078 additions and 2970 deletions
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -2104,8 +2104,9 @@ ComputeUnit::LDSPort::recvReqRetry()
    }
 }

-ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
-    : Stats::Group(parent),
+ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
+    int n_wf)
+    : statistics::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
@@ -2290,7 +2291,8 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
    activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
    activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);

-    headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline);
+    headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
+        statistics::oneline);
    waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
    instInterleave.init(cu->numVectorALUs, 0, 20, 1);

--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -939,152 +939,152 @@ class ComputeUnit : public ClockedObject
    void updateInstStats(GPUDynInstPtr gpuDynInst);
    int activeWaves;

-    struct ComputeUnitStats : public Stats::Group
+    struct ComputeUnitStats : public statistics::Group
    {
-        ComputeUnitStats(Stats::Group *parent, int n_wf);
+        ComputeUnitStats(statistics::Group *parent, int n_wf);

-        Stats::Scalar vALUInsts;
-        Stats::Formula vALUInstsPerWF;
-        Stats::Scalar sALUInsts;
-        Stats::Formula sALUInstsPerWF;
-        Stats::Scalar instCyclesVALU;
-        Stats::Scalar instCyclesSALU;
-        Stats::Scalar threadCyclesVALU;
-        Stats::Formula vALUUtilization;
-        Stats::Scalar ldsNoFlatInsts;
-        Stats::Formula ldsNoFlatInstsPerWF;
-        Stats::Scalar flatVMemInsts;
-        Stats::Formula flatVMemInstsPerWF;
-        Stats::Scalar flatLDSInsts;
-        Stats::Formula flatLDSInstsPerWF;
-        Stats::Scalar vectorMemWrites;
-        Stats::Formula vectorMemWritesPerWF;
-        Stats::Scalar vectorMemReads;
-        Stats::Formula vectorMemReadsPerWF;
-        Stats::Scalar scalarMemWrites;
-        Stats::Formula scalarMemWritesPerWF;
-        Stats::Scalar scalarMemReads;
-        Stats::Formula scalarMemReadsPerWF;
+        statistics::Scalar vALUInsts;
+        statistics::Formula vALUInstsPerWF;
+        statistics::Scalar sALUInsts;
+        statistics::Formula sALUInstsPerWF;
+        statistics::Scalar instCyclesVALU;
+        statistics::Scalar instCyclesSALU;
+        statistics::Scalar threadCyclesVALU;
+        statistics::Formula vALUUtilization;
+        statistics::Scalar ldsNoFlatInsts;
+        statistics::Formula ldsNoFlatInstsPerWF;
+        statistics::Scalar flatVMemInsts;
+        statistics::Formula flatVMemInstsPerWF;
+        statistics::Scalar flatLDSInsts;
+        statistics::Formula flatLDSInstsPerWF;
+        statistics::Scalar vectorMemWrites;
+        statistics::Formula vectorMemWritesPerWF;
+        statistics::Scalar vectorMemReads;
+        statistics::Formula vectorMemReadsPerWF;
+        statistics::Scalar scalarMemWrites;
+        statistics::Formula scalarMemWritesPerWF;
+        statistics::Scalar scalarMemReads;
+        statistics::Formula scalarMemReadsPerWF;

-        Stats::Formula vectorMemReadsPerKiloInst;
-        Stats::Formula vectorMemWritesPerKiloInst;
-        Stats::Formula vectorMemInstsPerKiloInst;
-        Stats::Formula scalarMemReadsPerKiloInst;
-        Stats::Formula scalarMemWritesPerKiloInst;
-        Stats::Formula scalarMemInstsPerKiloInst;
+        statistics::Formula vectorMemReadsPerKiloInst;
+        statistics::Formula vectorMemWritesPerKiloInst;
+        statistics::Formula vectorMemInstsPerKiloInst;
+        statistics::Formula scalarMemReadsPerKiloInst;
+        statistics::Formula scalarMemWritesPerKiloInst;
+        statistics::Formula scalarMemInstsPerKiloInst;

        // Cycles required to send register source (addr and data) from
        // register files to memory pipeline, per SIMD.
-        Stats::Vector instCyclesVMemPerSimd;
-        Stats::Vector instCyclesScMemPerSimd;
-        Stats::Vector instCyclesLdsPerSimd;
+        statistics::Vector instCyclesVMemPerSimd;
+        statistics::Vector instCyclesScMemPerSimd;
+        statistics::Vector instCyclesLdsPerSimd;

-        Stats::Scalar globalReads;
-        Stats::Scalar globalWrites;
-        Stats::Formula globalMemInsts;
-        Stats::Scalar argReads;
-        Stats::Scalar argWrites;
-        Stats::Formula argMemInsts;
-        Stats::Scalar spillReads;
-        Stats::Scalar spillWrites;
-        Stats::Formula spillMemInsts;
-        Stats::Scalar groupReads;
-        Stats::Scalar groupWrites;
-        Stats::Formula groupMemInsts;
-        Stats::Scalar privReads;
-        Stats::Scalar privWrites;
-        Stats::Formula privMemInsts;
-        Stats::Scalar readonlyReads;
-        Stats::Scalar readonlyWrites;
-        Stats::Formula readonlyMemInsts;
-        Stats::Scalar kernargReads;
-        Stats::Scalar kernargWrites;
-        Stats::Formula kernargMemInsts;
+        statistics::Scalar globalReads;
+        statistics::Scalar globalWrites;
+        statistics::Formula globalMemInsts;
+        statistics::Scalar argReads;
+        statistics::Scalar argWrites;
+        statistics::Formula argMemInsts;
+        statistics::Scalar spillReads;
+        statistics::Scalar spillWrites;
+        statistics::Formula spillMemInsts;
+        statistics::Scalar groupReads;
+        statistics::Scalar groupWrites;
+        statistics::Formula groupMemInsts;
+        statistics::Scalar privReads;
+        statistics::Scalar privWrites;
+        statistics::Formula privMemInsts;
+        statistics::Scalar readonlyReads;
+        statistics::Scalar readonlyWrites;
+        statistics::Formula readonlyMemInsts;
+        statistics::Scalar kernargReads;
+        statistics::Scalar kernargWrites;
+        statistics::Formula kernargMemInsts;

-        Stats::Distribution waveLevelParallelism;
+        statistics::Distribution waveLevelParallelism;

        // the following stats compute the avg. TLB access latency per
        // uncoalesced request (only for data)
-        Stats::Scalar tlbRequests;
-        Stats::Scalar tlbCycles;
-        Stats::Formula tlbLatency;
+        statistics::Scalar tlbRequests;
+        statistics::Scalar tlbCycles;
+        statistics::Formula tlbLatency;
        // hitsPerTLBLevel[x] are the hits in Level x TLB.
        // x = 0 is the page table.
-        Stats::Vector hitsPerTLBLevel;
+        statistics::Vector hitsPerTLBLevel;

-        Stats::Scalar ldsBankAccesses;
-        Stats::Distribution ldsBankConflictDist;
+        statistics::Scalar ldsBankAccesses;
+        statistics::Distribution ldsBankConflictDist;

        // over all memory instructions executed over all wavefronts
        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
-        Stats::Distribution pageDivergenceDist;
+        statistics::Distribution pageDivergenceDist;
        // count of non-flat global memory vector instructions executed
-        Stats::Scalar dynamicGMemInstrCnt;
+        statistics::Scalar dynamicGMemInstrCnt;
        // count of flat global memory vector instructions executed
-        Stats::Scalar dynamicFlatMemInstrCnt;
-        Stats::Scalar dynamicLMemInstrCnt;
+        statistics::Scalar dynamicFlatMemInstrCnt;
+        statistics::Scalar dynamicLMemInstrCnt;

-        Stats::Scalar wgBlockedDueBarrierAllocation;
-        Stats::Scalar wgBlockedDueLdsAllocation;
+        statistics::Scalar wgBlockedDueBarrierAllocation;
+        statistics::Scalar wgBlockedDueLdsAllocation;
        // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
        // active when the instruction is committed, this number is still
        // incremented by 1
-        Stats::Scalar numInstrExecuted;
+        statistics::Scalar numInstrExecuted;
        // Number of cycles among successive instruction executions across all
        // wavefronts of the same CU
-        Stats::Distribution execRateDist;
+        statistics::Distribution execRateDist;
        // number of individual vector operations executed
-        Stats::Scalar numVecOpsExecuted;
+        statistics::Scalar numVecOpsExecuted;
        // number of individual f16 vector operations executed
-        Stats::Scalar numVecOpsExecutedF16;
+        statistics::Scalar numVecOpsExecutedF16;
        // number of individual f32 vector operations executed
-        Stats::Scalar numVecOpsExecutedF32;
+        statistics::Scalar numVecOpsExecutedF32;
        // number of individual f64 vector operations executed
-        Stats::Scalar numVecOpsExecutedF64;
+        statistics::Scalar numVecOpsExecutedF64;
        // number of individual FMA 16,32,64 vector operations executed
-        Stats::Scalar numVecOpsExecutedFMA16;
-        Stats::Scalar numVecOpsExecutedFMA32;
-        Stats::Scalar numVecOpsExecutedFMA64;
+        statistics::Scalar numVecOpsExecutedFMA16;
+        statistics::Scalar numVecOpsExecutedFMA32;
+        statistics::Scalar numVecOpsExecutedFMA64;
        // number of individual MAC 16,32,64 vector operations executed
-        Stats::Scalar numVecOpsExecutedMAC16;
-        Stats::Scalar numVecOpsExecutedMAC32;
-        Stats::Scalar numVecOpsExecutedMAC64;
+        statistics::Scalar numVecOpsExecutedMAC16;
+        statistics::Scalar numVecOpsExecutedMAC32;
+        statistics::Scalar numVecOpsExecutedMAC64;
        // number of individual MAD 16,32,64 vector operations executed
-        Stats::Scalar numVecOpsExecutedMAD16;
-        Stats::Scalar numVecOpsExecutedMAD32;
-        Stats::Scalar numVecOpsExecutedMAD64;
+        statistics::Scalar numVecOpsExecutedMAD16;
+        statistics::Scalar numVecOpsExecutedMAD32;
+        statistics::Scalar numVecOpsExecutedMAD64;
        // total number of two op FP vector operations executed
-        Stats::Scalar numVecOpsExecutedTwoOpFP;
+        statistics::Scalar numVecOpsExecutedTwoOpFP;
        // Total cycles that something is running on the GPU
-        Stats::Scalar totalCycles;
-        Stats::Formula vpc; // vector ops per cycle
-        Stats::Formula vpc_f16; // vector ops per cycle
-        Stats::Formula vpc_f32; // vector ops per cycle
-        Stats::Formula vpc_f64; // vector ops per cycle
-        Stats::Formula ipc; // vector instructions per cycle
-        Stats::Distribution controlFlowDivergenceDist;
-        Stats::Distribution activeLanesPerGMemInstrDist;
-        Stats::Distribution activeLanesPerLMemInstrDist;
+        statistics::Scalar totalCycles;
+        statistics::Formula vpc; // vector ops per cycle
+        statistics::Formula vpc_f16; // vector ops per cycle
+        statistics::Formula vpc_f32; // vector ops per cycle
+        statistics::Formula vpc_f64; // vector ops per cycle
+        statistics::Formula ipc; // vector instructions per cycle
+        statistics::Distribution controlFlowDivergenceDist;
+        statistics::Distribution activeLanesPerGMemInstrDist;
+        statistics::Distribution activeLanesPerLMemInstrDist;
        // number of vector ALU instructions received
-        Stats::Formula numALUInstsExecuted;
+        statistics::Formula numALUInstsExecuted;
        // number of times a WG cannot start due to lack of free VGPRs in SIMDs
-        Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+        statistics::Scalar numTimesWgBlockedDueVgprAlloc;
        // number of times a WG cannot start due to lack of free SGPRs in SIMDs
-        Stats::Scalar numTimesWgBlockedDueSgprAlloc;
-        Stats::Scalar numCASOps;
-        Stats::Scalar numFailedCASOps;
-        Stats::Scalar completedWfs;
-        Stats::Scalar completedWGs;
+        statistics::Scalar numTimesWgBlockedDueSgprAlloc;
+        statistics::Scalar numCASOps;
+        statistics::Scalar numFailedCASOps;
+        statistics::Scalar completedWfs;
+        statistics::Scalar completedWGs;

        // distribution in latency difference between first and last cache block
        // arrival ticks
-        Stats::Distribution headTailLatency;
+        statistics::Distribution headTailLatency;

        // Track the amount of interleaving between wavefronts on each SIMD.
        // This stat is sampled using instExecPerSimd to compute the number
        // of instructions that have been executed on a SIMD between a WF
        // executing two successive instructions.
-        Stats::VectorDistribution instInterleave;
+        statistics::VectorDistribution instInterleave;
    } stats;
 };

--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -344,8 +344,9 @@ GPUDispatcher::scheduleDispatch()
    }
 }

-GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent)
-    : Stats::Group(parent),
+GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(
+    statistics::Group *parent)
+    : statistics::Group(parent),
      ADD_STAT(numKernelLaunched, "number of kernel launched"),
      ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding "
               "wavefronts that are waiting to be dispatched")
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -93,12 +93,12 @@ class GPUDispatcher : public SimObject
    bool dispatchActive;

  protected:
-    struct GPUDispatcherStats : public Stats::Group
+    struct GPUDispatcherStats : public statistics::Group
    {
-        GPUDispatcherStats(Stats::Group *parent);
+        GPUDispatcherStats(statistics::Group *parent);

-        Stats::Scalar numKernelLaunched;
-        Stats::Scalar cyclesWaitingForDispatch;
+        statistics::Scalar numKernelLaunched;
+        statistics::Scalar cyclesWaitingForDispatch;
    } stats;
 };

--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -197,8 +197,8 @@ ExecStage::exec()
    collectStatistics(PostExec, 0);
 }

-ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent)
-    : Stats::Group(parent, "ExecStage"),
+ExecStage::ExecStageStats::ExecStageStats(statistics::Group *parent)
+    : statistics::Group(parent, "ExecStage"),
      ADD_STAT(numTransActiveIdle,
               "number of CU transitions from active to idle"),
      ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"),
--- a/src/gpu-compute/exec_stage.hh
+++ b/src/gpu-compute/exec_stage.hh
@@ -97,27 +97,27 @@ class ExecStage
    const std::string _name;

  protected:
-    struct ExecStageStats : public Stats::Group
+    struct ExecStageStats : public statistics::Group
    {
-        ExecStageStats(Stats::Group *parent);
+        ExecStageStats(statistics::Group *parent);

        // number of transitions from active to idle
-        Stats::Scalar numTransActiveIdle;
+        statistics::Scalar numTransActiveIdle;
        // number of idle cycles
-        Stats::Scalar numCyclesWithNoIssue;
+        statistics::Scalar numCyclesWithNoIssue;
        // number of busy cycles
-        Stats::Scalar numCyclesWithInstrIssued;
+        statistics::Scalar numCyclesWithInstrIssued;
        // SIMDs active per cycle
-        Stats::Distribution spc;
+        statistics::Distribution spc;
        // duration of idle periods in cycles
-        Stats::Distribution idleDur;
+        statistics::Distribution idleDur;
        // number of cycles during which at least one
        // instruction was issued to an execution resource type
-        Stats::Vector numCyclesWithInstrTypeIssued;
+        statistics::Vector numCyclesWithInstrTypeIssued;
        // number of idle cycles during which the scheduler
        // issued no instructions targeting a specific
        // execution resource type
-        Stats::Vector numCyclesWithNoInstrTypeIssued;
+        statistics::Vector numCyclesWithNoInstrTypeIssued;
    } stats;
 };

--- a/src/gpu-compute/fetch_stage.cc
+++ b/src/gpu-compute/fetch_stage.cc
@@ -90,8 +90,8 @@ FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
    _fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
 }

-FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent)
-    : Stats::Group(parent, "FetchStage"),
+FetchStage::FetchStageStats::FetchStageStats(statistics::Group *parent)
+    : statistics::Group(parent, "FetchStage"),
      ADD_STAT(instFetchInstReturned, "For each instruction fetch request "
               "received record how many instructions you got from it")
 {
--- a/src/gpu-compute/fetch_stage.hh
+++ b/src/gpu-compute/fetch_stage.hh
@@ -74,11 +74,11 @@ class FetchStage
    const std::string _name;

  protected:
-    struct FetchStageStats : public Stats::Group
+    struct FetchStageStats : public statistics::Group
    {
-        FetchStageStats(Stats::Group *parent);
+        FetchStageStats(statistics::Group *parent);

-        Stats::Distribution instFetchInstReturned;
+        statistics::Distribution instFetchInstReturned;
    } stats;
 };

--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -290,8 +290,8 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
 }

 GlobalMemPipeline::
-GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
-    : Stats::Group(parent, "GlobalMemPipeline"),
+GlobalMemPipelineStats::GlobalMemPipelineStats(statistics::Group *parent)
+    : statistics::Group(parent, "GlobalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
               "are delayed before updating the VRF")
 {
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -143,14 +143,14 @@ class GlobalMemPipeline
    std::queue<GPUDynInstPtr> gmIssuedRequests;

  protected:
-    struct GlobalMemPipelineStats : public Stats::Group
+    struct GlobalMemPipelineStats : public statistics::Group
    {
-        GlobalMemPipelineStats(Stats::Group *parent);
+        GlobalMemPipelineStats(statistics::Group *parent);

        // number of cycles of delaying the update of a VGPR that is the
        // target of a load instruction (or the load component of an atomic)
        // The delay is due to VRF bank conflicts
-        Stats::Scalar loadVrfBankConflictCycles;
+        statistics::Scalar loadVrfBankConflictCycles;
    } stats;
 };

--- a/src/gpu-compute/gpu_tlb.cc
+++ b/src/gpu-compute/gpu_tlb.cc
@@ -1430,8 +1430,8 @@ namespace X86ISA
        TLBFootprint.clear();
    }

-    GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent)
-        : Stats::Group(parent),
+    GpuTLB::GpuTLBStats::GpuTLBStats(statistics::Group *parent)
+        : statistics::Group(parent),
          ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"),
          ADD_STAT(localNumTLBHits, "Number of TLB hits"),
          ADD_STAT(localNumTLBMisses, "Number of TLB misses"),
--- a/src/gpu-compute/gpu_tlb.hh
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -400,37 +400,37 @@ namespace X86ISA
        EventFunctionWrapper exitEvent;

      protected:
-        struct GpuTLBStats : public Stats::Group
+        struct GpuTLBStats : public statistics::Group
        {
-            GpuTLBStats(Stats::Group *parent);
+            GpuTLBStats(statistics::Group *parent);

            // local_stats are as seen from the TLB
            // without taking into account coalescing
-            Stats::Scalar localNumTLBAccesses;
-            Stats::Scalar localNumTLBHits;
-            Stats::Scalar localNumTLBMisses;
-            Stats::Formula localTLBMissRate;
+            statistics::Scalar localNumTLBAccesses;
+            statistics::Scalar localNumTLBHits;
+            statistics::Scalar localNumTLBMisses;
+            statistics::Formula localTLBMissRate;

            // global_stats are as seen from the
            // CU's perspective taking into account
            // all coalesced requests.
-            Stats::Scalar globalNumTLBAccesses;
-            Stats::Scalar globalNumTLBHits;
-            Stats::Scalar globalNumTLBMisses;
-            Stats::Formula globalTLBMissRate;
+            statistics::Scalar globalNumTLBAccesses;
+            statistics::Scalar globalNumTLBHits;
+            statistics::Scalar globalNumTLBMisses;
+            statistics::Formula globalTLBMissRate;

            // from the CU perspective (global)
-            Stats::Scalar accessCycles;
+            statistics::Scalar accessCycles;
            // from the CU perspective (global)
-            Stats::Scalar pageTableCycles;
-            Stats::Scalar numUniquePages;
+            statistics::Scalar pageTableCycles;
+            statistics::Scalar numUniquePages;
            // from the perspective of this TLB
-            Stats::Scalar localCycles;
+            statistics::Scalar localCycles;
            // from the perspective of this TLB
-            Stats::Formula localLatency;
+            statistics::Formula localLatency;
            // I take the avg. per page and then
            // the avg. over all pages.
-            Stats::Scalar avgReuseDistance;
+            statistics::Scalar avgReuseDistance;
        } stats;
    };
 }
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -126,8 +126,8 @@ LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)


 LocalMemPipeline::
-LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent)
-    : Stats::Group(parent, "LocalMemPipeline"),
+LocalMemPipelineStats::LocalMemPipelineStats(statistics::Group *parent)
+    : statistics::Group(parent, "LocalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data "
               "are delayed before updating the VRF")
 {
--- a/src/gpu-compute/local_memory_pipeline.hh
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -97,11 +97,11 @@ class LocalMemPipeline
    std::queue<GPUDynInstPtr> lmReturnedRequests;

  protected:
-    struct LocalMemPipelineStats : public Stats::Group
+    struct LocalMemPipelineStats : public statistics::Group
    {
-        LocalMemPipelineStats(Stats::Group *parent);
+        LocalMemPipelineStats(statistics::Group *parent);

-        Stats::Scalar loadVrfBankConflictCycles;
+        statistics::Scalar loadVrfBankConflictCycles;
    } stats;
 };

--- a/src/gpu-compute/register_file.cc
+++ b/src/gpu-compute/register_file.cc
@@ -189,8 +189,8 @@ RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
 {
 }

-RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent)
-    : Stats::Group(parent),
+RegisterFile::RegisterFileStats::RegisterFileStats(statistics::Group *parent)
+    : statistics::Group(parent),
      ADD_STAT(registerReads,
              "Total number of DWORDs read from register file"),
      ADD_STAT(registerWrites,
--- a/src/gpu-compute/register_file.hh
+++ b/src/gpu-compute/register_file.hh
@@ -151,21 +151,21 @@ class RegisterFile : public SimObject
    // number of registers in this register file
    int _numRegs;

-    struct RegisterFileStats : public Stats::Group
+    struct RegisterFileStats : public statistics::Group
    {
-        RegisterFileStats(Stats::Group *parent);
+        RegisterFileStats(statistics::Group *parent);

        // Total number of register reads per DWORD per thread
-        Stats::Scalar registerReads;
+        statistics::Scalar registerReads;
        // Total number of register writes per DWORD per thread
-        Stats::Scalar registerWrites;
+        statistics::Scalar registerWrites;

        // Number of register file SRAM activations for reads.
        // The register file may be implemented with multiple SRAMs. This stat
        // tracks how many times the SRAMs are accessed for reads.
-        Stats::Scalar sramReads;
+        statistics::Scalar sramReads;
        // Number of register file SRAM activations for writes
-        Stats::Scalar sramWrites;
+        statistics::Scalar sramWrites;
    } stats;
 };

--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -784,9 +784,9 @@ ScheduleStage::deleteFromSch(Wavefront *w)
    wavesInSch.erase(w->wfDynId);
 }

-ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent,
-                                                      int num_exec_units)
-    : Stats::Group(parent, "ScheduleStage"),
+ScheduleStage::ScheduleStageStats::ScheduleStageStats(
+    statistics::Group *parent, int num_exec_units)
+    : statistics::Group(parent, "ScheduleStage"),
      ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
               "execution resource"),
      ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
--- a/src/gpu-compute/schedule_stage.hh
+++ b/src/gpu-compute/schedule_stage.hh
@@ -184,49 +184,49 @@ class ScheduleStage
    std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;

  protected:
-    struct ScheduleStageStats : public Stats::Group
+    struct ScheduleStageStats : public statistics::Group
    {
-        ScheduleStageStats(Stats::Group *parent, int num_exec_units);
+        ScheduleStageStats(statistics::Group *parent, int num_exec_units);

        // Number of cycles with empty (or not empty) readyList, per execution
        // resource, when the CU is active (not sleeping)
-        Stats::Vector rdyListEmpty;
-        Stats::Vector rdyListNotEmpty;
+        statistics::Vector rdyListEmpty;
+        statistics::Vector rdyListNotEmpty;

        // Number of cycles, per execution resource, when at least one wave
        // was on the readyList and picked by scheduler, but was unable to be
        // added to the schList, when the CU is active (not sleeping)
-        Stats::Vector addToSchListStalls;
+        statistics::Vector addToSchListStalls;

        // Number of cycles, per execution resource, when a wave is selected
        // as candidate for dispatchList from schList
        // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
-        Stats::Vector schListToDispList;
+        statistics::Vector schListToDispList;

        // Per execution resource stat, incremented once per cycle if no wave
        // was selected as candidate for dispatch and moved to dispatchList
-        Stats::Vector schListToDispListStalls;
+        statistics::Vector schListToDispListStalls;

        // Number of times a wave is selected by the scheduler but cannot
        // be added to the schList due to register files not being able to
        // support reads or writes of operands. RF_ACCESS_NRDY condition is
        // always incremented if at least one read/write not supported, other
        // conditions are incremented independently from each other.
-        Stats::Vector rfAccessStalls;
+        statistics::Vector rfAccessStalls;

        // Number of times a wave is executing FLAT instruction and
        // forces another wave occupying its required local memory resource
        // to be deselected for execution, and placed back on schList
-        Stats::Scalar ldsBusArbStalls;
+        statistics::Scalar ldsBusArbStalls;

        // Count of times VRF and/or SRF blocks waves on schList from
        // performing RFBUSY->RFREADY transition
-        Stats::Vector opdNrdyStalls;
+        statistics::Vector opdNrdyStalls;

        // Count of times resource required for dispatch is not ready and
        // blocks wave in RFREADY state on schList from potentially moving
        // to dispatchList
-        Stats::Vector dispNrdyStalls;
+        statistics::Vector dispNrdyStalls;
    } stats;
 };

--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -277,8 +277,8 @@ ScoreboardCheckStage::exec()
 }

 ScoreboardCheckStage::
-ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent)
-    : Stats::Group(parent, "ScoreboardCheckStage"),
+ScoreboardCheckStageStats::ScoreboardCheckStageStats(statistics::Group *parent)
+    : statistics::Group(parent, "ScoreboardCheckStage"),
      ADD_STAT(stallCycles, "number of cycles wave stalled in SCB")
 {
    stallCycles.init(NRDY_CONDITIONS);
--- a/src/gpu-compute/scoreboard_check_stage.hh
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -99,11 +99,11 @@ class ScoreboardCheckStage
    const std::string _name;

  protected:
-    struct ScoreboardCheckStageStats : public Stats::Group
+    struct ScoreboardCheckStageStats : public statistics::Group
    {
-        ScoreboardCheckStageStats(Stats::Group *parent);
+        ScoreboardCheckStageStats(statistics::Group *parent);

-        Stats::Vector stallCycles;
+        statistics::Vector stallCycles;
    } stats;
 };

--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -522,8 +522,8 @@ Shader::notifyCuSleep() {
        stats.shaderActiveTicks += curTick() - _lastInactiveTick;
 }

-Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
-    : Stats::Group(parent),
+Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
+    : statistics::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
@@ -546,40 +546,40 @@ Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
 {
    allLatencyDist
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    loadLatencyDist
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    storeLatencyDist
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    initToCoalesceLatency
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    rubyNetworkLatency
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    gmEnqueueLatency
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    gmToCompleteLatency
        .init(0, 1600000, 10000)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    coalsrLineAddresses
        .init(0, 20, 1)
-        .flags(Stats::pdf | Stats::oneline);
+        .flags(statistics::pdf | statistics::oneline);

    vectorInstSrcOperand.init(4);
    vectorInstDstOperand.init(4);

-    cacheBlockRoundTrip = new Stats::Distribution[wf_size];
+    cacheBlockRoundTrip = new statistics::Distribution[wf_size];
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
@@ -588,6 +588,6 @@ Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
-            .flags(Stats::pdf | Stats::oneline);
+            .flags(statistics::pdf | statistics::oneline);
    }
 }
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -281,37 +281,37 @@ class Shader : public ClockedObject
    }

  protected:
-    struct ShaderStats : public Stats::Group
+    struct ShaderStats : public statistics::Group
    {
-        ShaderStats(Stats::Group *parent, int wf_size);
+        ShaderStats(statistics::Group *parent, int wf_size);

        // some stats for measuring latency
-        Stats::Distribution allLatencyDist;
-        Stats::Distribution loadLatencyDist;
-        Stats::Distribution storeLatencyDist;
+        statistics::Distribution allLatencyDist;
+        statistics::Distribution loadLatencyDist;
+        statistics::Distribution storeLatencyDist;

        // average ticks from vmem inst initiateAcc to coalescer issue,
-        Stats::Distribution initToCoalesceLatency;
+        statistics::Distribution initToCoalesceLatency;

        // average ticks from coalescer issue to coalescer hit callback,
-        Stats::Distribution rubyNetworkLatency;
+        statistics::Distribution rubyNetworkLatency;

        // average ticks from coalescer hit callback to GM pipe enqueue,
-        Stats::Distribution gmEnqueueLatency;
+        statistics::Distribution gmEnqueueLatency;

        // average ticks spent in GM pipe's ordered resp buffer.
-        Stats::Distribution gmToCompleteLatency;
+        statistics::Distribution gmToCompleteLatency;

        // average number of cache blocks requested by vmem inst
-        Stats::Distribution coalsrLineAddresses;
+        statistics::Distribution coalsrLineAddresses;

        // average ticks for cache blocks to main memory for the Nth
        // cache block generated by a vmem inst.
-        Stats::Distribution *cacheBlockRoundTrip;
+        statistics::Distribution *cacheBlockRoundTrip;

-        Stats::Scalar shaderActiveTicks;
-        Stats::Vector vectorInstSrcOperand;
-        Stats::Vector vectorInstDstOperand;
+        statistics::Scalar shaderActiveTicks;
+        statistics::Vector vectorInstSrcOperand;
+        statistics::Vector vectorInstDstOperand;
    } stats;
 };

--- a/src/gpu-compute/tlb_coalescer.cc
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -521,8 +521,8 @@ TLBCoalescer::processCleanupEvent()
    }
 }

-TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
-    : Stats::Group(parent),
+TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(statistics::Group *parent)
+    : statistics::Group(parent),
      ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
      ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
      ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
--- a/src/gpu-compute/tlb_coalescer.hh
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -195,26 +195,26 @@ class TLBCoalescer : public ClockedObject
    std::queue<Addr> cleanupQueue;

  protected:
-    struct TLBCoalescerStats : public Stats::Group
+    struct TLBCoalescerStats : public statistics::Group
    {
-        TLBCoalescerStats(Stats::Group *parent);
+        TLBCoalescerStats(statistics::Group *parent);

        // number of packets the coalescer receives
-        Stats::Scalar uncoalescedAccesses;
+        statistics::Scalar uncoalescedAccesses;
        // number packets the coalescer send to the TLB
-        Stats::Scalar coalescedAccesses;
+        statistics::Scalar coalescedAccesses;

        // Number of cycles the coalesced requests spend waiting in
        // coalescerFIFO. For each packet the coalescer receives we take into
        // account the number of all uncoalesced requests this pkt "represents"
-        Stats::Scalar queuingCycles;
+        statistics::Scalar queuingCycles;

        // On average how much time a request from the
        // uncoalescedAccesses that reaches the TLB
        // spends waiting?
-        Stats::Scalar localqueuingCycles;
+        statistics::Scalar localqueuingCycles;
        // localqueuingCycles/uncoalescedAccesses
-        Stats::Formula localLatency;
+        statistics::Formula localLatency;
    } stats;
 };

--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -1435,8 +1435,8 @@ Wavefront::releaseBarrier()
    barId = WFBarrier::InvalidID;
 }

-Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
-    : Stats::Group(parent),
+Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
+    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -330,49 +330,49 @@ class Wavefront : public SimObject
    int barId;

  public:
-    struct WavefrontStats : public Stats::Group
+    struct WavefrontStats : public statistics::Group
    {
-        WavefrontStats(Stats::Group *parent);
+        WavefrontStats(statistics::Group *parent);

        // Number of instructions executed by this wavefront slot across all
        // dynamic wavefronts
-        Stats::Scalar numInstrExecuted;
+        statistics::Scalar numInstrExecuted;

        // Number of cycles this WF spends in SCH stage
-        Stats::Scalar schCycles;
+        statistics::Scalar schCycles;

        // Number of stall cycles encounterd by this WF in SCH stage
-        Stats::Scalar schStalls;
+        statistics::Scalar schStalls;

        // The following stats sum to the value of schStalls, and record, per
        // WF slot, what the cause of each stall was at a coarse granularity.

        // Cycles WF is selected by scheduler, but RFs cannot support
        // instruction
-        Stats::Scalar schRfAccessStalls;
+        statistics::Scalar schRfAccessStalls;
        // Cycles spent waiting for execution resources
-        Stats::Scalar schResourceStalls;
+        statistics::Scalar schResourceStalls;
        // cycles spent waiting for RF reads to complete in SCH stage
-        Stats::Scalar schOpdNrdyStalls;
+        statistics::Scalar schOpdNrdyStalls;
        // LDS arbitration stall cycles. WF attempts to execute LM instruction,
        // but another wave is executing FLAT, which requires LM and GM and
        // forces this WF to stall.
-        Stats::Scalar schLdsArbStalls;
+        statistics::Scalar schLdsArbStalls;

        // number of times an instruction of a WF is blocked from being issued
        // due to WAR and WAW dependencies
-        Stats::Scalar numTimesBlockedDueWAXDependencies;
+        statistics::Scalar numTimesBlockedDueWAXDependencies;
        // number of times an instruction of a WF is blocked from being issued
        // due to WAR and WAW dependencies
-        Stats::Scalar numTimesBlockedDueRAWDependencies;
+        statistics::Scalar numTimesBlockedDueRAWDependencies;

        // Distribution to track the distance between producer and consumer
        // for vector register values
-        Stats::Distribution vecRawDistance;
+        statistics::Distribution vecRawDistance;

        // Distribution to track the number of times every vector register
        // value produced is consumed.
-        Stats::Distribution readsPerWrite;
+        statistics::Distribution readsPerWrite;
    } stats;
 };