arch-gcn3,gpu-compute: Update stats style for GPU

Convert all gpu-compute stats to Stats::Group style.

Change-Id: I29116f1de53ae379210c6cfb5bed3fc74f50cca5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39135
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2021-01-14 10:29:37 -06:00
parent e8b37cc503
commit 5323cccfdd
39 changed files with 1156 additions and 1586 deletions
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -3800,7 +3800,7 @@ namespace Gcn3ISA
            wf->computeUnit->cu_id, wf->wgId, refCount);

        wf->computeUnit->registerManager->freeRegisters(wf);
-        wf->computeUnit->completedWfs++;
+        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
@@ -3811,7 +3811,7 @@ namespace Gcn3ISA

        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
-                wf->readsPerWrite.sample(wf->vecReads.at(i));
+                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
@@ -3853,7 +3853,7 @@ namespace Gcn3ISA
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                wf->setStatus(Wavefront::S_STOPPED);
-                wf->computeUnit->completedWGs++;
+                wf->computeUnit->stats.completedWGs++;

                return;
            }
@@ -3877,7 +3877,7 @@ namespace Gcn3ISA
            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

-            wf->computeUnit->completedWGs++;
+            wf->computeUnit->stats.completedWGs++;
        } else {
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -106,7 +106,8 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
-    scheduleToExecute(p)
+    scheduleToExecute(p),
+    stats(this, p.n_wf)
 {
    /**
     * This check is necessary because std::bitset only provides conversion
@@ -367,7 +368,7 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
    w->initRegState(task, w->actualWgSzTotal);
    w->start(_n_wave++, task->codeAddr());

-    waveLevelParallelism.sample(activeWaves);
+    stats.waveLevelParallelism.sample(activeWaves);
    activeWaves++;
 }

@@ -612,22 +613,22 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);

    if (!vregAvail) {
-        ++numTimesWgBlockedDueVgprAlloc;
+        ++stats.numTimesWgBlockedDueVgprAlloc;
    }

    if (!sregAvail) {
-        ++numTimesWgBlockedDueSgprAlloc;
+        ++stats.numTimesWgBlockedDueSgprAlloc;
    }

    // Return true if enough WF slots to submit workgroup and if there are
    // enough VGPRs to schedule all WFs to their SIMD units
    bool ldsAvail = lds.canReserve(task->ldsSize());
    if (!ldsAvail) {
-        wgBlockedDueLdsAllocation++;
+        stats.wgBlockedDueLdsAllocation++;
    }

    if (!barrier_avail) {
-        wgBlockedDueBarrierAllocation++;
+        stats.wgBlockedDueBarrierAllocation++;
    }

    // Return true if the following are all true:
@@ -734,7 +735,7 @@ ComputeUnit::exec()
    scoreboardCheckStage.exec();
    fetchStage.exec();

-    totalCycles++;
+    stats.totalCycles++;

    // Put this CU to sleep if there is no more work to be done.
    if (!isDone()) {
@@ -1032,8 +1033,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
        fatal("pkt is not a read nor a write\n");
    }

-    tlbCycles -= curTick();
-    ++tlbRequests;
+    stats.tlbCycles -= curTick();
+    ++stats.tlbRequests;

    PortID tlbPort_index = perLaneTLB ? index : 0;

@@ -1075,7 +1076,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
            // update the hitLevel distribution
            int hit_level = translation_state->hitLevel;
            assert(hit_level != -1);
-            hitsPerTLBLevel[hit_level]++;
+            stats.hitsPerTLBLevel[hit_level]++;

            // New SenderState for the memory access
            X86ISA::GpuTLB::TranslationState *sender_state =
@@ -1346,7 +1347,7 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
        // for the first cache block.
        if (compute_unit->headTailMap.count(gpuDynInst)) {
            Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
-            compute_unit->headTailLatency.sample(curTick() - headTick);
+            compute_unit->stats.headTailLatency.sample(curTick() - headTick);
            compute_unit->headTailMap.erase(gpuDynInst);
        }

@@ -1381,7 +1382,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
            pkt->req->getVaddr(), line);

    assert(pkt->senderState);
-    computeUnit->tlbCycles += curTick();
+    computeUnit->stats.tlbCycles += curTick();

    // pop off the TLB translation state
    X86ISA::GpuTLB::TranslationState *translation_state =
@@ -1402,7 +1403,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)

    // update the hitLevel distribution
    int hit_level = translation_state->hitLevel;
-    computeUnit->hitsPerTLBLevel[hit_level]++;
+    computeUnit->stats.hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
@@ -1788,561 +1789,17 @@ ComputeUnit::ITLBPort::recvReqRetry()
    }
 }

-void
-ComputeUnit::regStats()
-{
-    ClockedObject::regStats();
-
-    vALUInsts
-        .name(name() + ".valu_insts")
-        .desc("Number of vector ALU insts issued.")
-        ;
-    vALUInstsPerWF
-        .name(name() + ".valu_insts_per_wf")
-        .desc("The avg. number of vector ALU insts issued per-wavefront.")
-        ;
-    sALUInsts
-        .name(name() + ".salu_insts")
-        .desc("Number of scalar ALU insts issued.")
-        ;
-    sALUInstsPerWF
-        .name(name() + ".salu_insts_per_wf")
-        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
-        ;
-    instCyclesVALU
-        .name(name() + ".inst_cycles_valu")
-        .desc("Number of cycles needed to execute VALU insts.")
-        ;
-    instCyclesSALU
-        .name(name() + ".inst_cycles_salu")
-        .desc("Number of cycles needed to execute SALU insts.")
-        ;
-    threadCyclesVALU
-        .name(name() + ".thread_cycles_valu")
-        .desc("Number of thread cycles used to execute vector ALU ops. "
-              "Similar to instCyclesVALU but multiplied by the number of "
-              "active threads.")
-        ;
-    vALUUtilization
-        .name(name() + ".valu_utilization")
-        .desc("Percentage of active vector ALU threads in a wave.")
-        ;
-    ldsNoFlatInsts
-        .name(name() + ".lds_no_flat_insts")
-        .desc("Number of LDS insts issued, not including FLAT "
-              "accesses that resolve to LDS.")
-        ;
-    ldsNoFlatInstsPerWF
-        .name(name() + ".lds_no_flat_insts_per_wf")
-        .desc("The avg. number of LDS insts (not including FLAT "
-              "accesses that resolve to LDS) per-wavefront.")
-        ;
-    flatVMemInsts
-        .name(name() + ".flat_vmem_insts")
-        .desc("The number of FLAT insts that resolve to vmem issued.")
-        ;
-    flatVMemInstsPerWF
-        .name(name() + ".flat_vmem_insts_per_wf")
-        .desc("The average number of FLAT insts that resolve to vmem "
-              "issued per-wavefront.")
-        ;
-    flatLDSInsts
-        .name(name() + ".flat_lds_insts")
-        .desc("The number of FLAT insts that resolve to LDS issued.")
-        ;
-    flatLDSInstsPerWF
-        .name(name() + ".flat_lds_insts_per_wf")
-        .desc("The average number of FLAT insts that resolve to LDS "
-              "issued per-wavefront.")
-        ;
-    vectorMemWrites
-        .name(name() + ".vector_mem_writes")
-        .desc("Number of vector mem write insts (excluding FLAT insts).")
-        ;
-    vectorMemWritesPerWF
-        .name(name() + ".vector_mem_writes_per_wf")
-        .desc("The average number of vector mem write insts "
-              "(excluding FLAT insts) per-wavefront.")
-        ;
-    vectorMemReads
-        .name(name() + ".vector_mem_reads")
-        .desc("Number of vector mem read insts (excluding FLAT insts).")
-        ;
-    vectorMemReadsPerWF
-        .name(name() + ".vector_mem_reads_per_wf")
-        .desc("The avg. number of vector mem read insts (excluding "
-              "FLAT insts) per-wavefront.")
-        ;
-    scalarMemWrites
-        .name(name() + ".scalar_mem_writes")
-        .desc("Number of scalar mem write insts.")
-        ;
-    scalarMemWritesPerWF
-        .name(name() + ".scalar_mem_writes_per_wf")
-        .desc("The average number of scalar mem write insts per-wavefront.")
-        ;
-    scalarMemReads
-        .name(name() + ".scalar_mem_reads")
-        .desc("Number of scalar mem read insts.")
-        ;
-    scalarMemReadsPerWF
-        .name(name() + ".scalar_mem_reads_per_wf")
-        .desc("The average number of scalar mem read insts per-wavefront.")
-        ;
-
-    vALUInstsPerWF = vALUInsts / completedWfs;
-    sALUInstsPerWF = sALUInsts / completedWfs;
-    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
-    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
-    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
-    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
-    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
-    vectorMemReadsPerWF = vectorMemReads / completedWfs;
-    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
-    scalarMemReadsPerWF = scalarMemReads / completedWfs;
-
-    vectorMemReadsPerKiloInst
-        .name(name() + ".vector_mem_reads_per_kilo_inst")
-        .desc("Number of vector mem reads per kilo-instruction")
-        ;
-    vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
-    vectorMemWritesPerKiloInst
-        .name(name() + ".vector_mem_writes_per_kilo_inst")
-        .desc("Number of vector mem writes per kilo-instruction")
-        ;
-    vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
-    vectorMemInstsPerKiloInst
-        .name(name() + ".vector_mem_insts_per_kilo_inst")
-        .desc("Number of vector mem insts per kilo-instruction")
-        ;
-    vectorMemInstsPerKiloInst =
-        ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
-    scalarMemReadsPerKiloInst
-        .name(name() + ".scalar_mem_reads_per_kilo_inst")
-        .desc("Number of scalar mem reads per kilo-instruction")
-    ;
-    scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
-    scalarMemWritesPerKiloInst
-        .name(name() + ".scalar_mem_writes_per_kilo_inst")
-        .desc("Number of scalar mem writes per kilo-instruction")
-    ;
-    scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
-    scalarMemInstsPerKiloInst
-        .name(name() + ".scalar_mem_insts_per_kilo_inst")
-        .desc("Number of scalar mem insts per kilo-instruction")
-        ;
-    scalarMemInstsPerKiloInst =
-        ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
-
-    instCyclesVMemPerSimd
-       .init(numVectorALUs)
-       .name(name() + ".inst_cycles_vector_memory")
-       .desc("Number of cycles to send address, command, data from VRF to "
-             "vector memory unit, per SIMD")
-       ;
-
-    instCyclesScMemPerSimd
-       .init(numVectorALUs)
-       .name(name() + ".inst_cycles_scalar_memory")
-       .desc("Number of cycles to send address, command, data from SRF to "
-             "scalar memory unit, per SIMD")
-       ;
-
-    instCyclesLdsPerSimd
-       .init(numVectorALUs)
-       .name(name() + ".inst_cycles_lds")
-       .desc("Number of cycles to send address, command, data from VRF to "
-             "LDS unit, per SIMD")
-       ;
-
-    globalReads
-        .name(name() + ".global_mem_reads")
-        .desc("Number of reads to the global segment")
-    ;
-    globalWrites
-        .name(name() + ".global_mem_writes")
-        .desc("Number of writes to the global segment")
-    ;
-    globalMemInsts
-        .name(name() + ".global_mem_insts")
-        .desc("Number of memory instructions sent to the global segment")
-    ;
-    globalMemInsts = globalReads + globalWrites;
-    argReads
-        .name(name() + ".arg_reads")
-        .desc("Number of reads to the arg segment")
-    ;
-    argWrites
-        .name(name() + ".arg_writes")
-        .desc("NUmber of writes to the arg segment")
-    ;
-    argMemInsts
-        .name(name() + ".arg_mem_insts")
-        .desc("Number of memory instructions sent to the arg segment")
-    ;
-    argMemInsts = argReads + argWrites;
-    spillReads
-        .name(name() + ".spill_reads")
-        .desc("Number of reads to the spill segment")
-    ;
-    spillWrites
-        .name(name() + ".spill_writes")
-        .desc("Number of writes to the spill segment")
-    ;
-    spillMemInsts
-        .name(name() + ".spill_mem_insts")
-        .desc("Number of memory instructions sent to the spill segment")
-    ;
-    spillMemInsts = spillReads + spillWrites;
-    groupReads
-        .name(name() + ".group_reads")
-        .desc("Number of reads to the group segment")
-    ;
-    groupWrites
-        .name(name() + ".group_writes")
-        .desc("Number of writes to the group segment")
-    ;
-    groupMemInsts
-        .name(name() + ".group_mem_insts")
-        .desc("Number of memory instructions sent to the group segment")
-    ;
-    groupMemInsts = groupReads + groupWrites;
-    privReads
-        .name(name() + ".private_reads")
-        .desc("Number of reads to the private segment")
-    ;
-    privWrites
-        .name(name() + ".private_writes")
-        .desc("Number of writes to the private segment")
-    ;
-    privMemInsts
-        .name(name() + ".private_mem_insts")
-        .desc("Number of memory instructions sent to the private segment")
-    ;
-    privMemInsts = privReads + privWrites;
-    readonlyReads
-        .name(name() + ".readonly_reads")
-        .desc("Number of reads to the readonly segment")
-    ;
-    readonlyWrites
-        .name(name() + ".readonly_writes")
-        .desc("Number of memory instructions sent to the readonly segment")
-    ;
-    readonlyMemInsts
-        .name(name() + ".readonly_mem_insts")
-        .desc("Number of memory instructions sent to the readonly segment")
-    ;
-    readonlyMemInsts = readonlyReads + readonlyWrites;
-    kernargReads
-        .name(name() + ".kernarg_reads")
-        .desc("Number of reads sent to the kernarg segment")
-    ;
-    kernargWrites
-        .name(name() + ".kernarg_writes")
-        .desc("Number of memory instructions sent to the kernarg segment")
-    ;
-    kernargMemInsts
-        .name(name() + ".kernarg_mem_insts")
-        .desc("Number of memory instructions sent to the kernarg segment")
-    ;
-    kernargMemInsts = kernargReads + kernargWrites;
-
-    tlbCycles
-        .name(name() + ".tlb_cycles")
-        .desc("total number of cycles for all uncoalesced requests")
-        ;
-
-    tlbRequests
-        .name(name() + ".tlb_requests")
-        .desc("number of uncoalesced requests")
-        ;
-
-    tlbLatency
-        .name(name() + ".avg_translation_latency")
-        .desc("Avg. translation latency for data translations")
-        ;
-
-    tlbLatency = tlbCycles / tlbRequests;
-
-    hitsPerTLBLevel
-       .init(4)
-       .name(name() + ".TLB_hits_distribution")
-       .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
-       ;
-
-    // fixed number of TLB levels
-    for (int i = 0; i < 4; ++i) {
-        if (!i)
-            hitsPerTLBLevel.subname(i,"page_table");
-        else
-            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
-    }
-
-    execRateDist
-        .init(0, 10, 2)
-        .name(name() + ".inst_exec_rate")
-        .desc("Instruction Execution Rate: Number of executed vector "
-              "instructions per cycle")
-        ;
-
-    ldsBankConflictDist
-       .init(0, wfSize(), 2)
-       .name(name() + ".lds_bank_conflicts")
-       .desc("Number of bank conflicts per LDS memory packet")
-       ;
-
-    ldsBankAccesses
-        .name(name() + ".lds_bank_access_cnt")
-        .desc("Total number of LDS bank accesses")
-        ;
-
-    pageDivergenceDist
-        // A wavefront can touch up to N pages per memory instruction where
-        // N is equal to the wavefront size
-        // The number of pages per bin can be configured (here it's 4).
-       .init(1, wfSize(), 4)
-       .name(name() + ".page_divergence_dist")
-       .desc("pages touched per wf (over all mem. instr.)")
-       ;
-
-    controlFlowDivergenceDist
-        .init(1, wfSize(), 4)
-        .name(name() + ".warp_execution_dist")
-        .desc("number of lanes active per instruction (oval all instructions)")
-        ;
-
-    activeLanesPerGMemInstrDist
-        .init(1, wfSize(), 4)
-        .name(name() + ".gmem_lanes_execution_dist")
-        .desc("number of active lanes per global memory instruction")
-        ;
-
-    activeLanesPerLMemInstrDist
-        .init(1, wfSize(), 4)
-        .name(name() + ".lmem_lanes_execution_dist")
-        .desc("number of active lanes per local memory instruction")
-        ;
-
-    numInstrExecuted
-        .name(name() + ".num_instr_executed")
-        .desc("number of instructions executed")
-        ;
-
-    numVecOpsExecuted
-        .name(name() + ".num_vec_ops_executed")
-        .desc("number of vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedF16
-        .name(name() + ".num_vec_ops_f16_executed")
-        .desc("number of f16 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedF32
-        .name(name() + ".num_vec_ops_f32_executed")
-        .desc("number of f32 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedF64
-        .name(name() + ".num_vec_ops_f64_executed")
-        .desc("number of f64 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedFMA16
-        .name(name() + ".num_vec_ops_fma16_executed")
-        .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedFMA32
-        .name(name() + ".num_vec_ops_fma32_executed")
-        .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedFMA64
-        .name(name() + ".num_vec_ops_fma64_executed")
-        .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedMAD16
-        .name(name() + ".num_vec_ops_mad16_executed")
-        .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedMAD32
-        .name(name() + ".num_vec_ops_mad32_executed")
-        .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedMAD64
-        .name(name() + ".num_vec_ops_mad64_executed")
-        .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedMAC16
-        .name(name() + ".num_vec_ops_mac16_executed")
-        .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedMAC32
-        .name(name() + ".num_vec_ops_mac32_executed")
-        .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedMAC64
-        .name(name() + ".num_vec_ops_mac64_executed")
-        .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
-        ;
-
-    numVecOpsExecutedTwoOpFP
-        .name(name() + ".num_vec_ops_two_op_fp_executed")
-        .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
-        ;
-
-    totalCycles
-        .name(name() + ".num_total_cycles")
-        .desc("number of cycles the CU ran for")
-        ;
-
-    ipc
-        .name(name() + ".ipc")
-        .desc("Instructions per cycle (this CU only)")
-        ;
-
-    vpc
-        .name(name() + ".vpc")
-        .desc("Vector Operations per cycle (this CU only)")
-        ;
-
-    vpc_f16
-        .name(name() + ".vpc_f16")
-        .desc("F16 Vector Operations per cycle (this CU only)")
-        ;
-
-    vpc_f32
-        .name(name() + ".vpc_f32")
-        .desc("F32 Vector Operations per cycle (this CU only)")
-        ;
-
-    vpc_f64
-        .name(name() + ".vpc_f64")
-        .desc("F64 Vector Operations per cycle (this CU only)")
-        ;
-
-    numALUInstsExecuted
-        .name(name() + ".num_alu_insts_executed")
-        .desc("Number of dynamic non-GM memory insts executed")
-        ;
-
-    wgBlockedDueBarrierAllocation
-        .name(name() + ".wg_blocked_due_barrier_alloc")
-        .desc("WG dispatch was blocked due to lack of barrier resources")
-        ;
-
-    wgBlockedDueLdsAllocation
-        .name(name() + ".wg_blocked_due_lds_alloc")
-        .desc("Workgroup blocked due to LDS capacity")
-        ;
-
-    ipc = numInstrExecuted / totalCycles;
-    vpc = numVecOpsExecuted / totalCycles;
-    vpc_f16 = numVecOpsExecutedF16 / totalCycles;
-    vpc_f32 = numVecOpsExecutedF32 / totalCycles;
-    vpc_f64 = numVecOpsExecutedF64 / totalCycles;
-
-    numTimesWgBlockedDueVgprAlloc
-        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
-        .desc("Number of times WGs are blocked due to VGPR allocation per "
-              "SIMD")
-        ;
-
-    numTimesWgBlockedDueSgprAlloc
-        .name(name() + ".times_wg_blocked_due_sgpr_alloc")
-        .desc("Number of times WGs are blocked due to SGPR allocation per "
-              "SIMD")
-        ;
-
-    dynamicGMemInstrCnt
-        .name(name() + ".global_mem_instr_cnt")
-        .desc("dynamic non-flat global memory instruction count")
-        ;
-
-    dynamicFlatMemInstrCnt
-        .name(name() + ".flat_global_mem_instr_cnt")
-        .desc("dynamic flat global memory instruction count")
-        ;
-
-    dynamicLMemInstrCnt
-        .name(name() + ".local_mem_instr_cnt")
-        .desc("dynamic local memory intruction count")
-        ;
-
-    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
-        dynamicLMemInstrCnt;
-
-    completedWfs
-        .name(name() + ".num_completed_wfs")
-        .desc("number of completed wavefronts")
-        ;
-
-    completedWGs
-        .name(name() + ".num_completed_wgs")
-        .desc("number of completed workgroups")
-        ;
-
-    numCASOps
-        .name(name() + ".num_CAS_ops")
-        .desc("number of compare and swap operations")
-        ;
-
-    numFailedCASOps
-        .name(name() + ".num_failed_CAS_ops")
-        .desc("number of compare and swap operations that failed")
-        ;
-
-    headTailLatency
-        .init(0, 1000000, 10000)
-        .name(name() + ".head_tail_latency")
-        .desc("ticks between first and last cache block arrival at coalescer")
-        .flags(Stats::pdf | Stats::oneline)
-        ;
-
-    waveLevelParallelism
-        .init(0, shader->n_wf * numVectorALUs, 1)
-        .name(name() + ".wlp")
-        .desc("wave level parallelism: count of active waves at wave launch")
-        ;
-
-    instInterleave
-        .init(numVectorALUs, 0, 20, 1)
-        .name(name() + ".interleaving")
-        .desc("Measure of instruction interleaving per SIMD")
-        ;
-
-    // register stats of pipeline stages
-    fetchStage.regStats();
-    scoreboardCheckStage.regStats();
-    scheduleStage.regStats();
-    execStage.regStats();
-
-    // register stats of memory pipelines
-    globalMemoryPipe.regStats();
-    localMemoryPipe.regStats();
-    scalarMemoryPipe.regStats();
-
-    registerManager->regStats();
-}
-
 void
 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
 {
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
-            sALUInsts++;
-            instCyclesSALU++;
+            stats.sALUInsts++;
+            stats.instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
-            scalarMemReads++;
+            stats.scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
-            scalarMemWrites++;
+            stats.scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
@@ -2350,45 +1807,46 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
            if (shader->total_valu_insts == shader->max_valu_insts) {
                exitSimLoop("max vALU insts");
            }
-            vALUInsts++;
-            instCyclesVALU++;
-            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
+            stats.vALUInsts++;
+            stats.instCyclesVALU++;
+            stats.threadCyclesVALU
+                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
-                flatLDSInsts++;
+                stats.flatLDSInsts++;
            } else {
-                flatVMemInsts++;
+                stats.flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
-            ldsNoFlatInsts++;
+            stats.ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
-            vectorMemReads++;
+            stats.vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
-            vectorMemWrites++;
+            stats.vectorMemWrites++;
        }

        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
-                spillReads++;
+                stats.spillReads++;
                break;
              case Enums::SC_GLOBAL:
-                globalReads++;
+                stats.globalReads++;
                break;
              case Enums::SC_GROUP:
-                groupReads++;
+                stats.groupReads++;
                break;
              case Enums::SC_PRIVATE:
-                privReads++;
+                stats.privReads++;
                break;
              case Enums::SC_READONLY:
-                readonlyReads++;
+                stats.readonlyReads++;
                break;
              case Enums::SC_KERNARG:
-                kernargReads++;
+                stats.kernargReads++;
                break;
              case Enums::SC_ARG:
-                argReads++;
+                stats.argReads++;
                break;
              case Enums::SC_NONE:
                /**
@@ -2403,25 +1861,25 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
-                spillWrites++;
+                stats.spillWrites++;
                break;
              case Enums::SC_GLOBAL:
-                globalWrites++;
+                stats.globalWrites++;
                break;
              case Enums::SC_GROUP:
-                groupWrites++;
+                stats.groupWrites++;
                break;
              case Enums::SC_PRIVATE:
-                privWrites++;
+                stats.privWrites++;
                break;
              case Enums::SC_READONLY:
-                readonlyWrites++;
+                stats.readonlyWrites++;
                break;
              case Enums::SC_KERNARG:
-                kernargWrites++;
+                stats.kernargWrites++;
                break;
              case Enums::SC_ARG:
-                argWrites++;
+                stats.argWrites++;
                break;
              case Enums::SC_NONE:
                /**
@@ -2636,3 +2094,241 @@ ComputeUnit::LDSPort::recvReqRetry()
        }
    }
 }
+
+ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
+    : Stats::Group(parent),
+      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
+      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
+               "per-wavefront."),
+      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
+      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
+               "per-wavefront."),
+      ADD_STAT(instCyclesVALU,
+               "Number of cycles needed to execute VALU insts."),
+      ADD_STAT(instCyclesSALU,
+               "Number of cycles needed to execute SALU insts."),
+      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
+               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
+               "the number of active threads."),
+      ADD_STAT(vALUUtilization,
+               "Percentage of active vector ALU threads in a wave."),
+      ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
+               " accesses that resolve to LDS."),
+      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
+               "including FLAT accesses that resolve to LDS) per-wavefront."),
+      ADD_STAT(flatVMemInsts,
+               "The number of FLAT insts that resolve to vmem issued."),
+      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
+               "resolve to vmem issued per-wavefront."),
+      ADD_STAT(flatLDSInsts,
+               "The number of FLAT insts that resolve to LDS issued."),
+      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
+               "resolve to LDS issued per-wavefront."),
+      ADD_STAT(vectorMemWrites,
+               "Number of vector mem write insts (excluding FLAT insts)."),
+      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
+               "insts (excluding FLAT insts) per-wavefront."),
+      ADD_STAT(vectorMemReads,
+               "Number of vector mem read insts (excluding FLAT insts)."),
+      ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
+               "(excluding FLAT insts) per-wavefront."),
+      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
+      ADD_STAT(scalarMemWritesPerWF,
+               "The average number of scalar mem write insts per-wavefront."),
+      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
+      ADD_STAT(scalarMemReadsPerWF,
+               "The average number of scalar mem read insts per-wavefront."),
+      ADD_STAT(vectorMemReadsPerKiloInst,
+               "Number of vector mem reads per kilo-instruction"),
+      ADD_STAT(vectorMemWritesPerKiloInst,
+               "Number of vector mem writes per kilo-instruction"),
+      ADD_STAT(vectorMemInstsPerKiloInst,
+               "Number of vector mem insts per kilo-instruction"),
+      ADD_STAT(scalarMemReadsPerKiloInst,
+               "Number of scalar mem reads per kilo-instruction"),
+      ADD_STAT(scalarMemWritesPerKiloInst,
+               "Number of scalar mem writes per kilo-instruction"),
+      ADD_STAT(scalarMemInstsPerKiloInst,
+               "Number of scalar mem insts per kilo-instruction"),
+      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
+               "command, data from VRF to vector memory unit, per SIMD"),
+      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
+               "command, data from SRF to scalar memory unit, per SIMD"),
+      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
+               "command, data from VRF to LDS unit, per SIMD"),
+      ADD_STAT(globalReads, "Number of reads to the global segment"),
+      ADD_STAT(globalWrites, "Number of writes to the global segment"),
+      ADD_STAT(globalMemInsts,
+               "Number of memory instructions sent to the global segment"),
+      ADD_STAT(argReads, "Number of reads to the arg segment"),
+      ADD_STAT(argWrites, "Number of writes to the arg segment"),
+      ADD_STAT(argMemInsts,
+               "Number of memory instructions sent to the arg segment"),
+      ADD_STAT(spillReads, "Number of reads to the spill segment"),
+      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
+      ADD_STAT(spillMemInsts,
+               "Number of memory instructions sent to the spill segment"),
+      ADD_STAT(groupReads, "Number of reads to the group segment"),
+      ADD_STAT(groupWrites, "Number of writes to the group segment"),
+      ADD_STAT(groupMemInsts,
+               "Number of memory instructions sent to the group segment"),
+      ADD_STAT(privReads, "Number of reads to the private segment"),
+      ADD_STAT(privWrites, "Number of writes to the private segment"),
+      ADD_STAT(privMemInsts,
+               "Number of memory instructions sent to the private segment"),
+      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
+      ADD_STAT(readonlyWrites,
+               "Number of writes to the readonly segment"),
+      ADD_STAT(readonlyMemInsts,
+               "Number of memory instructions sent to the readonly segment"),
+      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
+      ADD_STAT(kernargWrites,
+               "Number of writes to the kernarg segment"),
+      ADD_STAT(kernargMemInsts,
+               "Number of memory instructions sent to the kernarg segment"),
+      ADD_STAT(waveLevelParallelism,
+               "wave level parallelism: count of active waves at wave launch"),
+      ADD_STAT(tlbRequests, "number of uncoalesced requests"),
+      ADD_STAT(tlbCycles,
+               "total number of cycles for all uncoalesced requests"),
+      ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
+      ADD_STAT(hitsPerTLBLevel,
+               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
+      ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
+      ADD_STAT(ldsBankConflictDist,
+               "Number of bank conflicts per LDS memory packet"),
+      ADD_STAT(pageDivergenceDist,
+               "pages touched per wf (over all mem. instr.)"),
+      ADD_STAT(dynamicGMemInstrCnt,
+               "dynamic non-flat global memory instruction count"),
+      ADD_STAT(dynamicFlatMemInstrCnt,
+               "dynamic flat global memory instruction count"),
+      ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
+      ADD_STAT(wgBlockedDueBarrierAllocation,
+               "WG dispatch was blocked due to lack of barrier resources"),
+      ADD_STAT(wgBlockedDueLdsAllocation,
+               "Workgroup blocked due to LDS capacity"),
+      ADD_STAT(numInstrExecuted, "number of instructions executed"),
+      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
+               "vector instructions per cycle"),
+      ADD_STAT(numVecOpsExecuted,
+               "number of vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedF16,
+               "number of f16 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedF32,
+               "number of f32 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedF64,
+               "number of f64 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedFMA16,
+               "number of fma16 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedFMA32,
+               "number of fma32 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedFMA64,
+               "number of fma64 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMAC16,
+               "number of mac16 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMAC32,
+               "number of mac32 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMAC64,
+               "number of mac64 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMAD16,
+               "number of mad16 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMAD32,
+               "number of mad32 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMAD64,
+               "number of mad64 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedTwoOpFP,
+               "number of two op FP vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
+      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
+      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
+      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
+      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
+      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
+      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
+               "instruction (over all instructions)"),
+      ADD_STAT(activeLanesPerGMemInstrDist,
+               "number of active lanes per global memory instruction"),
+      ADD_STAT(activeLanesPerLMemInstrDist,
+               "number of active lanes per local memory instruction"),
+      ADD_STAT(numALUInstsExecuted,
+               "Number of dynamic non-GM memory insts executed"),
+      ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
+               "blocked due to VGPR allocation per SIMD"),
+      ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
+               "blocked due to SGPR allocation per SIMD"),
+      ADD_STAT(numCASOps, "number of compare and swap operations"),
+      ADD_STAT(numFailedCASOps,
+               "number of compare and swap operations that failed"),
+      ADD_STAT(completedWfs, "number of completed wavefronts"),
+      ADD_STAT(completedWGs, "number of completed workgroups"),
+      ADD_STAT(headTailLatency, "ticks between first and last cache block "
+               "arrival at coalescer"),
+      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
+{
+    ComputeUnit *cu = static_cast<ComputeUnit*>(parent); // must be a CU
+
+    instCyclesVMemPerSimd.init(cu->numVectorALUs);
+    instCyclesScMemPerSimd.init(cu->numVectorALUs);
+    instCyclesLdsPerSimd.init(cu->numVectorALUs);
+
+    hitsPerTLBLevel.init(4);
+    execRateDist.init(0, 10, 2);
+    ldsBankConflictDist.init(0, cu->wfSize(), 2);
+
+    pageDivergenceDist.init(1, cu->wfSize(), 4);
+    controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
+    activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
+    activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
+
+    headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline);
+    waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
+    instInterleave.init(cu->numVectorALUs, 0, 20, 1);
+
+    vALUInstsPerWF = vALUInsts / completedWfs;
+    sALUInstsPerWF = sALUInsts / completedWfs;
+    vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
+    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
+    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
+    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
+    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
+    vectorMemReadsPerWF = vectorMemReads / completedWfs;
+    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
+    scalarMemReadsPerWF = scalarMemReads / completedWfs;
+
+    vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
+    vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
+    vectorMemInstsPerKiloInst =
+        ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
+    scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
+    scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
+    scalarMemInstsPerKiloInst =
+        ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
+
+    globalMemInsts = globalReads + globalWrites;
+    argMemInsts = argReads + argWrites;
+    spillMemInsts = spillReads + spillWrites;
+    groupMemInsts = groupReads + groupWrites;
+    privMemInsts = privReads + privWrites;
+    readonlyMemInsts = readonlyReads + readonlyWrites;
+    kernargMemInsts = kernargReads + kernargWrites;
+
+    tlbLatency = tlbCycles / tlbRequests;
+
+    // name the 4 buckets: page_table, then L1_TLB through L3_TLB
+    for (int i = 0; i < 4; ++i) {
+        if (!i)
+            hitsPerTLBLevel.subname(i, "page_table");
+        else
+            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
+    }
+
+    ipc = numInstrExecuted / totalCycles;
+    vpc = numVecOpsExecuted / totalCycles;
+    vpc_f16 = numVecOpsExecutedF16 / totalCycles;
+    vpc_f32 = numVecOpsExecutedF32 / totalCycles;
+    vpc_f64 = numVecOpsExecutedF64 / totalCycles;
+
+    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+        dynamicLMemInstrCnt;
+}
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -42,6 +42,7 @@
 #include "base/callback.hh"
 #include "base/compiler.hh"
 #include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "base/types.hh"
 #include "config/the_gpu_isa.hh"
 #include "enums/PrefetchType.hh"
@@ -320,12 +321,6 @@ class ComputeUnit : public ClockedObject
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

-    // Track the amount of interleaving between wavefronts on each SIMD.
-    // This stat is sampled using instExecPerSimd to compute the number of
-    // instructions that have been executed on a SIMD between a WF executing
-    // two successive instructions.
-    Stats::VectorDistribution instInterleave;
-
    // tracks the number of dyn inst executed per SIMD
    std::vector<uint64_t> instExecPerSimd;

@@ -472,148 +467,6 @@ class ComputeUnit : public ClockedObject
    LdsState &lds;

  public:
-    Stats::Scalar vALUInsts;
-    Stats::Formula vALUInstsPerWF;
-    Stats::Scalar sALUInsts;
-    Stats::Formula sALUInstsPerWF;
-    Stats::Scalar instCyclesVALU;
-    Stats::Scalar instCyclesSALU;
-    Stats::Scalar threadCyclesVALU;
-    Stats::Formula vALUUtilization;
-    Stats::Scalar ldsNoFlatInsts;
-    Stats::Formula ldsNoFlatInstsPerWF;
-    Stats::Scalar flatVMemInsts;
-    Stats::Formula flatVMemInstsPerWF;
-    Stats::Scalar flatLDSInsts;
-    Stats::Formula flatLDSInstsPerWF;
-    Stats::Scalar vectorMemWrites;
-    Stats::Formula vectorMemWritesPerWF;
-    Stats::Scalar vectorMemReads;
-    Stats::Formula vectorMemReadsPerWF;
-    Stats::Scalar scalarMemWrites;
-    Stats::Formula scalarMemWritesPerWF;
-    Stats::Scalar scalarMemReads;
-    Stats::Formula scalarMemReadsPerWF;
-
-    Stats::Formula vectorMemReadsPerKiloInst;
-    Stats::Formula vectorMemWritesPerKiloInst;
-    Stats::Formula vectorMemInstsPerKiloInst;
-    Stats::Formula scalarMemReadsPerKiloInst;
-    Stats::Formula scalarMemWritesPerKiloInst;
-    Stats::Formula scalarMemInstsPerKiloInst;
-
-    // Cycles required to send register source (addr and data) from
-    // register files to memory pipeline, per SIMD.
-    Stats::Vector instCyclesVMemPerSimd;
-    Stats::Vector instCyclesScMemPerSimd;
-    Stats::Vector instCyclesLdsPerSimd;
-
-    Stats::Scalar globalReads;
-    Stats::Scalar globalWrites;
-    Stats::Formula globalMemInsts;
-    Stats::Scalar argReads;
-    Stats::Scalar argWrites;
-    Stats::Formula argMemInsts;
-    Stats::Scalar spillReads;
-    Stats::Scalar spillWrites;
-    Stats::Formula spillMemInsts;
-    Stats::Scalar groupReads;
-    Stats::Scalar groupWrites;
-    Stats::Formula groupMemInsts;
-    Stats::Scalar privReads;
-    Stats::Scalar privWrites;
-    Stats::Formula privMemInsts;
-    Stats::Scalar readonlyReads;
-    Stats::Scalar readonlyWrites;
-    Stats::Formula readonlyMemInsts;
-    Stats::Scalar kernargReads;
-    Stats::Scalar kernargWrites;
-    Stats::Formula kernargMemInsts;
-
-    int activeWaves;
-    Stats::Distribution waveLevelParallelism;
-
-    void updateInstStats(GPUDynInstPtr gpuDynInst);
-
-    // the following stats compute the avg. TLB accesslatency per
-    // uncoalesced request (only for data)
-    Stats::Scalar tlbRequests;
-    Stats::Scalar tlbCycles;
-    Stats::Formula tlbLatency;
-    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
-    Stats::Vector hitsPerTLBLevel;
-
-    Stats::Scalar ldsBankAccesses;
-    Stats::Distribution ldsBankConflictDist;
-
-    // over all memory instructions executed over all wavefronts
-    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
-    Stats::Distribution pageDivergenceDist;
-    // count of non-flat global memory vector instructions executed
-    Stats::Scalar dynamicGMemInstrCnt;
-    // count of flat global memory vector instructions executed
-    Stats::Scalar dynamicFlatMemInstrCnt;
-    Stats::Scalar dynamicLMemInstrCnt;
-
-    Stats::Scalar wgBlockedDueBarrierAllocation;
-    Stats::Scalar wgBlockedDueLdsAllocation;
-    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
-    // active when the instruction is committed, this number is still
-    // incremented by 1
-    Stats::Scalar numInstrExecuted;
-    // Number of cycles among successive instruction executions across all
-    // wavefronts of the same CU
-    Stats::Distribution execRateDist;
-    // number of individual vector operations executed
-    Stats::Scalar numVecOpsExecuted;
-    // number of individual f16 vector operations executed
-    Stats::Scalar numVecOpsExecutedF16;
-    // number of individual f32 vector operations executed
-    Stats::Scalar numVecOpsExecutedF32;
-    // number of individual f64 vector operations executed
-    Stats::Scalar numVecOpsExecutedF64;
-    // number of individual FMA 16,32,64 vector operations executed
-    Stats::Scalar numVecOpsExecutedFMA16;
-    Stats::Scalar numVecOpsExecutedFMA32;
-    Stats::Scalar numVecOpsExecutedFMA64;
-    // number of individual MAC 16,32,64 vector operations executed
-    Stats::Scalar numVecOpsExecutedMAC16;
-    Stats::Scalar numVecOpsExecutedMAC32;
-    Stats::Scalar numVecOpsExecutedMAC64;
-    // number of individual MAD 16,32,64 vector operations executed
-    Stats::Scalar numVecOpsExecutedMAD16;
-    Stats::Scalar numVecOpsExecutedMAD32;
-    Stats::Scalar numVecOpsExecutedMAD64;
-    // total number of two op FP vector operations executed
-    Stats::Scalar numVecOpsExecutedTwoOpFP;
-    // Total cycles that something is running on the GPU
-    Stats::Scalar totalCycles;
-    Stats::Formula vpc; // vector ops per cycle
-    Stats::Formula vpc_f16; // vector ops per cycle
-    Stats::Formula vpc_f32; // vector ops per cycle
-    Stats::Formula vpc_f64; // vector ops per cycle
-    Stats::Formula ipc; // vector instructions per cycle
-    Stats::Distribution controlFlowDivergenceDist;
-    Stats::Distribution activeLanesPerGMemInstrDist;
-    Stats::Distribution activeLanesPerLMemInstrDist;
-    // number of vector ALU instructions received
-    Stats::Formula numALUInstsExecuted;
-    // number of times a WG can not start due to lack of free VGPRs in SIMDs
-    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
-    // number of times a WG can not start due to lack of free SGPRs in SIMDs
-    Stats::Scalar numTimesWgBlockedDueSgprAlloc;
-    Stats::Scalar numCASOps;
-    Stats::Scalar numFailedCASOps;
-    Stats::Scalar completedWfs;
-    Stats::Scalar completedWGs;
-
-    // distrubtion in latency difference between first and last cache block
-    // arrival ticks
-    Stats::Distribution headTailLatency;
-
-    void
-    regStats() override;
-
    LdsState &
    getLds() const
    {
@@ -1081,6 +934,158 @@ class ComputeUnit : public ClockedObject
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last chace block arrival times.
    std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
+
+  public:
+    void updateInstStats(GPUDynInstPtr gpuDynInst);
+    int activeWaves;
+
+    struct ComputeUnitStats : public Stats::Group
+    {
+        ComputeUnitStats(Stats::Group *parent, int n_wf); // parent: a CU
+
+        Stats::Scalar vALUInsts;
+        Stats::Formula vALUInstsPerWF;
+        Stats::Scalar sALUInsts;
+        Stats::Formula sALUInstsPerWF;
+        Stats::Scalar instCyclesVALU;
+        Stats::Scalar instCyclesSALU;
+        Stats::Scalar threadCyclesVALU;
+        Stats::Formula vALUUtilization;
+        Stats::Scalar ldsNoFlatInsts;
+        Stats::Formula ldsNoFlatInstsPerWF;
+        Stats::Scalar flatVMemInsts;
+        Stats::Formula flatVMemInstsPerWF;
+        Stats::Scalar flatLDSInsts;
+        Stats::Formula flatLDSInstsPerWF;
+        Stats::Scalar vectorMemWrites;
+        Stats::Formula vectorMemWritesPerWF;
+        Stats::Scalar vectorMemReads;
+        Stats::Formula vectorMemReadsPerWF;
+        Stats::Scalar scalarMemWrites;
+        Stats::Formula scalarMemWritesPerWF;
+        Stats::Scalar scalarMemReads;
+        Stats::Formula scalarMemReadsPerWF;
+
+        Stats::Formula vectorMemReadsPerKiloInst;
+        Stats::Formula vectorMemWritesPerKiloInst;
+        Stats::Formula vectorMemInstsPerKiloInst;
+        Stats::Formula scalarMemReadsPerKiloInst;
+        Stats::Formula scalarMemWritesPerKiloInst;
+        Stats::Formula scalarMemInstsPerKiloInst;
+
+        // Cycles required to send register source (addr and data) from
+        // register files to memory pipeline, per SIMD.
+        Stats::Vector instCyclesVMemPerSimd;
+        Stats::Vector instCyclesScMemPerSimd;
+        Stats::Vector instCyclesLdsPerSimd;
+
+        Stats::Scalar globalReads;
+        Stats::Scalar globalWrites;
+        Stats::Formula globalMemInsts;
+        Stats::Scalar argReads;
+        Stats::Scalar argWrites;
+        Stats::Formula argMemInsts;
+        Stats::Scalar spillReads;
+        Stats::Scalar spillWrites;
+        Stats::Formula spillMemInsts;
+        Stats::Scalar groupReads;
+        Stats::Scalar groupWrites;
+        Stats::Formula groupMemInsts;
+        Stats::Scalar privReads;
+        Stats::Scalar privWrites;
+        Stats::Formula privMemInsts;
+        Stats::Scalar readonlyReads;
+        Stats::Scalar readonlyWrites;
+        Stats::Formula readonlyMemInsts;
+        Stats::Scalar kernargReads;
+        Stats::Scalar kernargWrites;
+        Stats::Formula kernargMemInsts;
+
+        Stats::Distribution waveLevelParallelism;
+
+        // the following stats compute the avg. TLB access latency per
+        // uncoalesced request (only for data)
+        Stats::Scalar tlbRequests;
+        Stats::Scalar tlbCycles;
+        Stats::Formula tlbLatency;
+        // hitsPerTLBLevel[x] are the hits in Level x TLB.
+        // x = 0 is the page table.
+        Stats::Vector hitsPerTLBLevel;
+
+        Stats::Scalar ldsBankAccesses;
+        Stats::Distribution ldsBankConflictDist;
+
+        // over all memory instructions executed over all wavefronts
+        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+        Stats::Distribution pageDivergenceDist;
+        // count of non-flat global memory vector instructions executed
+        Stats::Scalar dynamicGMemInstrCnt;
+        // count of flat global memory vector instructions executed
+        Stats::Scalar dynamicFlatMemInstrCnt;
+        Stats::Scalar dynamicLMemInstrCnt;
+
+        Stats::Scalar wgBlockedDueBarrierAllocation;
+        Stats::Scalar wgBlockedDueLdsAllocation;
+        // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
+        // active when the instruction is committed, this number is still
+        // incremented by 1
+        Stats::Scalar numInstrExecuted;
+        // Number of cycles among successive instruction executions across all
+        // wavefronts of the same CU
+        Stats::Distribution execRateDist;
+        // number of individual vector operations executed
+        Stats::Scalar numVecOpsExecuted;
+        // number of individual f16 vector operations executed
+        Stats::Scalar numVecOpsExecutedF16;
+        // number of individual f32 vector operations executed
+        Stats::Scalar numVecOpsExecutedF32;
+        // number of individual f64 vector operations executed
+        Stats::Scalar numVecOpsExecutedF64;
+        // number of individual FMA 16,32,64 vector operations executed
+        Stats::Scalar numVecOpsExecutedFMA16;
+        Stats::Scalar numVecOpsExecutedFMA32;
+        Stats::Scalar numVecOpsExecutedFMA64;
+        // number of individual MAC 16,32,64 vector operations executed
+        Stats::Scalar numVecOpsExecutedMAC16;
+        Stats::Scalar numVecOpsExecutedMAC32;
+        Stats::Scalar numVecOpsExecutedMAC64;
+        // number of individual MAD 16,32,64 vector operations executed
+        Stats::Scalar numVecOpsExecutedMAD16;
+        Stats::Scalar numVecOpsExecutedMAD32;
+        Stats::Scalar numVecOpsExecutedMAD64;
+        // total number of two op FP vector operations executed
+        Stats::Scalar numVecOpsExecutedTwoOpFP;
+        // Total cycles that something is running on the GPU
+        Stats::Scalar totalCycles;
+        Stats::Formula vpc; // vector ops per cycle
+        Stats::Formula vpc_f16; // f16 vector ops per cycle
+        Stats::Formula vpc_f32; // f32 vector ops per cycle
+        Stats::Formula vpc_f64; // f64 vector ops per cycle
+        Stats::Formula ipc; // instructions per cycle
+        Stats::Distribution controlFlowDivergenceDist;
+        Stats::Distribution activeLanesPerGMemInstrDist;
+        Stats::Distribution activeLanesPerLMemInstrDist;
+        // executed insts minus global- and local-memory inst counts
+        Stats::Formula numALUInstsExecuted;
+        // number of times a WG cannot start due to lack of free VGPRs in SIMDs
+        Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+        // number of times a WG cannot start due to lack of free SGPRs in SIMDs
+        Stats::Scalar numTimesWgBlockedDueSgprAlloc;
+        Stats::Scalar numCASOps;
+        Stats::Scalar numFailedCASOps;
+        Stats::Scalar completedWfs;
+        Stats::Scalar completedWGs;
+
+        // distribution of the latency between first and last cache block
+        // arrival ticks
+        Stats::Distribution headTailLatency;
+
+        // Track the amount of interleaving between wavefronts on each SIMD.
+        // This stat is sampled using instExecPerSimd to compute the number
+        // of instructions that have been executed on a SIMD between a WF
+        // executing two successive instructions.
+        Stats::VectorDistribution instInterleave;
+    } stats;
 };

 #endif // __COMPUTE_UNIT_HH__
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -49,7 +49,7 @@ GPUDispatcher::GPUDispatcher(const Params &p)
    : SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
      tickEvent([this]{ exec(); },
          "GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
-      dispatchActive(false)
+      dispatchActive(false), stats(this)
 {
    schedule(&tickEvent, 0);
 }
@@ -58,21 +58,6 @@ GPUDispatcher::~GPUDispatcher()
 {
 }

-void
-GPUDispatcher::regStats()
-{
-    numKernelLaunched
-    .name(name() + ".num_kernel_launched")
-    .desc("number of kernel launched")
-    ;
-
-    cyclesWaitingForDispatch
-    .name(name() + ".cycles_wait_dispatch")
-    .desc("number of cycles with outstanding wavefronts "
-          "that are waiting to be dispatched")
-    ;
-}
-
 HSAQueueEntry*
 GPUDispatcher::hsaTask(int disp_id)
 {
@@ -127,7 +112,7 @@ GPUDispatcher::unserialize(CheckpointIn &cp)
 void
 GPUDispatcher::dispatch(HSAQueueEntry *task)
 {
-    ++numKernelLaunched;
+    ++stats.numKernelLaunched;

    DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
            task->kernelName(), task->dispatchId());
@@ -158,7 +143,7 @@ GPUDispatcher::exec()
    DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());

    if (execIds.size() > 0) {
-        ++cyclesWaitingForDispatch;
+        ++stats.cyclesWaitingForDispatch;
    }

    /**
@@ -368,3 +353,11 @@ GPUDispatcher::scheduleDispatch()
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
 }
+
+GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent)
+    : Stats::Group(parent), // the dispatcher passes itself: stats(this)
+      ADD_STAT(numKernelLaunched, "number of kernel launched"),
+      ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding "
+               "wavefronts that are waiting to be dispatched")
+{ // both stats are plain Scalars, so there is nothing to init() here
+}
--- a/src/gpu-compute/dispatcher.hh
+++ b/src/gpu-compute/dispatcher.hh
@@ -48,6 +48,7 @@
 #include <vector>

 #include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "params/GPUDispatcher.hh"
 #include "sim/sim_object.hh"
@@ -67,7 +68,6 @@ class GPUDispatcher : public SimObject

    void serialize(CheckpointOut &cp) const override;
    void unserialize(CheckpointIn &cp) override;
-    void regStats() override;
    void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
    void setShader(Shader *new_shader);
    void exec();
@@ -91,9 +91,15 @@ class GPUDispatcher : public SimObject
    std::queue<int> doneIds;
    // is there a kernel in execution?
    bool dispatchActive;
-    /*statistics*/
-    Stats::Scalar numKernelLaunched;
-    Stats::Scalar cyclesWaitingForDispatch;
+
+  protected:
+    struct GPUDispatcherStats : public Stats::Group
+    {
+        GPUDispatcherStats(Stats::Group *parent);
+
+        Stats::Scalar numKernelLaunched; // bumped once per dispatch() call
+        Stats::Scalar cyclesWaitingForDispatch; // exec() calls with execIds
+    } stats;
 };

 #endif // __GPU_COMPUTE_DISPATCHER_HH__
--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -46,10 +46,11 @@ ExecStage::ExecStage(const ComputeUnitParams &p, ComputeUnit &cu,
    : computeUnit(cu), fromSchedule(from_schedule),
      lastTimeInstExecuted(false),
      thisTimeInstExecuted(false), instrExecuted (false),
-      executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
+      executionResourcesUsed(0), _name(cu.name() + ".ExecStage"),
+      stats(&cu)

 {
-    numTransActiveIdle = 0;
+    stats.numTransActiveIdle = 0;
    idle_dur = 0;
 }

@@ -64,22 +65,22 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
    if (stage == IdleExec) {
        // count cycles when no instruction to a specific execution resource
        // is executed
-        numCyclesWithNoInstrTypeIssued[unitId]++;
+        stats.numCyclesWithNoInstrTypeIssued[unitId]++;
    } else if (stage == BusyExec) {
        // count the number of cycles an instruction to a specific execution
        // resource type was issued
-        numCyclesWithInstrTypeIssued[unitId]++;
+        stats.numCyclesWithInstrTypeIssued[unitId]++;
        thisTimeInstExecuted = true;
        instrExecuted = true;
        ++executionResourcesUsed;
    } else if (stage == PostExec) {
        // count the number of transitions from active to idle
        if (lastTimeInstExecuted && !thisTimeInstExecuted) {
-            ++numTransActiveIdle;
+            ++stats.numTransActiveIdle;
        }

        if (!lastTimeInstExecuted && thisTimeInstExecuted) {
-            idleDur.sample(idle_dur);
+            stats.idleDur.sample(idle_dur);
            idle_dur = 0;
        } else if (!thisTimeInstExecuted) {
            idle_dur++;
@@ -89,11 +90,11 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
        // track the number of cycles we either issued at least
        // instruction or issued no instructions at all
        if (instrExecuted) {
-            numCyclesWithInstrIssued++;
+            stats.numCyclesWithInstrIssued++;
        } else {
-            numCyclesWithNoIssue++;
+            stats.numCyclesWithNoIssue++;
        }
-        spc.sample(executionResourcesUsed);
+        stats.spc.sample(executionResourcesUsed);
    }
 }

@@ -196,57 +197,35 @@ ExecStage::exec()
    collectStatistics(PostExec, 0);
 }

-void
-ExecStage::regStats()
+ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent)
+    : Stats::Group(parent, "ExecStage"),
+      ADD_STAT(numTransActiveIdle,
+               "number of CU transitions from active to idle"),
+      ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"),
+      ADD_STAT(numCyclesWithInstrIssued,
+               "number of cycles the CU issued at least one instruction"),
+      ADD_STAT(spc,
+               "Execution units active per cycle (Exec unit=SIMD,MemPipe)"),
+      ADD_STAT(idleDur, "duration of idle periods in cycles"),
+      ADD_STAT(numCyclesWithInstrTypeIssued, "Number of cycles at least one "
+               "instruction issued to execution resource type"),
+      ADD_STAT(numCyclesWithNoInstrTypeIssued, "Number of clks no instructions"
+               " issued to execution resource type")
 {
-    numTransActiveIdle
-       .name(name() + ".num_transitions_active_to_idle")
-       .desc("number of CU transitions from active to idle")
-        ;
+    ComputeUnit *compute_unit = static_cast<ComputeUnit*>(parent);

-    numCyclesWithNoIssue
-        .name(name() + ".num_cycles_with_no_issue")
-        .desc("number of cycles the CU issues nothing")
-        ;
-
-    numCyclesWithInstrIssued
-        .name(name() + ".num_cycles_with_instr_issued")
-        .desc("number of cycles the CU issued at least one instruction")
-        ;
-
-    spc
-        .init(0, computeUnit.numExeUnits(), 1)
-        .name(name() + ".spc")
-        .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
-        ;
-
-    idleDur
-        .init(0,75,5)
-        .name(name() + ".idle_duration_in_cycles")
-        .desc("duration of idle periods in cycles")
-        ;
-
-    numCyclesWithInstrTypeIssued
-        .init(computeUnit.numExeUnits())
-        .name(name() + ".num_cycles_issue_exec_rsrc")
-        .desc("Number of cycles at least one instruction issued to "
-              "execution resource type")
-        ;
-
-    numCyclesWithNoInstrTypeIssued
-        .init(computeUnit.numExeUnits())
-       .name(name() + ".num_cycles_no_issue_exec_rsrc")
-       .desc("Number of clks no instructions issued to execution "
-             "resource type")
-       ;
+    spc.init(0, compute_unit->numExeUnits(), 1);
+    idleDur.init(0, 75, 5);
+    numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits());
+    numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits());

    int c = 0;
-    for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
+    for (int i = 0; i < compute_unit->numVectorALUs; i++,c++) {
        std::string s = "VectorALU" + std::to_string(i);
        numCyclesWithNoInstrTypeIssued.subname(c, s);
        numCyclesWithInstrTypeIssued.subname(c, s);
    }
-    for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
+    for (int i = 0; i < compute_unit->numScalarALUs; i++,c++) {
        std::string s = "ScalarALU" + std::to_string(i);
        numCyclesWithNoInstrTypeIssued.subname(c, s);
        numCyclesWithInstrTypeIssued.subname(c, s);
@@ -256,7 +235,4 @@ ExecStage::regStats()

    numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
    numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
-
-    numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
-    numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
 }
--- a/src/gpu-compute/exec_stage.hh
+++ b/src/gpu-compute/exec_stage.hh
@@ -39,7 +39,8 @@
 #include <utility>
 #include <vector>

-#include "sim/stats.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"

 class ComputeUnit;
 class ScheduleToExecute;
@@ -81,20 +82,6 @@ class ExecStage
    void dumpDispList();

    const std::string& name() const { return _name; }
-    void regStats();
-    // number of idle cycles
-    Stats::Scalar numCyclesWithNoIssue;
-    // number of busy cycles
-    Stats::Scalar numCyclesWithInstrIssued;
-    // number of cycles during which at least one
-    // instruction was issued to an execution resource type
-    Stats::Vector numCyclesWithInstrTypeIssued;
-    // number of idle cycles during which the scheduler
-    // issued no instructions targeting a specific
-    // execution resource type
-    Stats::Vector numCyclesWithNoInstrTypeIssued;
-    // SIMDs active per cycle
-    Stats::Distribution spc;

  private:
    void collectStatistics(enum STAT_STATUS stage, int unitId);
@@ -105,11 +92,33 @@ class ExecStage
    bool lastTimeInstExecuted;
    bool thisTimeInstExecuted;
    bool instrExecuted;
-    Stats::Scalar  numTransActiveIdle;
-    Stats::Distribution idleDur;
    int executionResourcesUsed;
    uint64_t idle_dur;
    const std::string _name;
+
+  protected:
+    struct ExecStageStats : public Stats::Group  // ExecStage statistics
+    {
+        ExecStageStats(Stats::Group *parent);
+
+        // number of transitions from active to idle
+        Stats::Scalar numTransActiveIdle;
+        // number of cycles in which the CU issued nothing
+        Stats::Scalar numCyclesWithNoIssue;
+        // number of cycles in which at least one instruction was issued
+        Stats::Scalar numCyclesWithInstrIssued;
+        // execution units (SIMD, MemPipe) active per cycle
+        Stats::Distribution spc;
+        // duration of idle periods in cycles
+        Stats::Distribution idleDur;
+        // per execution resource: number of cycles during which at
+        // least one instruction was issued to that resource type
+        Stats::Vector numCyclesWithInstrTypeIssued;
+        // per execution resource: number of cycles during which
+        // the scheduler issued no instructions targeting that
+        // execution resource type
+        Stats::Vector numCyclesWithNoInstrTypeIssued;
+    } stats;
 };

 #endif // __EXEC_STAGE_HH__
--- a/src/gpu-compute/fetch_stage.cc
+++ b/src/gpu-compute/fetch_stage.cc
@@ -38,7 +38,7 @@

 FetchStage::FetchStage(const ComputeUnitParams &p, ComputeUnit &cu)
    : numVectorALUs(p.num_SIMDs), computeUnit(cu),
-      _name(cu.name() + ".FetchStage")
+      _name(cu.name() + ".FetchStage"), stats(&cu)
 {
    for (int j = 0; j < numVectorALUs; ++j) {
        FetchUnit newFetchUnit(p, cu);
@@ -79,7 +79,7 @@ FetchStage::processFetchReturn(PacketPtr pkt)
    const unsigned num_instructions = pkt->req->getSize() /
        sizeof(TheGpuISA::RawMachInst);

-    instFetchInstReturned.sample(num_instructions);
+    stats.instFetchInstReturned.sample(num_instructions);
    uint32_t simdId = wavefront->simdId;
    _fetchUnit[simdId].processFetchReturn(pkt);
 }
@@ -90,13 +90,10 @@ FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
    _fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
 }

-void
-FetchStage::regStats()
+FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent)
+    : Stats::Group(parent, "FetchStage"),  // group is named "FetchStage"
+      ADD_STAT(instFetchInstReturned, "For each instruction fetch request "
+               "received record how many instructions you got from it")
 {
-    instFetchInstReturned
-        .init(1, 32, 1)
-        .name(name() + ".inst_fetch_instr_returned")
-        .desc("For each instruction fetch request recieved record how many "
-              "instructions you got from it")
-        ;
+    instFetchInstReturned.init(1, 32, 1);  // (min, max, bucket size)
 }
--- a/src/gpu-compute/fetch_stage.hh
+++ b/src/gpu-compute/fetch_stage.hh
@@ -38,6 +38,7 @@
 #include <vector>

 #include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "gpu-compute/fetch_unit.hh"

 // Instruction fetch stage.
@@ -61,8 +62,6 @@ class FetchStage

    // Stats related variables and methods
    const std::string& name() const { return _name; }
-    void regStats();
-    Stats::Distribution instFetchInstReturned;
    FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }

  private:
@@ -73,6 +72,14 @@ class FetchStage
    // instantiated per VALU/SIMD
    std::vector<FetchUnit> _fetchUnit;
    const std::string _name;
+
+  protected:
+    struct FetchStageStats : public Stats::Group  // FetchStage statistics
+    {
+        FetchStageStats(Stats::Group *parent);
+        // instructions returned by each instruction fetch request
+        Stats::Distribution instFetchInstReturned;
+    } stats;
 };

 #endif // __FETCH_STAGE_HH__
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -48,7 +48,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p.global_mem_queue_size),
      maxWaveRequests(p.max_wave_requests), inflightStores(0),
-      inflightLoads(0)
+      inflightLoads(0), stats(&cu)
 {
 }

@@ -293,12 +293,10 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
    mem_req->second.second = true;
 }

-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::  // constructor of the nested stats group
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+    : Stats::Group(parent, "GlobalMemPipeline"),  // group name
+      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+               "are delayed before updating the VRF")
 {
-    loadVrfBankConflictCycles
-        .name(name() + ".load_vrf_bank_conflict_cycles")
-        .desc("total number of cycles GM data are delayed before updating "
-              "the VRF")
-        ;
 }
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -37,6 +37,8 @@
 #include <queue>
 #include <string>

+#include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "gpu-compute/misc.hh"
 #include "params/ComputeUnit.hh"
 #include "sim/stats.hh"
@@ -95,11 +97,10 @@ class GlobalMemPipeline
    }

    const std::string &name() const { return _name; }
-    void regStats();
    void
    incLoadVRFBankConflictCycles(int num_cycles)
    {
-        loadVrfBankConflictCycles += num_cycles;
+        stats.loadVrfBankConflictCycles += num_cycles; // total delay cycles
    }

    bool coalescerReady(GPUDynInstPtr mp) const;
@@ -113,10 +114,6 @@ class GlobalMemPipeline
    int gmQueueSize;
    int maxWaveRequests;

-    // number of cycles of delaying the update of a VGPR that is the
-    // target of a load instruction (or the load component of an atomic)
-    // The delay is due to VRF bank conflicts
-    Stats::Scalar loadVrfBankConflictCycles;
    // Counters to track the inflight loads and stores
    // so that we can provide the proper backpressure
    // on the number of inflight memory operations.
@@ -144,6 +141,17 @@ class GlobalMemPipeline
    // Global Memory Request FIFO: all global memory requests
    // are issued to this FIFO from the memory pipelines
    std::queue<GPUDynInstPtr> gmIssuedRequests;
+
+  protected:
+    struct GlobalMemPipelineStats : public Stats::Group
+    {
+        GlobalMemPipelineStats(Stats::Group *parent);
+
+        // Number of cycles by which the update of a VGPR that is the
+        // target of a load instruction (or the load component of an
+        // atomic) is delayed; the delay is due to VRF bank conflicts.
+        Stats::Scalar loadVrfBankConflictCycles;
+    } stats;
 };

 #endif // __GLOBAL_MEMORY_PIPELINE_HH__
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -930,16 +930,16 @@ GPUDynInst::updateStats()
 {
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
-        cu->dynamicLMemInstrCnt++;
+        cu->stats.dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
-        cu->dynamicFlatMemInstrCnt++;
+        cu->stats.dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory

        // update PageDivergence histogram
        int number_pages_touched = cu->pagesTouched.size();
        assert(number_pages_touched);
-        cu->pageDivergenceDist.sample(number_pages_touched);
+        cu->stats.pageDivergenceDist.sample(number_pages_touched);

        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;

@@ -962,7 +962,7 @@ GPUDynInst::updateStats()
        // total number of memory instructions (dynamic)
        // Atomics are counted as a single memory instruction.
        // this is # memory instructions per wavefronts, not per workitem
-        cu->dynamicGMemInstrCnt++;
+        cu->stats.dynamicGMemInstrCnt++;
    }
 }

--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -63,12 +63,12 @@ class AtomicOpCAS : public TypedAtomicOpFunctor<T>
    void
    execute(T *b)
    {
-        computeUnit->numCASOps++;
+        computeUnit->stats.numCASOps++;  // counted for every CAS attempt

        if (*b == c) {
            *b = s;
        } else {
-            computeUnit->numFailedCASOps++;
+            computeUnit->stats.numFailedCASOps++;  // *b did not equal c
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
--- a/src/gpu-compute/gpu_tlb.cc
+++ b/src/gpu-compute/gpu_tlb.cc
@@ -67,7 +67,7 @@ namespace X86ISA
        : ClockedObject(p), configAddress(0), size(p.size),
          cleanupEvent([this]{ cleanup(); }, name(), false,
                       Event::Maximum_Pri),
-          exitEvent([this]{ exitCallback(); }, name())
+          exitEvent([this]{ exitCallback(); }, name()), stats(this)
    {
        assoc = p.assoc;
        assert(assoc <= size);
@@ -402,12 +402,12 @@ namespace X86ISA
                    return tlb_hit;
                }

-                localNumTLBAccesses++;
+                stats.localNumTLBAccesses++;

                if (!entry) {
-                    localNumTLBMisses++;
+                    stats.localNumTLBMisses++;
                } else {
-                    localNumTLBHits++;
+                    stats.localNumTLBHits++;
                }
            }
        }
@@ -499,10 +499,10 @@ namespace X86ISA
                DPRINTF(GPUTLB, "Paging enabled.\n");
                // The vaddr already has the segment base applied.
                TlbEntry *entry = lookup(vaddr);
-                localNumTLBAccesses++;
+                stats.localNumTLBAccesses++;

                if (!entry) {
-                    localNumTLBMisses++;
+                    stats.localNumTLBMisses++;
                    if (timing) {
                        latency = missLatency1;
                    }
@@ -544,7 +544,7 @@ namespace X86ISA
                        DPRINTF(GPUTLB, "Miss was serviced.\n");
                    }
                } else {
-                    localNumTLBHits++;
+                    stats.localNumTLBHits++;

                    if (timing) {
                        latency = hitLatency;
@@ -659,89 +659,6 @@ namespace X86ISA
    {
    }

-    void
-    GpuTLB::regStats()
-    {
-        ClockedObject::regStats();
-
-        localNumTLBAccesses
-            .name(name() + ".local_TLB_accesses")
-            .desc("Number of TLB accesses")
-            ;
-
-        localNumTLBHits
-            .name(name() + ".local_TLB_hits")
-            .desc("Number of TLB hits")
-            ;
-
-        localNumTLBMisses
-            .name(name() + ".local_TLB_misses")
-            .desc("Number of TLB misses")
-            ;
-
-        localTLBMissRate
-            .name(name() + ".local_TLB_miss_rate")
-            .desc("TLB miss rate")
-            ;
-
-        accessCycles
-            .name(name() + ".access_cycles")
-            .desc("Cycles spent accessing this TLB level")
-            ;
-
-        pageTableCycles
-            .name(name() + ".page_table_cycles")
-            .desc("Cycles spent accessing the page table")
-            ;
-
-        localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
-
-        numUniquePages
-            .name(name() + ".unique_pages")
-            .desc("Number of unique pages touched")
-            ;
-
-        localCycles
-            .name(name() + ".local_cycles")
-            .desc("Number of cycles spent in queue for all incoming reqs")
-            ;
-
-        localLatency
-            .name(name() + ".local_latency")
-            .desc("Avg. latency over incoming coalesced reqs")
-            ;
-
-        localLatency = localCycles / localNumTLBAccesses;
-
-        globalNumTLBAccesses
-            .name(name() + ".global_TLB_accesses")
-            .desc("Number of TLB accesses")
-            ;
-
-        globalNumTLBHits
-            .name(name() + ".global_TLB_hits")
-            .desc("Number of TLB hits")
-            ;
-
-        globalNumTLBMisses
-            .name(name() + ".global_TLB_misses")
-            .desc("Number of TLB misses")
-            ;
-
-        globalTLBMissRate
-            .name(name() + ".global_TLB_miss_rate")
-            .desc("TLB miss rate")
-            ;
-
-        globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
-
-        avgReuseDistance
-            .name(name() + ".avg_reuse_distance")
-            .desc("avg. reuse distance over all pages (in ticks)")
-            ;
-
-    }
-
    /**
     * Do the TLB lookup for this coalesced request and schedule
     * another event <TLB access latency> cycles later.
@@ -768,10 +685,10 @@ namespace X86ISA
        int req_cnt = sender_state->reqCnt.back();

        if (update_stats) {
-            accessCycles -= (curTick() * req_cnt);
-            localCycles -= curTick();
+            stats.accessCycles -= (curTick() * req_cnt);
+            stats.localCycles -= curTick();
            updatePageFootprint(virt_page_addr);
-            globalNumTLBAccesses += req_cnt;
+            stats.globalNumTLBAccesses += req_cnt;
        }

        tlbOutcome lookup_outcome = TLB_MISS;
@@ -795,11 +712,11 @@ namespace X86ISA
                // the reqCnt has an entry per level, so its size tells us
                // which level we are in
                sender_state->hitLevel = sender_state->reqCnt.size();
-                globalNumTLBHits += req_cnt;
+                stats.globalNumTLBHits += req_cnt;
            }
        } else {
            if (update_stats)
-                globalNumTLBMisses += req_cnt;
+                stats.globalNumTLBMisses += req_cnt;
        }

        /*
@@ -981,16 +898,16 @@ namespace X86ISA
            handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);

            if (update_stats) {
-                accessCycles += (req_cnt * curTick());
-                localCycles += curTick();
+                stats.accessCycles += (req_cnt * curTick());
+                stats.localCycles += curTick();
            }

        } else if (outcome == TLB_MISS) {

            DPRINTF(GPUTLB, "This is a TLB miss\n");
            if (update_stats) {
-                accessCycles += (req_cnt*curTick());
-                localCycles += curTick();
+                stats.accessCycles += (req_cnt*curTick());
+                stats.localCycles += curTick();
            }

            if (hasMemSidePort) {
@@ -998,8 +915,8 @@ namespace X86ISA
                // the reply back till when we propagate it to the coalescer
                // above.
                if (update_stats) {
-                    accessCycles += (req_cnt * 1);
-                    localCycles += 1;
+                    stats.accessCycles += (req_cnt * 1);
+                    stats.localCycles += 1;
                }

                /**
@@ -1022,7 +939,7 @@ namespace X86ISA
                        "addr %#x\n", virtPageAddr);

                if (update_stats)
-                    pageTableCycles -= (req_cnt*curTick());
+                    stats.pageTableCycles -= (req_cnt*curTick());

                TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
                assert(tlb_event);
@@ -1032,7 +949,7 @@ namespace X86ISA
            }
        } else if (outcome == PAGE_WALK) {
            if (update_stats)
-                pageTableCycles += (req_cnt*curTick());
+                stats.pageTableCycles += (req_cnt*curTick());

            // Need to access the page table and update the TLB
            DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
@@ -1222,17 +1139,17 @@ namespace X86ISA
        // functional mode means no coalescing
        // global metrics are the same as the local metrics
        if (update_stats) {
-            tlb->globalNumTLBAccesses++;
+            tlb->stats.globalNumTLBAccesses++;

            if (success) {
                sender_state->hitLevel = sender_state->reqCnt.size();
-                tlb->globalNumTLBHits++;
+                tlb->stats.globalNumTLBHits++;
            }
        }

        if (!success) {
            if (update_stats)
-                tlb->globalNumTLBMisses++;
+                tlb->stats.globalNumTLBMisses++;
            if (tlb->hasMemSidePort) {
                // there is a TLB below -> propagate down the TLB hierarchy
                tlb->memSidePort[0]->sendFunctional(pkt);
@@ -1405,7 +1322,7 @@ namespace X86ISA
        bool first_page_access = ret.second;

        if (first_page_access) {
-            numUniquePages++;
+            stats.numUniquePages++;
        } else  {
            int accessed_before;
            accessed_before  = curTick() - ret.first->second.lastTimeAccessed;
@@ -1417,7 +1334,7 @@ namespace X86ISA

        if (accessDistance) {
            ret.first->second.localTLBAccesses
-                .push_back(localNumTLBAccesses.value());
+                .push_back(stats.localNumTLBAccesses.value());
        }
    }

@@ -1506,11 +1423,36 @@ namespace X86ISA
        }

        if (!TLBFootprint.empty()) {
-            avgReuseDistance =
+            stats.avgReuseDistance =
                sum_avg_reuse_distance_per_page / TLBFootprint.size();
        }

        //clear the TLBFootprint map
        TLBFootprint.clear();
    }
+
+    GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent)
+        : Stats::Group(parent),  // no group name is passed here
+          ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"),
+          ADD_STAT(localNumTLBHits, "Number of TLB hits"),
+          ADD_STAT(localNumTLBMisses, "Number of TLB misses"),
+          ADD_STAT(localTLBMissRate, "TLB miss rate"),
+          ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"),
+          ADD_STAT(globalNumTLBHits, "Number of TLB hits"),
+          ADD_STAT(globalNumTLBMisses, "Number of TLB misses"),
+          ADD_STAT(globalTLBMissRate, "TLB miss rate"),
+          ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"),
+          ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"),
+          ADD_STAT(numUniquePages, "Number of unique pages touched"),
+          ADD_STAT(localCycles, "Number of cycles spent in queue for all "
+                   "incoming reqs"),
+          ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs"),
+          ADD_STAT(avgReuseDistance, "avg. reuse distance over all pages (in "
+                   "ticks)")
+    {  // set up the three Stats::Formula members
+        localLatency = localCycles / localNumTLBAccesses;
+        // miss rates below are scaled by 100 (percentages)
+        localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
+        globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
+    }
 } // namespace X86ISA
--- a/src/gpu-compute/gpu_tlb.hh
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -47,6 +47,7 @@
 #include "base/callback.hh"
 #include "base/logging.hh"
 #include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "mem/port.hh"
 #include "mem/request.hh"
@@ -169,35 +170,6 @@ namespace X86ISA
        int missLatency1;
        int missLatency2;

-        // local_stats are as seen from the TLB
-        // without taking into account coalescing
-        Stats::Scalar localNumTLBAccesses;
-        Stats::Scalar localNumTLBHits;
-        Stats::Scalar localNumTLBMisses;
-        Stats::Formula localTLBMissRate;
-
-        // global_stats are as seen from the
-        // CU's perspective taking into account
-        // all coalesced requests.
-        Stats::Scalar globalNumTLBAccesses;
-        Stats::Scalar globalNumTLBHits;
-        Stats::Scalar globalNumTLBMisses;
-        Stats::Formula globalTLBMissRate;
-
-        // from the CU perspective (global)
-        Stats::Scalar accessCycles;
-        // from the CU perspective (global)
-        Stats::Scalar pageTableCycles;
-        Stats::Scalar numUniquePages;
-        // from the perspective of this TLB
-        Stats::Scalar localCycles;
-        // from the perspective of this TLB
-        Stats::Formula localLatency;
-        // I take the avg. per page and then
-        // the avg. over all pages.
-        Stats::Scalar avgReuseDistance;
-
-        void regStats() override;
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();

@@ -426,6 +398,40 @@ namespace X86ISA
        void exitCallback();

        EventFunctionWrapper exitEvent;
+
+      protected:
+        struct GpuTLBStats : public Stats::Group
+        {
+            GpuTLBStats(Stats::Group *parent);
+
+            // Local stats: as seen from this TLB,
+            // without taking coalescing into account.
+            Stats::Scalar localNumTLBAccesses;
+            Stats::Scalar localNumTLBHits;
+            Stats::Scalar localNumTLBMisses;
+            Stats::Formula localTLBMissRate;
+
+            // Global stats: as seen from the
+            // CU's perspective, taking into account
+            // all coalesced requests.
+            Stats::Scalar globalNumTLBAccesses;
+            Stats::Scalar globalNumTLBHits;
+            Stats::Scalar globalNumTLBMisses;
+            Stats::Formula globalTLBMissRate;
+
+            // cycles accessing this TLB level, CU perspective (global)
+            Stats::Scalar accessCycles;
+            // cycles accessing the page table, CU perspective (global)
+            Stats::Scalar pageTableCycles;
+            Stats::Scalar numUniquePages;
+            // cycles spent in queue, from the perspective of this TLB
+            Stats::Scalar localCycles;
+            // localCycles / localNumTLBAccesses (this TLB's perspective)
+            Stats::Formula localLatency;
+            // Average, over all pages, of each page's
+            // average reuse distance (in ticks).
+            Stats::Scalar avgReuseDistance;
+        } stats;
    };
 }

--- a/src/gpu-compute/lds_state.cc
+++ b/src/gpu-compute/lds_state.cc
@@ -189,10 +189,10 @@ LdsState::processPacket(PacketPtr packet)
    // the number of conflicts this packet will have when accessing the LDS
    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
    // count the total number of physical LDS bank accessed
-    parent->ldsBankAccesses += bankAccesses;
+    parent->stats.ldsBankAccesses += bankAccesses;
    // count the LDS bank conflicts. A number set to 1 indicates one
    // access per bank maximum so there are no bank conflicts
-    parent->ldsBankConflictDist.sample(bankConflicts-1);
+    parent->stats.ldsBankConflictDist.sample(bankConflicts-1);

    GPUDynInstPtr dynInst = getDynInstr(packet);
    // account for the LDS bank conflict overhead
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -43,7 +43,7 @@

 LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
-      lmQueueSize(p.local_mem_queue_size)
+      lmQueueSize(p.local_mem_queue_size), stats(&cu)
 {
 }

@@ -124,12 +124,11 @@ LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
    lmIssuedRequests.push(gpuDynInst);
 }

-void
-LocalMemPipeline::regStats()
+// Constructs the stats group named "LocalMemPipeline" under parent.
+LocalMemPipeline::
+LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent)
+    : Stats::Group(parent, "LocalMemPipeline"),
+      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data "
+               "are delayed before updating the VRF")
 {
-    loadVrfBankConflictCycles
-        .name(name() + ".load_vrf_bank_conflict_cycles")
-        .desc("total number of cycles LDS data are delayed before updating "
-              "the VRF")
-        ;
 }
--- a/src/gpu-compute/local_memory_pipeline.hh
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -37,9 +37,10 @@
 #include <queue>
 #include <string>

+#include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "gpu-compute/misc.hh"
 #include "params/ComputeUnit.hh"
-#include "sim/stats.hh"

 /*
 * @file local_memory_pipeline.hh
@@ -75,19 +76,18 @@ class LocalMemPipeline
    }

    const std::string& name() const { return _name; }
-    void regStats();

    void
    incLoadVRFBankConflictCycles(int num_cycles)
    {
-        loadVrfBankConflictCycles += num_cycles;
+        stats.loadVrfBankConflictCycles += num_cycles; // total delay cycles
    }

  private:
    ComputeUnit &computeUnit;
    const std::string _name;
    int lmQueueSize;
-    Stats::Scalar loadVrfBankConflictCycles;
+
    // Local Memory Request Fifo: all shared memory requests
    // are issued to this FIFO from the memory pipelines
    std::queue<GPUDynInstPtr> lmIssuedRequests;
@@ -95,6 +95,14 @@ class LocalMemPipeline
    // Local Memory Response Fifo: all responses of shared memory
    // requests are sent to this FIFO from LDS
    std::queue<GPUDynInstPtr> lmReturnedRequests;
+
+  protected:
+    struct LocalMemPipelineStats : public Stats::Group
+    {
+        LocalMemPipelineStats(Stats::Group *parent);
+        // total cycles LDS data are delayed before updating the VRF
+        Stats::Scalar loadVrfBankConflictCycles;
+    } stats;
 };

 #endif // __LOCAL_MEMORY_PIPELINE_HH__
--- a/src/gpu-compute/register_file.cc
+++ b/src/gpu-compute/register_file.cc
@@ -49,7 +49,7 @@
 #include "params/RegisterFile.hh"

 RegisterFile::RegisterFile(const RegisterFileParams &p)
-    : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs)
+    : SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs), stats(this)
 {
    fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
    fatal_if(simdId < 0, "Illegal SIMD id for VRF");
@@ -192,26 +192,15 @@ RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
 {
 }

-void
-RegisterFile::regStats()
+RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent)
+    : Stats::Group(parent),  // no group name is passed here
+      ADD_STAT(registerReads,
+              "Total number of DWORDs read from register file"),
+      ADD_STAT(registerWrites,
+              "Total number of DWORDS written to register file"),
+      ADD_STAT(sramReads,
+              "Total number of register file bank SRAM activations for reads"),
+      ADD_STAT(sramWrites,
+              "Total number of register file bank SRAM activations for writes")
 {
-    registerReads
-        .name(name() + ".register_reads")
-        .desc("Total number of DWORDs read from register file")
-        ;
-
-    registerWrites
-        .name(name() + ".register_writes")
-        .desc("Total number of DWORDS written to register file")
-        ;
-
-    sramReads
-        .name(name() + ".sram_reads")
-        .desc("Total number of register file bank SRAM activations for reads")
-        ;
-
-    sramWrites
-        .name(name() + ".sram_writes")
-        .desc("Total number of register file bank SRAM activations for writes")
-        ;
 }
--- a/src/gpu-compute/register_file.hh
+++ b/src/gpu-compute/register_file.hh
@@ -62,7 +62,6 @@ class RegisterFile : public SimObject
    virtual ~RegisterFile();
    virtual void setParent(ComputeUnit *_computeUnit);
    int numRegs() const { return _numRegs; }
-    virtual void regStats() override;

    // State functions

@@ -154,18 +153,23 @@ class RegisterFile : public SimObject

    // numer of registers in this register file
    int _numRegs;
-    // Stats
-    // Total number of register reads, incremented once per DWORD per thread
-    Stats::Scalar registerReads;
-    // Total number of register writes, incremented once per DWORD per thread
-    Stats::Scalar registerWrites;

-    // Number of register file SRAM activations for reads.
-    // The register file may be implemented with multiple SRAMs. This stat
-    // tracks how many times the SRAMs are accessed for reads.
-    Stats::Scalar sramReads;
-    // Number of register file SRAM activations for writes
-    Stats::Scalar sramWrites;
+    struct RegisterFileStats : public Stats::Group
+    {
+        RegisterFileStats(Stats::Group *parent);
+
+        // Total register reads, incremented once per DWORD per thread
+        Stats::Scalar registerReads;
+        // Total register writes, incremented once per DWORD per thread
+        Stats::Scalar registerWrites;
+
+        // Number of register file SRAM activations for reads.
+        // The register file may be implemented with multiple SRAMs. This stat
+        // tracks how many times the SRAMs are accessed for reads.
+        Stats::Scalar sramReads;
+        // Number of register file SRAM activations for writes.
+        Stats::Scalar sramWrites;
+    } stats;
 };

 #endif // __REGISTER_FILE_HH__
--- a/src/gpu-compute/register_manager.cc
+++ b/src/gpu-compute/register_manager.cc
@@ -129,9 +129,3 @@ RegisterManager::freeRegisters(Wavefront* w)
 {
    policy->freeRegisters(w);
 }
-
-void
-RegisterManager::regStats()
-{
-    policy->regStats();
-}
--- a/src/gpu-compute/register_manager.hh
+++ b/src/gpu-compute/register_manager.hh
@@ -63,9 +63,6 @@ class RegisterManager : public SimObject
    void setParent(ComputeUnit *cu);
    void exec();

-    // Stats related variables and methods
-    void regStats();
-
    // lookup virtual to physical register translation
    int mapVgpr(Wavefront* w, int vgprIndex);
    int mapSgpr(Wavefront* w, int sgprIndex);
--- a/src/gpu-compute/register_manager_policy.hh
+++ b/src/gpu-compute/register_manager_policy.hh
@@ -76,9 +76,6 @@ class RegisterManagerPolicy
    // free all remaining registers held by specified WF
    virtual void freeRegisters(Wavefront *w) = 0;

-    // stats
-    virtual void regStats() = 0;
-
  protected:
    ComputeUnit *cu;
 };
--- a/src/gpu-compute/scalar_memory_pipeline.cc
+++ b/src/gpu-compute/scalar_memory_pipeline.cc
@@ -142,8 +142,3 @@ ScalarMemPipeline::exec()
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
 }
-
-void
-ScalarMemPipeline::regStats()
-{
-}
--- a/src/gpu-compute/scalar_memory_pipeline.hh
+++ b/src/gpu-compute/scalar_memory_pipeline.hh
@@ -85,7 +85,6 @@ class ScalarMemPipeline
    }

    const std::string& name() const { return _name; }
-    void regStats();

  private:
    ComputeUnit &computeUnit;
--- a/src/gpu-compute/scalar_register_file.cc
+++ b/src/gpu-compute/scalar_register_file.cc
@@ -66,11 +66,11 @@ ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const

                if (regBusy(pSgpr)) {
                    if (ii->isDstOperand(i)) {
-                        w->numTimesBlockedDueWAXDependencies++;
+                        w->stats.numTimesBlockedDueWAXDependencies++;
                    } else if (ii->isSrcOperand(i)) {
                        DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                                w->wfDynId, ii->disassemble(), pSgpr);
-                        w->numTimesBlockedDueRAWDependencies++;
+                        w->stats.numTimesBlockedDueRAWDependencies++;
                    }
                    return false;
                }
@@ -109,7 +109,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
            int DWORDs = ii->getOperandSize(i) <= 4 ? 1
                : ii->getOperandSize(i) / 4;
-            registerReads += DWORDs;
+            stats.registerReads += DWORDs;
        }
    }

@@ -128,7 +128,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
                    enqRegFreeEvent(physReg, tickDelay);
                }

-                registerWrites += nRegs;
+                stats.registerWrites += nRegs;
            }
        }
    }
@@ -152,7 +152,7 @@ ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
                enqRegFreeEvent(physReg, computeUnit->clockPeriod());
            }

-            registerWrites += nRegs;
+            stats.registerWrites += nRegs;
        }
    }
 }
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -51,7 +51,7 @@ ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
-      locMemBusRdy(false), locMemIssueRdy(false)
+      locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
 {
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
@@ -121,10 +121,10 @@ ScheduleStage::exec()
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
-            rdyListEmpty[j]++;
+            stats.rdyListEmpty[j]++;
            continue;
        }
-        rdyListNotEmpty[j]++;
+        stats.rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
@@ -133,8 +133,8 @@ ScheduleStage::exec()
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
-            wf->schCycles++;
-            addToSchListStalls[j]++;
+            wf->stats.schCycles++;
+            stats.addToSchListStalls[j]++;
        } else {
            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
                wf->incLGKMInstsIssued();
@@ -160,10 +160,10 @@ ScheduleStage::exec()
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
-            rdyListEmpty[j]++;
+            stats.rdyListEmpty[j]++;
            continue;
        }
-        rdyListNotEmpty[j]++;
+        stats.rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
@@ -172,8 +172,8 @@ ScheduleStage::exec()
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
-            wf->schCycles++;
-            addToSchListStalls[j]++;
+            wf->stats.schCycles++;
+            stats.addToSchListStalls[j]++;
        }
    }

@@ -241,17 +241,17 @@ ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
        return true;
    } else {
-        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
-            rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
+            stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
-            rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
+            stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
-        wf->schStalls++;
-        wf->schRfAccessStalls++;
+        wf->stats.schStalls++;
+        wf->stats.schRfAccessStalls++;
    }
    return false;
 }
@@ -329,19 +329,19 @@ ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
        return true;
    } else {
        // Number of stall cycles due to RF access denied
-        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
-            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
+            stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
-            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
+            stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
-        wf->schStalls++;
-        wf->schRfAccessStalls++;
+        wf->stats.schStalls++;
+        wf->stats.schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
@@ -424,26 +424,26 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
-            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
-            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // EndPgm instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
-            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
-            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
-            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
@@ -451,19 +451,19 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
+            stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
+            stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
-            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
+            stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
-            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
+            stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
@@ -473,18 +473,18 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
+            stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
+            stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe
            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
            + wf->scalarWrGmReqsInPipe))
        {
            rdy = false;
-            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
+            stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
@@ -494,16 +494,16 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
+            stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
+            stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
-            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
+            stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
@@ -513,24 +513,24 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
+            stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
-            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
+            stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
-            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
+            stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
-            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
+            stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
-            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
+            stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
@@ -540,7 +540,7 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
              gpu_dyn_inst->disassemble());
        return false;
    }
-    dispNrdyStalls[SCH_RDY]++;
+    stats.dispNrdyStalls[SCH_RDY]++;
    return true;
 }

@@ -584,10 +584,10 @@ ScheduleStage::fillDispatchList()
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
-                    schIter->first->wavefront()->schStalls++;
+                    schIter->first->wavefront()->stats.schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
-                        schIter->first->wavefront()->schResourceStalls++;
+                        schIter->first->wavefront()->stats.schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
@@ -601,9 +601,9 @@ ScheduleStage::fillDispatchList()
        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
-            schListToDispListStalls[j]++;
+            stats.schListToDispListStalls[j]++;
        } else {
-            schListToDispList[j]++;
+            stats.schListToDispList[j]++;
        }
    }
 }
@@ -635,9 +635,9 @@ ScheduleStage::arbitrateVrfToLdsBus()
                reinsertToSchList(wf->localMem, toExecute
                                  .readyInst(wf->localMem));
                // Increment stall stats for LDS-VRF arbitration
-                ldsBusArbStalls++;
+                stats.ldsBusArbStalls++;
                toExecute.readyInst(wf->localMem)
-                    ->wavefront()->schLdsArbStalls++;
+                    ->wavefront()->stats.schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
@@ -663,7 +663,7 @@ ScheduleStage::checkRfOperandReadComplete()

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
-            wf->schCycles++;
+            wf->stats.schCycles++;

            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
@@ -690,15 +690,15 @@ ScheduleStage::checkRfOperandReadComplete()
                p.second = RFBUSY;

                // Increment stall stats
-                wf->schStalls++;
-                wf->schOpdNrdyStalls++;
+                wf->stats.schStalls++;
+                wf->stats.schOpdNrdyStalls++;

-                opdNrdyStalls[SCH_RF_OPD_NRDY]++;
+                stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
-                    opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
+                    stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
-                    opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
+                    stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
@@ -777,60 +777,40 @@ ScheduleStage::deleteFromSch(Wavefront *w)
    wavesInSch.erase(w->wfDynId);
 }

-void
-ScheduleStage::regStats()
+ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent,
+                                                      int num_exec_units)
+    : Stats::Group(parent, "ScheduleStage"),
+      ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
+               "execution resource"),
+      ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
+               "list per execution resource"),
+      ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
+               "schList per execution resource when ready list is not empty"),
+      ADD_STAT(schListToDispList, "number of cycles a wave is added to "
+               "dispatchList per execution resource"),
+      ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
+               " dispatchList per execution resource"),
+      ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
+      ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
+               "conflicts"),
+      ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
+               "ready"),
+      ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
+               "ready")
 {
-    rdyListNotEmpty
-        .init(computeUnit.numExeUnits())
-        .name(name() + ".rdy_list_not_empty")
-        .desc("number of cycles one or more wave on ready list per "
-              "execution resource")
-        ;
+    rdyListNotEmpty.init(num_exec_units);
+    rdyListEmpty.init(num_exec_units);
+    addToSchListStalls.init(num_exec_units);
+    schListToDispList.init(num_exec_units);
+    schListToDispListStalls.init(num_exec_units);
+    opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
+    dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
+    rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);

-    rdyListEmpty
-        .init(computeUnit.numExeUnits())
-        .name(name() + ".rdy_list_empty")
-        .desc("number of cycles no wave on ready list per "
-              "execution resource")
-        ;
-
-    addToSchListStalls
-        .init(computeUnit.numExeUnits())
-        .name(name() + ".sch_list_add_stalls")
-        .desc("number of cycles a wave is not added to schList per "
-              "execution resource when ready list is not empty")
-        ;
-
-    schListToDispList
-        .init(computeUnit.numExeUnits())
-        .name(name() + ".sch_list_to_disp_list")
-        .desc("number of cycles a wave is added to dispatchList per "
-              "execution resource")
-        ;
-
-    schListToDispListStalls
-        .init(computeUnit.numExeUnits())
-        .name(name() + ".sch_list_to_disp_list_stalls")
-        .desc("number of cycles no wave is added to dispatchList per "
-              "execution resource")
-        ;
-
-    // Operand Readiness Stall Cycles
-    opdNrdyStalls
-        .init(SCH_RF_OPD_NRDY_CONDITIONS)
-        .name(name() + ".opd_nrdy_stalls")
-        .desc("number of stalls in SCH due to operands not ready")
-        ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

-    // dispatchReady Stall Cycles
-    dispNrdyStalls
-        .init(SCH_NRDY_CONDITIONS)
-        .name(name() + ".disp_nrdy_stalls")
-        .desc("number of stalls in SCH due to resource not ready")
-        ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
@@ -862,21 +842,9 @@ ScheduleStage::regStats()
                                  csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

-    // RF Access Stall Cycles
-    rfAccessStalls
-        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
-        .name(name() + ".rf_access_stalls")
-        .desc("number of stalls due to RF access denied")
-        ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
-
-    // Stall cycles due to wave losing LDS bus arbitration
-    ldsBusArbStalls
-        .name(name() + ".lds_bus_arb_stalls")
-        .desc("number of stalls due to VRF->LDS bus conflicts")
-        ;
 }
--- a/src/gpu-compute/schedule_stage.hh
+++ b/src/gpu-compute/schedule_stage.hh
@@ -40,6 +40,8 @@
 #include <utility>
 #include <vector>

+#include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "gpu-compute/exec_stage.hh"
 #include "gpu-compute/misc.hh"
 #include "gpu-compute/scheduler.hh"
@@ -105,8 +107,6 @@ class ScheduleStage
        SCH_RF_ACCESS_NRDY_CONDITIONS
    };

-    void regStats();
-
    // Called by ExecStage to inform SCH of instruction execution
    void deleteFromSch(Wavefront *w);

@@ -126,48 +126,6 @@ class ScheduleStage
    // scheduler and a dispatch list
    std::vector<Scheduler> scheduler;

-    // Stats
-
-    // Number of cycles with empty (or not empty) readyList, per execution
-    // resource, when the CU is active (not sleeping)
-    Stats::Vector rdyListEmpty;
-    Stats::Vector rdyListNotEmpty;
-
-    // Number of cycles, per execution resource, when at least one wave
-    // was on the readyList and picked by scheduler, but was unable to be
-    // added to the schList, when the CU is active (not sleeping)
-    Stats::Vector addToSchListStalls;
-
-    // Number of cycles, per execution resource, when a wave is selected
-    // as candidate for dispatchList from schList
-    // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
-    Stats::Vector schListToDispList;
-
-    // Per execution resource stat, incremented once per cycle if no wave
-    // was selected as candidate for dispatch and moved to dispatchList
-    Stats::Vector schListToDispListStalls;
-
-    // Number of times a wave is selected by the scheduler but cannot
-    // be added to the schList due to register files not being able to
-    // support reads or writes of operands. RF_ACCESS_NRDY condition is always
-    // incremented if at least one read/write not supported, other
-    // conditions are incremented independently from each other.
-    Stats::Vector rfAccessStalls;
-
-    // Number of times a wave is executing FLAT instruction and
-    // forces another wave occupying its required local memory resource
-    // to be deselected for execution, and placed back on schList
-    Stats::Scalar ldsBusArbStalls;
-
-    // Count of times VRF and/or SRF blocks waves on schList from
-    // performing RFBUSY->RFREADY transition
-    Stats::Vector opdNrdyStalls;
-
-    // Count of times resource required for dispatch is not ready and
-    // blocks wave in RFREADY state on schList from potentially moving
-    // to dispatchList
-    Stats::Vector dispNrdyStalls;
-
    const std::string _name;

    // called by exec() to add a wave to schList if the RFs can support it
@@ -221,6 +179,52 @@ class ScheduleStage
    // the VRF/SRF availability or limits imposed by paremeters (to be added)
    // of the SCH stage or CU.
    std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
+
+  protected:
+    struct ScheduleStageStats : public Stats::Group
+    {
+        ScheduleStageStats(Stats::Group *parent, int num_exec_units);
+
+        // Number of cycles with empty (or not empty) readyList, per execution
+        // resource, when the CU is active (not sleeping)
+        Stats::Vector rdyListEmpty;
+        Stats::Vector rdyListNotEmpty;
+
+        // Number of cycles, per execution resource, when at least one wave
+        // was on the readyList and picked by scheduler, but was unable to be
+        // added to the schList, when the CU is active (not sleeping)
+        Stats::Vector addToSchListStalls;
+
+        // Number of cycles, per execution resource, when a wave is selected
+        // as candidate for dispatchList from schList
+        // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
+        Stats::Vector schListToDispList;
+
+        // Per execution resource stat, incremented once per cycle if no wave
+        // was selected as candidate for dispatch and moved to dispatchList
+        Stats::Vector schListToDispListStalls;
+
+        // Number of times a wave is selected by the scheduler but cannot
+        // be added to the schList due to register files not being able to
+        // support reads or writes of operands. RF_ACCESS_NRDY condition is
+        // always incremented if at least one read/write not supported, other
+        // conditions are incremented independently from each other.
+        Stats::Vector rfAccessStalls;
+
+        // Number of times a wave is executing FLAT instruction and
+        // forces another wave occupying its required local memory resource
+        // to be deselected for execution, and placed back on schList
+        Stats::Scalar ldsBusArbStalls;
+
+        // Count of times VRF and/or SRF blocks waves on schList from
+        // performing RFBUSY->RFREADY transition
+        Stats::Vector opdNrdyStalls;
+
+        // Count of times resource required for dispatch is not ready and
+        // blocks wave in RFREADY state on schList from potentially moving
+        // to dispatchList
+        Stats::Vector dispNrdyStalls;
+    } stats;
 };

 #endif // __SCHEDULE_STAGE_HH__
--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -49,7 +49,7 @@ ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams &p,
                                           ScoreboardCheckToSchedule
                                           &to_schedule)
    : computeUnit(cu), toSchedule(to_schedule),
-      _name(cu.name() + ".ScoreboardCheckStage")
+      _name(cu.name() + ".ScoreboardCheckStage"), stats(&cu)
 {
 }

@@ -62,7 +62,7 @@ ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
 {
    panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
             "Instruction ready status %d is illegal!!!", rdyStatus);
-    stallCycles[rdyStatus]++;
+    stats.stallCycles[rdyStatus]++;
 }

 // Return true if this wavefront is ready
@@ -266,14 +266,13 @@ ScoreboardCheckStage::exec()
    }
 }

-void
-ScoreboardCheckStage::regStats()
+ScoreboardCheckStage::
+ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent)
+    : Stats::Group(parent, "ScoreboardCheckStage"),
+      ADD_STAT(stallCycles, "number of cycles wave stalled in SCB")
 {
-    stallCycles
-        .init(NRDY_CONDITIONS)
-        .name(name() + ".stall_cycles")
-        .desc("number of cycles wave stalled in SCB")
-        ;
+    stallCycles.init(NRDY_CONDITIONS);
+
    stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
    stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
    stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
--- a/src/gpu-compute/scoreboard_check_stage.hh
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -40,7 +40,8 @@
 #include <utility>
 #include <vector>

-#include "sim/stats.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"

 class ComputeUnit;
 class ScoreboardCheckToSchedule;
@@ -78,7 +79,6 @@ class ScoreboardCheckStage

    // Stats related variables and methods
    const std::string& name() const { return _name; }
-    void regStats();

  private:
    void collectStatistics(nonrdytype_e rdyStatus);
@@ -94,10 +94,15 @@ class ScoreboardCheckStage
     */
    ScoreboardCheckToSchedule &toSchedule;

-    // Stats
-    Stats::Vector stallCycles;
-
    const std::string _name;
+
+  protected:
+    struct ScoreboardCheckStageStats : public Stats::Group
+    {
+        ScoreboardCheckStageStats(Stats::Group *parent);
+
+        Stats::Vector stallCycles;
+    } stats;
 };

 #endif // __SCOREBOARD_CHECK_STAGE_HH__
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -65,7 +65,8 @@ Shader::Shader(const Params &p) : ClockedObject(p),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher),
-    max_valu_insts(p.max_valu_insts), total_valu_insts(0)
+    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
+    stats(this, p.CUs[0]->wfSize())
 {
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);
@@ -278,86 +279,6 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
    return scheduledSomething;
 }

-void
-Shader::regStats()
-{
-    ClockedObject::regStats();
-
-    shaderActiveTicks
-        .name(name() + ".shader_active_ticks")
-        .desc("Total ticks that any CU attached to this shader is active")
-        ;
-    allLatencyDist
-        .init(0, 1600000, 10000)
-        .name(name() + ".allLatencyDist")
-        .desc("delay distribution for all")
-        .flags(Stats::pdf | Stats::oneline);
-
-    loadLatencyDist
-        .init(0, 1600000, 10000)
-        .name(name() + ".loadLatencyDist")
-        .desc("delay distribution for loads")
-        .flags(Stats::pdf | Stats::oneline);
-
-    storeLatencyDist
-        .init(0, 1600000, 10000)
-        .name(name() + ".storeLatencyDist")
-        .desc("delay distribution for stores")
-        .flags(Stats::pdf | Stats::oneline);
-
-    vectorInstSrcOperand
-        .init(4)
-        .name(name() + ".vec_inst_src_operand")
-        .desc("vector instruction source operand distribution");
-
-    vectorInstDstOperand
-        .init(4)
-        .name(name() + ".vec_inst_dst_operand")
-        .desc("vector instruction destination operand distribution");
-
-    initToCoalesceLatency
-        .init(0, 1600000, 10000)
-        .name(name() + ".initToCoalesceLatency")
-        .desc("Ticks from vmem inst initiateAcc to coalescer issue")
-        .flags(Stats::pdf | Stats::oneline);
-
-    rubyNetworkLatency
-        .init(0, 1600000, 10000)
-        .name(name() + ".rubyNetworkLatency")
-        .desc("Ticks from coalescer issue to coalescer hit callback")
-        .flags(Stats::pdf | Stats::oneline);
-
-    gmEnqueueLatency
-        .init(0, 1600000, 10000)
-        .name(name() + ".gmEnqueueLatency")
-        .desc("Ticks from coalescer hit callback to GM pipe enqueue")
-        .flags(Stats::pdf | Stats::oneline);
-
-    gmToCompleteLatency
-        .init(0, 1600000, 10000)
-        .name(name() + ".gmToCompleteLatency")
-        .desc("Ticks queued in GM pipes ordered response buffer")
-        .flags(Stats::pdf | Stats::oneline);
-
-    coalsrLineAddresses
-        .init(0, 20, 1)
-        .name(name() + ".coalsrLineAddresses")
-        .desc("Number of cache lines for coalesced request")
-        .flags(Stats::pdf | Stats::oneline);
-
-    int wfSize = cuList[0]->wfSize();
-    cacheBlockRoundTrip = new Stats::Distribution[wfSize];
-    for (int idx = 0; idx < wfSize; ++idx) {
-        std::stringstream namestr;
-        ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
-        cacheBlockRoundTrip[idx]
-            .init(0, 1600000, 10000)
-            .name(namestr.str())
-            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
-            .flags(Stats::pdf | Stats::oneline);
-    }
-}
-
 void
 Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
@@ -528,8 +449,8 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
 void
 Shader::sampleStore(const Tick accessTime)
 {
-    storeLatencyDist.sample(accessTime);
-    allLatencyDist.sample(accessTime);
+    stats.storeLatencyDist.sample(accessTime);
+    stats.allLatencyDist.sample(accessTime);
 }

 /*
@@ -538,8 +459,8 @@ Shader::sampleStore(const Tick accessTime)
 void
 Shader::sampleLoad(const Tick accessTime)
 {
-    loadLatencyDist.sample(accessTime);
-    allLatencyDist.sample(accessTime);
+    stats.loadLatencyDist.sample(accessTime);
+    stats.allLatencyDist.sample(accessTime);
 }

 void
@@ -556,16 +477,16 @@ Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

-    initToCoalesceLatency.sample(t2-t1);
-    rubyNetworkLatency.sample(t3-t2);
-    gmEnqueueLatency.sample(t4-t3);
-    gmToCompleteLatency.sample(t5-t4);
+    stats.initToCoalesceLatency.sample(t2-t1);
+    stats.rubyNetworkLatency.sample(t3-t2);
+    stats.gmEnqueueLatency.sample(t4-t3);
+    stats.gmToCompleteLatency.sample(t5-t4);
 }

 void
 Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
 {
-    coalsrLineAddresses.sample(lineMap.size());
+    stats.coalsrLineAddresses.sample(lineMap.size());
    std::vector<Tick> netTimes;

    // For each cache block address generated by a vmem inst, calculate
@@ -586,7 +507,7 @@ Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
    // Nth distribution.
    int idx = 0;
    for (auto& time : netTimes) {
-        cacheBlockRoundTrip[idx].sample(time);
+        stats.cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
 }
@@ -598,5 +519,75 @@ Shader::notifyCuSleep() {
             "Invalid activeCu size\n");
    _activeCus--;
    if (!_activeCus)
-        shaderActiveTicks += curTick() - _lastInactiveTick;
+        stats.shaderActiveTicks += curTick() - _lastInactiveTick;
+}
+
+Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
+    : Stats::Group(parent),
+      ADD_STAT(allLatencyDist, "delay distribution for all"),
+      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
+      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
+      ADD_STAT(initToCoalesceLatency,
+               "Ticks from vmem inst initiateAcc to coalescer issue"),
+      ADD_STAT(rubyNetworkLatency,
+               "Ticks from coalescer issue to coalescer hit callback"),
+      ADD_STAT(gmEnqueueLatency,
+               "Ticks from coalescer hit callback to GM pipe enqueue"),
+      ADD_STAT(gmToCompleteLatency,
+               "Ticks queued in GM pipes ordered response buffer"),
+      ADD_STAT(coalsrLineAddresses,
+               "Number of cache lines for coalesced request"),
+      ADD_STAT(shaderActiveTicks,
+               "Total ticks that any CU attached to this shader is active"),
+      ADD_STAT(vectorInstSrcOperand,
+               "vector instruction source operand distribution"),
+      ADD_STAT(vectorInstDstOperand,
+               "vector instruction destination operand distribution")
+{
+    allLatencyDist
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    loadLatencyDist
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    storeLatencyDist
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    initToCoalesceLatency
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    rubyNetworkLatency
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    gmEnqueueLatency
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    gmToCompleteLatency
+        .init(0, 1600000, 10000)
+        .flags(Stats::pdf | Stats::oneline);
+
+    coalsrLineAddresses
+        .init(0, 20, 1)
+        .flags(Stats::pdf | Stats::oneline);
+
+    vectorInstSrcOperand.init(4);  // indexed by vector operand count
+    vectorInstDstOperand.init(4);  // NOTE(review): confirm count < 4
+
+    cacheBlockRoundTrip = new Stats::Distribution[wf_size];
+    for (int idx = 0; idx < wf_size; ++idx) {
+        std::stringstream namestr;
+        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
+                 static_cast<Shader*>(parent)->name(), idx);
+        cacheBlockRoundTrip[idx]
+            .init(0, 1600000, 10000)
+            .name(namestr.str())
+            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
+            .flags(Stats::pdf | Stats::oneline);
+    }
 }
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -40,6 +40,8 @@
 #include <string>

 #include "arch/isa.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "base/types.hh"
 #include "cpu/simple/atomic.hh"
 #include "cpu/simple/timing.hh"
@@ -98,26 +100,6 @@ class Shader : public ClockedObject
    // Last tick that all CUs attached to this shader were inactive
    Tick _lastInactiveTick;

-    // some stats for measuring latency
-    Stats::Distribution allLatencyDist;
-    Stats::Distribution loadLatencyDist;
-    Stats::Distribution storeLatencyDist;
-
-    // average ticks from vmem inst initiateAcc to coalescer issue,
-    // average ticks from coalescer issue to coalescer hit callback,
-    // average ticks from coalescer hit callback to GM pipe enqueue,
-    // and average ticks spent in GM pipe's ordered resp buffer.
-    Stats::Distribution initToCoalesceLatency;
-    Stats::Distribution rubyNetworkLatency;
-    Stats::Distribution gmEnqueueLatency;
-    Stats::Distribution gmToCompleteLatency;
-
-    // average number of cache blocks requested by vmem inst, and
-    // average ticks for cache blocks to main memory for the Nth
-    // cache block generated by a vmem inst.
-    Stats::Distribution coalsrLineAddresses;
-    Stats::Distribution *cacheBlockRoundTrip;
-
  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -249,14 +231,6 @@ class Shader : public ClockedObject
    GPUCommandProcessor &gpuCmdProc;
    GPUDispatcher &_dispatcher;

-    /**
-     * Statistics
-     */
-    Stats::Scalar shaderActiveTicks;
-    Stats::Vector vectorInstSrcOperand;
-    Stats::Vector vectorInstDstOperand;
-    void regStats();
-
    int64_t max_valu_insts;
    int64_t total_valu_insts;

@@ -301,6 +275,52 @@ class Shader : public ClockedObject
    void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
    void updateContext(int cid);
    void notifyCuSleep();
+
+    void
+    incVectorInstSrcOperand(int num_operands)
+    {
+        stats.vectorInstSrcOperand[num_operands]++;
+    }
+
+    void
+    incVectorInstDstOperand(int num_operands)
+    {
+        stats.vectorInstDstOperand[num_operands]++;
+    }
+
+  protected:
+    struct ShaderStats : public Stats::Group
+    {
+        ShaderStats(Stats::Group *parent, int wf_size);
+
+        // latency distributions: all sampled accesses, loads, stores
+        Stats::Distribution allLatencyDist;
+        Stats::Distribution loadLatencyDist;
+        Stats::Distribution storeLatencyDist;
+
+        // ticks from vmem inst initiateAcc to coalescer issue
+        Stats::Distribution initToCoalesceLatency;
+
+        // ticks from coalescer issue to coalescer hit callback
+        Stats::Distribution rubyNetworkLatency;
+
+        // ticks from coalescer hit callback to GM pipe enqueue
+        Stats::Distribution gmEnqueueLatency;
+
+        // ticks spent queued in the GM pipe's ordered response buffer
+        Stats::Distribution gmToCompleteLatency;
+
+        // number of cache lines requested by a coalesced vmem inst
+        Stats::Distribution coalsrLineAddresses;
+
+        // coalescer-to-coalescer time for the Nth cache block generated
+        // by a vmem inst; array of wf_size distributions, one per N.
+        Stats::Distribution *cacheBlockRoundTrip;
+
+        Stats::Scalar shaderActiveTicks;
+        Stats::Vector vectorInstSrcOperand;
+        Stats::Vector vectorInstDstOperand;
+    } stats;
 };

 #endif // __SHADER_HH__
--- a/src/gpu-compute/static_register_manager_policy.cc
+++ b/src/gpu-compute/static_register_manager_policy.cc
@@ -180,8 +180,3 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
    w->reservedScalarRegs = 0;
    w->startSgprIndex = 0;
 }
-
-void
-StaticRegisterManagerPolicy::regStats()
-{
-}
--- a/src/gpu-compute/static_register_manager_policy.hh
+++ b/src/gpu-compute/static_register_manager_policy.hh
@@ -58,8 +58,6 @@ class StaticRegisterManagerPolicy : public RegisterManagerPolicy
        int scalarDemand) override;

    void freeRegisters(Wavefront *w) override;
-
-    void regStats() override;
 };

 #endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
--- a/src/gpu-compute/tlb_coalescer.cc
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -50,7 +50,8 @@ TLBCoalescer::TLBCoalescer(const Params &p)
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
-                   false, Event::Maximum_Pri)
+                   false, Event::Maximum_Pri),
+      stats(this)
 {
    // create the response ports based on the number of connected ports
    for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
@@ -256,11 +257,11 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
-        coalescer->uncoalescedAccesses++;
+        coalescer->stats.uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
-        coalescer->queuingCycles -= (curTick() * req_cnt);
-        coalescer->localqueuingCycles -= curTick();
+        coalescer->stats.queuingCycles -= (curTick() * req_cnt);
+        coalescer->stats.localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
@@ -302,7 +303,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
    // and make necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
-            coalescer->coalescedAccesses++;
+            coalescer->stats.coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
@@ -339,7 +340,7 @@ TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
    bool update_stats = !sender_state->prefetch;

    if (update_stats)
-        coalescer->uncoalescedAccesses++;
+        coalescer->stats.uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address
    // print a warning message. This is a temporary caveat of
@@ -467,7 +468,7 @@ TLBCoalescer::processProbeTLBEvent()
                    // by the one we just sent counting all the way from
                    // the top of TLB hiearchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
-                    queuingCycles += (curTick() * req_cnt);
+                    stats.queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);
@@ -475,7 +476,7 @@ TLBCoalescer::processProbeTLBEvent()
                    // pkt_cnt is number of packets we coalesced into the one
                    // we just sent but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
-                    localqueuingCycles += (curTick() * pkt_cnt);
+                    stats.localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
@@ -520,35 +521,14 @@ TLBCoalescer::processCleanupEvent()
    }
 }

-void
-TLBCoalescer::regStats()
+TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
+    : Stats::Group(parent),
+      ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
+      ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
+      ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
+      ADD_STAT(localqueuingCycles,
+               "Number of cycles spent in queue for all incoming reqs"),
+      ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
 {
-    ClockedObject::regStats();
-
-    uncoalescedAccesses
-        .name(name() + ".uncoalesced_accesses")
-        .desc("Number of uncoalesced TLB accesses")
-        ;
-
-    coalescedAccesses
-        .name(name() + ".coalesced_accesses")
-        .desc("Number of coalesced TLB accesses")
-        ;
-
-    queuingCycles
-        .name(name() + ".queuing_cycles")
-        .desc("Number of cycles spent in queue")
-        ;
-
-    localqueuingCycles
-        .name(name() + ".local_queuing_cycles")
-        .desc("Number of cycles spent in queue for all incoming reqs")
-        ;
-
-    localLatency
-        .name(name() + ".local_latency")
-        .desc("Avg. latency over all incoming pkts")
-        ;
-
    localLatency = localqueuingCycles / uncoalescedAccesses;
 }
--- a/src/gpu-compute/tlb_coalescer.hh
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -115,26 +115,8 @@ class TLBCoalescer : public ClockedObject

    CoalescingTable issuedTranslationsTable;

-    // number of packets the coalescer receives
-    Stats::Scalar uncoalescedAccesses;
-    // number packets the coalescer send to the TLB
-    Stats::Scalar coalescedAccesses;
-
-    // Number of cycles the coalesced requests spend waiting in
-    // coalescerFIFO. For each packet the coalescer receives we take into
-    // account the number of all uncoalesced requests this pkt "represents"
-    Stats::Scalar queuingCycles;
-
-    // On average how much time a request from the
-    // uncoalescedAccesses that reaches the TLB
-    // spends waiting?
-    Stats::Scalar localqueuingCycles;
-    // localqueuingCycles/uncoalescedAccesses
-    Stats::Formula localLatency;
-
    bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
    void updatePhysAddresses(PacketPtr pkt);
-    void regStats() override;

    class CpuSidePort : public ResponsePort
    {
@@ -211,6 +193,29 @@ class TLBCoalescer : public ClockedObject
    // this FIFO queue keeps track of the virt. page
    // addresses that are pending cleanup
    std::queue<Addr> cleanupQueue;
+
+  protected:
+    struct TLBCoalescerStats : public Stats::Group
+    {
+        TLBCoalescerStats(Stats::Group *parent);
+
+        // number of packets the coalescer receives
+        Stats::Scalar uncoalescedAccesses;
+        // number of packets the coalescer sends to the TLB
+        Stats::Scalar coalescedAccesses;
+
+        // Number of cycles the coalesced requests spend waiting in
+        // coalescerFIFO. For each packet the coalescer receives we take into
+        // account the number of all uncoalesced requests this pkt "represents"
+        Stats::Scalar queuingCycles;
+
+        // Total time the received (uncoalesced) requests spend
+        // waiting at this coalescer level before being sent on
+        // to the TLB.
+        Stats::Scalar localqueuingCycles;
+        // localqueuingCycles / uncoalescedAccesses
+        Stats::Formula localLatency;
+    } stats;
 };

 #endif // __TLB_COALESCER_HH__
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -69,11 +69,11 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
                    ->mapVgpr(w, vgprIdx + j);
                if (regBusy(pVgpr)) {
                    if (ii->isDstOperand(i)) {
-                        w->numTimesBlockedDueWAXDependencies++;
+                        w->stats.numTimesBlockedDueWAXDependencies++;
                    } else if (ii->isSrcOperand(i)) {
                        DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                                w->wfDynId, ii->disassemble(), pVgpr);
-                        w->numTimesBlockedDueRAWDependencies++;
+                        w->stats.numTimesBlockedDueRAWDependencies++;
                    }
                    return false;
                }
@@ -125,13 +125,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
 {
    // increment count of number of DWORDs read from VRF
    int DWORDs = ii->numSrcVecDWORDs();
-    registerReads += (DWORDs * w->execMask().count());
+    stats.registerReads += (DWORDs * w->execMask().count());

    uint64_t mask = w->execMask().to_ullong();
    int srams = w->execMask().size() / 4;
    for (int i = 0; i < srams; i++) {
        if (mask & 0xF) {
-            sramReads += DWORDs;
+            stats.sramReads += DWORDs;
        }
        mask = mask >> 4;
    }
@@ -163,13 +163,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)

        // increment count of number of DWORDs written to VRF
        DWORDs = ii->numDstVecDWORDs();
-        registerWrites += (DWORDs * w->execMask().count());
+        stats.registerWrites += (DWORDs * w->execMask().count());

        mask = w->execMask().to_ullong();
        srams = w->execMask().size() / 4;
        for (int i = 0; i < srams; i++) {
            if (mask & 0xF) {
-                sramWrites += DWORDs;
+                stats.sramWrites += DWORDs;
            }
            mask = mask >> 4;
        }
@@ -196,13 +196,13 @@ VectorRegisterFile::scheduleWriteOperandsFromLoad(
    }
    // increment count of number of DWORDs written to VRF
    int DWORDs = ii->numDstVecDWORDs();
-    registerWrites += (DWORDs * ii->exec_mask.count());
+    stats.registerWrites += (DWORDs * ii->exec_mask.count());

    uint64_t mask = ii->exec_mask.to_ullong();
    int srams = ii->exec_mask.size() / 4;
    for (int i = 0; i < srams; i++) {
        if (mask & 0xF) {
-            sramWrites += DWORDs;
+            stats.sramWrites += DWORDs;
        }
        mask = mask >> 4;
    }
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -49,7 +49,7 @@ Wavefront::Wavefront(const Params &p)
    maxIbSize(p.max_ib_size), _gpuISA(*this),
    vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
    vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
-    barId(WFBarrier::InvalidID)
+    barId(WFBarrier::InvalidID), stats(this)
 {
    lastTrace = 0;
    execUnitId = -1;
@@ -97,75 +97,6 @@ Wavefront::Wavefront(const Params &p)
    vecReads.clear();
 }

-void
-Wavefront::regStats()
-{
-    SimObject::regStats();
-
-    // FIXME: the name of the WF needs to be unique
-    numTimesBlockedDueWAXDependencies
-        .name(name() + ".timesBlockedDueWAXDependencies")
-        .desc("number of times the wf's instructions are blocked due to WAW "
-              "or WAR dependencies")
-        ;
-
-    // FIXME: the name of the WF needs to be unique
-    numTimesBlockedDueRAWDependencies
-        .name(name() + ".timesBlockedDueRAWDependencies")
-        .desc("number of times the wf's instructions are blocked due to RAW "
-              "dependencies")
-        ;
-
-    numInstrExecuted
-        .name(name() + ".num_instr_executed")
-        .desc("number of instructions executed by this WF slot")
-        ;
-
-    schCycles
-        .name(name() + ".sch_cycles")
-        .desc("number of cycles spent in schedule stage")
-        ;
-
-    schStalls
-        .name(name() + ".sch_stalls")
-        .desc("number of cycles WF is stalled in SCH stage")
-        ;
-
-    schRfAccessStalls
-        .name(name() + ".sch_rf_access_stalls")
-        .desc("number of cycles wave selected in SCH but RF denied adding "
-              "instruction")
-        ;
-
-    schResourceStalls
-        .name(name() + ".sch_resource_stalls")
-        .desc("number of cycles stalled in sch by resource not available")
-        ;
-
-    schOpdNrdyStalls
-        .name(name() + ".sch_opd_nrdy_stalls")
-        .desc("number of cycles stalled in sch waiting for RF reads to "
-              "complete")
-        ;
-
-    schLdsArbStalls
-        .name(name() + ".sch_lds_arb_stalls")
-        .desc("number of cycles wave stalled due to LDS-VRF arbitration")
-        ;
-
-    vecRawDistance
-        .init(0,20,1)
-        .name(name() + ".vec_raw_distance")
-        .desc("Count of RAW distance in dynamic instructions for this WF")
-        ;
-
-    readsPerWrite
-        .init(0,4,1)
-        .name(name() + ".vec_reads_per_write")
-        .desc("Count of Vector reads per write for this WF")
-        ;
-}
-
 void
 Wavefront::init()
 {
@@ -959,17 +890,19 @@ Wavefront::exec()
    }
    computeUnit->srf[simdId]->waveExecuteInst(this, ii);

-    computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
-    computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
-    computeUnit->numInstrExecuted++;
-    numInstrExecuted++;
+    computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands());
+    computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands());
+    computeUnit->stats.numInstrExecuted++;
+    stats.numInstrExecuted++;
    computeUnit->instExecPerSimd[simdId]++;
-    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
-                                     computeUnit->lastExecCycle[simdId]);
-    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+    computeUnit->stats.execRateDist.sample(
+                                    computeUnit->stats.totalCycles.value() -
+                                    computeUnit->lastExecCycle[simdId]);
+    computeUnit->lastExecCycle[simdId] =
+        computeUnit->stats.totalCycles.value();

    if (lastInstExec) {
-        computeUnit->instInterleave[simdId].
+        computeUnit->stats.instInterleave[simdId].
            sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
    }
    lastInstExec = computeUnit->instExecPerSimd[simdId];
@@ -987,8 +920,8 @@ Wavefront::exec()
                if (ii->isSrcOperand(i)) {
                    // This check should never fail, but to be safe we check
                    if (rawDist.find(vgpr+n) != rawDist.end()) {
-                        vecRawDistance.
-                            sample(numInstrExecuted.value() - rawDist[vgpr+n]);
+                        stats.vecRawDistance.sample(
+                            stats.numInstrExecuted.value() - rawDist[vgpr+n]);
                    }
                    // increment number of reads to this register
                    vecReads[vgpr+n]++;
@@ -997,12 +930,12 @@ Wavefront::exec()
                    // for the first write to each physical register
                    if (rawDist.find(vgpr+n) != rawDist.end()) {
                        // sample the number of reads that were performed
-                        readsPerWrite.sample(vecReads[vgpr+n]);
+                        stats.readsPerWrite.sample(vecReads[vgpr+n]);
                    }
                    // on a write, reset count of reads to 0
                    vecReads[vgpr+n] = 0;

-                    rawDist[vgpr+n] = numInstrExecuted.value();
+                    rawDist[vgpr+n] = stats.numInstrExecuted.value();
                }
            }
        }
@@ -1023,26 +956,29 @@ Wavefront::exec()

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
-        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
-        computeUnit->numVecOpsExecuted += num_active_lanes;
+        computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
+        computeUnit->stats.numVecOpsExecuted += num_active_lanes;

        if (ii->isF16() && ii->isALU()) {
            if (ii->isF32() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F16, and (2)"
                       "either F32 or F64.");
            }
-            computeUnit->numVecOpsExecutedF16 += num_active_lanes;
+            computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
            if (ii->isFMA()) {
-                computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
            else if (ii->isMAC()) {
-                computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
            else if (ii->isMAD()) {
-                computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
        }
        if (ii->isF32() && ii->isALU()) {
@@ -1050,18 +986,21 @@ Wavefront::exec()
                fatal("Instruction is tagged as both (1) F32, and (2)"
                       "either F16 or F64.");
            }
-            computeUnit->numVecOpsExecutedF32 += num_active_lanes;
+            computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
            if (ii->isFMA()) {
-                computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
            else if (ii->isMAC()) {
-                computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
            else if (ii->isMAD()) {
-                computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
        }
        if (ii->isF64() && ii->isALU()) {
@@ -1069,24 +1008,29 @@ Wavefront::exec()
                fatal("Instruction is tagged as both (1) F64, and (2)"
                       "either F16 or F32.");
            }
-            computeUnit->numVecOpsExecutedF64 += num_active_lanes;
+            computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
            if (ii->isFMA()) {
-                computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
            else if (ii->isMAC()) {
-                computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
            else if (ii->isMAD()) {
-                computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
-                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
+                computeUnit->stats.numVecOpsExecutedTwoOpFP
+                    += num_active_lanes;
            }
        }
        if (isGmInstruction(ii)) {
-            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+            computeUnit->stats.activeLanesPerGMemInstrDist.sample(
+                                                            num_active_lanes);
        } else if (isLmInstruction(ii)) {
-            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+            computeUnit->stats.activeLanesPerLMemInstrDist.sample(
+                                                            num_active_lanes);
        }
    }

@@ -1133,14 +1077,14 @@ Wavefront::exec()
                computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-            computeUnit->instCyclesVMemPerSimd[simdId] +=
+            computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
                computeUnit->vrf_gm_bus_latency;
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(computeUnit->srf_scm_bus_latency));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-            computeUnit->instCyclesScMemPerSimd[simdId] +=
+            computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
                computeUnit->srf_scm_bus_latency;
        }
    // GM or Flat as GM Store
@@ -1150,14 +1094,14 @@ Wavefront::exec()
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-            computeUnit->instCyclesVMemPerSimd[simdId] +=
+            computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-            computeUnit->instCyclesScMemPerSimd[simdId] +=
+            computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
@@ -1167,14 +1111,14 @@ Wavefront::exec()
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-            computeUnit->instCyclesVMemPerSimd[simdId] +=
+            computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-            computeUnit->instCyclesScMemPerSimd[simdId] +=
+            computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
    // LM or Flat as LM Load
@@ -1183,7 +1127,7 @@ Wavefront::exec()
            cyclesToTicks(computeUnit->vrf_lm_bus_latency));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesLdsPerSimd[simdId] +=
+        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
            computeUnit->vrf_lm_bus_latency;
    // LM or Flat as LM Store
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
@@ -1191,7 +1135,7 @@ Wavefront::exec()
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesLdsPerSimd[simdId] +=
+        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
    // LM or Flat as LM, Atomic or MemFence
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
@@ -1200,7 +1144,7 @@ Wavefront::exec()
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
-        computeUnit->instCyclesLdsPerSimd[simdId] +=
+        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
    } else {
        panic("Bad instruction type!\n");
@@ -1453,3 +1397,31 @@ Wavefront::releaseBarrier()
 {
    barId = WFBarrier::InvalidID;
 }
+
+Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
+    : Stats::Group(parent),
+      ADD_STAT(numInstrExecuted,
+               "number of instructions executed by this WF slot"),
+      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
+      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
+      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
+               "RF denied adding instruction"),
+      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
+               " not available"),
+      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
+               "RF reads to complete"),
+      ADD_STAT(schLdsArbStalls,
+               "number of cycles wave stalled due to LDS-VRF arbitration"),
+      // FIXME: the name of the WF needs to be unique
+      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
+               "instructions are blocked due to WAW or WAR dependencies"),
+      // FIXME: the name of the WF needs to be unique
+      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
+               "instructions are blocked due to RAW dependencies"),
+      ADD_STAT(vecRawDistance,
+               "Count of RAW distance in dynamic instructions for this WF"),
+      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
+{
+    vecRawDistance.init(0, 20, 1); // (min, max, bucket) -- TODO confirm
+    readsPerWrite.init(0, 4, 1); // (min, max, bucket) -- TODO confirm
+}
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -43,6 +43,8 @@

 #include "arch/gpu_isa.hh"
 #include "base/logging.hh"
+#include "base/statistics.hh"
+#include "base/stats/group.hh"
 #include "base/types.hh"
 #include "config/the_gpu_isa.hh"
 #include "gpu-compute/compute_unit.hh"
@@ -217,52 +219,13 @@ class Wavefront : public SimObject
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

-    // Wavefront slot stats
-
-    // Number of instructions executed by this wavefront slot across all
-    // dynamic wavefronts
-    Stats::Scalar numInstrExecuted;
-
-    // Number of cycles this WF spends in SCH stage
-    Stats::Scalar schCycles;
-
-    // Number of stall cycles encounterd by this WF in SCH stage
-    Stats::Scalar schStalls;
-
-    // The following stats sum to the value of schStalls, and record, per
-    // WF slot, what the cause of each stall was at a coarse granularity.
-
-    // Cycles WF is selected by scheduler, but RFs cannot support instruction
-    Stats::Scalar schRfAccessStalls;
-    // Cycles spent waiting for execution resources
-    Stats::Scalar schResourceStalls;
-    // cycles spent waiting for RF reads to complete in SCH stage
-    Stats::Scalar schOpdNrdyStalls;
-    // LDS arbitration stall cycles. WF attempts to execute LM instruction,
-    // but another wave is executing FLAT, which requires LM and GM and forces
-    // this WF to stall.
-    Stats::Scalar schLdsArbStalls;
-
-    // number of times an instruction of a WF is blocked from being issued
-    // due to WAR and WAW dependencies
-    Stats::Scalar numTimesBlockedDueWAXDependencies;
-    // number of times an instruction of a WF is blocked from being issued
-    // due to WAR and WAW dependencies
-    Stats::Scalar numTimesBlockedDueRAWDependencies;
-
    // dyn inst id (per SIMD) of last instruction exec from this wave
    uint64_t lastInstExec;

-    // Distribution to track the distance between producer and consumer
-    // for vector register values
-    Stats::Distribution vecRawDistance;
    // Map to track the dyn instruction id of each vector register value
    // produced, indexed by physical vector register ID
    std::unordered_map<int,uint64_t> rawDist;

-    // Distribution to track the number of times every vector register
-    // value produced is consumed.
-    Stats::Distribution readsPerWrite;
    // Counts the number of reads performed to each physical register
    // - counts are reset to 0 for each dynamic wavefront launched
    std::vector<int> vecReads;
@@ -289,7 +252,6 @@ class Wavefront : public SimObject
    // called by SCH stage to reserve
    std::vector<int> reserveResources();
    bool stopFetch();
-    void regStats();

    Addr pc() const;
    void pc(Addr new_pc);
@@ -357,6 +319,52 @@ class Wavefront : public SimObject
    Addr _pc;
    VectorMask _execMask;
    int barId;
+
+  public:
+    struct WavefrontStats : public Stats::Group
+    {
+        WavefrontStats(Stats::Group *parent);
+
+        // Number of instructions executed by this wavefront slot across all
+        // dynamic wavefronts
+        Stats::Scalar numInstrExecuted;
+
+        // Number of cycles this WF spends in SCH stage
+        Stats::Scalar schCycles;
+
+        // Number of stall cycles encountered by this WF in SCH stage
+        Stats::Scalar schStalls;
+
+        // The following stats sum to the value of schStalls, and record, per
+        // WF slot, what the cause of each stall was at a coarse granularity.
+
+        // Cycles WF is selected by scheduler, but RFs cannot support
+        // instruction
+        Stats::Scalar schRfAccessStalls;
+        // Cycles spent waiting for execution resources
+        Stats::Scalar schResourceStalls;
+        // cycles spent waiting for RF reads to complete in SCH stage
+        Stats::Scalar schOpdNrdyStalls;
+        // LDS arbitration stall cycles. WF attempts to execute LM instruction,
+        // but another wave is executing FLAT, which requires LM and GM and
+        // forces this WF to stall.
+        Stats::Scalar schLdsArbStalls;
+
+        // number of times an instruction of a WF is blocked from being issued
+        // due to WAR and WAW dependencies
+        Stats::Scalar numTimesBlockedDueWAXDependencies;
+        // number of times an instruction of a WF is blocked from being issued
+        // due to RAW dependencies
+        Stats::Scalar numTimesBlockedDueRAWDependencies;
+
+        // Distribution to track the distance between producer and consumer
+        // for vector register values
+        Stats::Distribution vecRawDistance;
+
+        // Distribution to track the number of times every vector register
+        // value produced is consumed.
+        Stats::Distribution readsPerWrite;
+    } stats;
 };

 #endif // __GPU_COMPUTE_WAVEFRONT_HH__