arch-gcn3,gpu-compute: Update stats style for GPU

Convert all gpu-compute stats to Stats::Group style.

Change-Id: I29116f1de53ae379210c6cfb5bed3fc74f50cca5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39135
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2021-01-14 10:29:37 -06:00
parent e8b37cc503
commit 5323cccfdd
39 changed files with 1156 additions and 1586 deletions

View File

@@ -3800,7 +3800,7 @@ namespace Gcn3ISA
wf->computeUnit->cu_id, wf->wgId, refCount);
wf->computeUnit->registerManager->freeRegisters(wf);
wf->computeUnit->completedWfs++;
wf->computeUnit->stats.completedWfs++;
wf->computeUnit->activeWaves--;
panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
@@ -3811,7 +3811,7 @@ namespace Gcn3ISA
for (int i = 0; i < wf->vecReads.size(); i++) {
if (wf->rawDist.find(i) != wf->rawDist.end()) {
wf->readsPerWrite.sample(wf->vecReads.at(i));
wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
}
}
wf->vecReads.clear();
@@ -3853,7 +3853,7 @@ namespace Gcn3ISA
if (!kernelEnd || !relNeeded) {
wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
wf->setStatus(Wavefront::S_STOPPED);
wf->computeUnit->completedWGs++;
wf->computeUnit->stats.completedWGs++;
return;
}
@@ -3877,7 +3877,7 @@ namespace Gcn3ISA
// call shader to prepare the flush operations
wf->computeUnit->shader->prepareFlush(gpuDynInst);
wf->computeUnit->completedWGs++;
wf->computeUnit->stats.completedWGs++;
} else {
wf->computeUnit->shader->dispatcher().scheduleDispatch();
}

View File

@@ -106,7 +106,8 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
_numBarrierSlots(p.num_barrier_slots),
globalSeqNum(0), wavefrontSize(p.wf_size),
scoreboardCheckToSchedule(p),
scheduleToExecute(p)
scheduleToExecute(p),
stats(this, p.n_wf)
{
/**
* This check is necessary because std::bitset only provides conversion
@@ -367,7 +368,7 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
w->initRegState(task, w->actualWgSzTotal);
w->start(_n_wave++, task->codeAddr());
waveLevelParallelism.sample(activeWaves);
stats.waveLevelParallelism.sample(activeWaves);
activeWaves++;
}
@@ -612,22 +613,22 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
freeWfSlots, numMappedWfs, vregAvail, sregAvail);
if (!vregAvail) {
++numTimesWgBlockedDueVgprAlloc;
++stats.numTimesWgBlockedDueVgprAlloc;
}
if (!sregAvail) {
++numTimesWgBlockedDueSgprAlloc;
++stats.numTimesWgBlockedDueSgprAlloc;
}
// Return true if enough WF slots to submit workgroup and if there are
// enough VGPRs to schedule all WFs to their SIMD units
bool ldsAvail = lds.canReserve(task->ldsSize());
if (!ldsAvail) {
wgBlockedDueLdsAllocation++;
stats.wgBlockedDueLdsAllocation++;
}
if (!barrier_avail) {
wgBlockedDueBarrierAllocation++;
stats.wgBlockedDueBarrierAllocation++;
}
// Return true if the following are all true:
@@ -734,7 +735,7 @@ ComputeUnit::exec()
scoreboardCheckStage.exec();
fetchStage.exec();
totalCycles++;
stats.totalCycles++;
// Put this CU to sleep if there is no more work to be done.
if (!isDone()) {
@@ -1032,8 +1033,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
fatal("pkt is not a read nor a write\n");
}
tlbCycles -= curTick();
++tlbRequests;
stats.tlbCycles -= curTick();
++stats.tlbRequests;
PortID tlbPort_index = perLaneTLB ? index : 0;
@@ -1075,7 +1076,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
assert(hit_level != -1);
hitsPerTLBLevel[hit_level]++;
stats.hitsPerTLBLevel[hit_level]++;
// New SenderState for the memory access
X86ISA::GpuTLB::TranslationState *sender_state =
@@ -1346,7 +1347,7 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
// for the first cache block.
if (compute_unit->headTailMap.count(gpuDynInst)) {
Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
compute_unit->headTailLatency.sample(curTick() - headTick);
compute_unit->stats.headTailLatency.sample(curTick() - headTick);
compute_unit->headTailMap.erase(gpuDynInst);
}
@@ -1381,7 +1382,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
pkt->req->getVaddr(), line);
assert(pkt->senderState);
computeUnit->tlbCycles += curTick();
computeUnit->stats.tlbCycles += curTick();
// pop off the TLB translation state
X86ISA::GpuTLB::TranslationState *translation_state =
@@ -1402,7 +1403,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
computeUnit->hitsPerTLBLevel[hit_level]++;
computeUnit->stats.hitsPerTLBLevel[hit_level]++;
delete translation_state->tlbEntry;
assert(!translation_state->ports.size());
@@ -1788,561 +1789,17 @@ ComputeUnit::ITLBPort::recvReqRetry()
}
}
void
ComputeUnit::regStats()
{
ClockedObject::regStats();
vALUInsts
.name(name() + ".valu_insts")
.desc("Number of vector ALU insts issued.")
;
vALUInstsPerWF
.name(name() + ".valu_insts_per_wf")
.desc("The avg. number of vector ALU insts issued per-wavefront.")
;
sALUInsts
.name(name() + ".salu_insts")
.desc("Number of scalar ALU insts issued.")
;
sALUInstsPerWF
.name(name() + ".salu_insts_per_wf")
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
;
instCyclesVALU
.name(name() + ".inst_cycles_valu")
.desc("Number of cycles needed to execute VALU insts.")
;
instCyclesSALU
.name(name() + ".inst_cycles_salu")
.desc("Number of cycles needed to execute SALU insts.")
;
threadCyclesVALU
.name(name() + ".thread_cycles_valu")
.desc("Number of thread cycles used to execute vector ALU ops. "
"Similar to instCyclesVALU but multiplied by the number of "
"active threads.")
;
vALUUtilization
.name(name() + ".valu_utilization")
.desc("Percentage of active vector ALU threads in a wave.")
;
ldsNoFlatInsts
.name(name() + ".lds_no_flat_insts")
.desc("Number of LDS insts issued, not including FLAT "
"accesses that resolve to LDS.")
;
ldsNoFlatInstsPerWF
.name(name() + ".lds_no_flat_insts_per_wf")
.desc("The avg. number of LDS insts (not including FLAT "
"accesses that resolve to LDS) per-wavefront.")
;
flatVMemInsts
.name(name() + ".flat_vmem_insts")
.desc("The number of FLAT insts that resolve to vmem issued.")
;
flatVMemInstsPerWF
.name(name() + ".flat_vmem_insts_per_wf")
.desc("The average number of FLAT insts that resolve to vmem "
"issued per-wavefront.")
;
flatLDSInsts
.name(name() + ".flat_lds_insts")
.desc("The number of FLAT insts that resolve to LDS issued.")
;
flatLDSInstsPerWF
.name(name() + ".flat_lds_insts_per_wf")
.desc("The average number of FLAT insts that resolve to LDS "
"issued per-wavefront.")
;
vectorMemWrites
.name(name() + ".vector_mem_writes")
.desc("Number of vector mem write insts (excluding FLAT insts).")
;
vectorMemWritesPerWF
.name(name() + ".vector_mem_writes_per_wf")
.desc("The average number of vector mem write insts "
"(excluding FLAT insts) per-wavefront.")
;
vectorMemReads
.name(name() + ".vector_mem_reads")
.desc("Number of vector mem read insts (excluding FLAT insts).")
;
vectorMemReadsPerWF
.name(name() + ".vector_mem_reads_per_wf")
.desc("The avg. number of vector mem read insts (excluding "
"FLAT insts) per-wavefront.")
;
scalarMemWrites
.name(name() + ".scalar_mem_writes")
.desc("Number of scalar mem write insts.")
;
scalarMemWritesPerWF
.name(name() + ".scalar_mem_writes_per_wf")
.desc("The average number of scalar mem write insts per-wavefront.")
;
scalarMemReads
.name(name() + ".scalar_mem_reads")
.desc("Number of scalar mem read insts.")
;
scalarMemReadsPerWF
.name(name() + ".scalar_mem_reads_per_wf")
.desc("The average number of scalar mem read insts per-wavefront.")
;
vALUInstsPerWF = vALUInsts / completedWfs;
sALUInstsPerWF = sALUInsts / completedWfs;
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
vectorMemReadsPerWF = vectorMemReads / completedWfs;
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
scalarMemReadsPerWF = scalarMemReads / completedWfs;
vectorMemReadsPerKiloInst
.name(name() + ".vector_mem_reads_per_kilo_inst")
.desc("Number of vector mem reads per kilo-instruction")
;
vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
vectorMemWritesPerKiloInst
.name(name() + ".vector_mem_writes_per_kilo_inst")
.desc("Number of vector mem writes per kilo-instruction")
;
vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
vectorMemInstsPerKiloInst
.name(name() + ".vector_mem_insts_per_kilo_inst")
.desc("Number of vector mem insts per kilo-instruction")
;
vectorMemInstsPerKiloInst =
((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
scalarMemReadsPerKiloInst
.name(name() + ".scalar_mem_reads_per_kilo_inst")
.desc("Number of scalar mem reads per kilo-instruction")
;
scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
scalarMemWritesPerKiloInst
.name(name() + ".scalar_mem_writes_per_kilo_inst")
.desc("Number of scalar mem writes per kilo-instruction")
;
scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
scalarMemInstsPerKiloInst
.name(name() + ".scalar_mem_insts_per_kilo_inst")
.desc("Number of scalar mem insts per kilo-instruction")
;
scalarMemInstsPerKiloInst =
((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
instCyclesVMemPerSimd
.init(numVectorALUs)
.name(name() + ".inst_cycles_vector_memory")
.desc("Number of cycles to send address, command, data from VRF to "
"vector memory unit, per SIMD")
;
instCyclesScMemPerSimd
.init(numVectorALUs)
.name(name() + ".inst_cycles_scalar_memory")
.desc("Number of cycles to send address, command, data from SRF to "
"scalar memory unit, per SIMD")
;
instCyclesLdsPerSimd
.init(numVectorALUs)
.name(name() + ".inst_cycles_lds")
.desc("Number of cycles to send address, command, data from VRF to "
"LDS unit, per SIMD")
;
globalReads
.name(name() + ".global_mem_reads")
.desc("Number of reads to the global segment")
;
globalWrites
.name(name() + ".global_mem_writes")
.desc("Number of writes to the global segment")
;
globalMemInsts
.name(name() + ".global_mem_insts")
.desc("Number of memory instructions sent to the global segment")
;
globalMemInsts = globalReads + globalWrites;
argReads
.name(name() + ".arg_reads")
.desc("Number of reads to the arg segment")
;
argWrites
.name(name() + ".arg_writes")
.desc("NUmber of writes to the arg segment")
;
argMemInsts
.name(name() + ".arg_mem_insts")
.desc("Number of memory instructions sent to the arg segment")
;
argMemInsts = argReads + argWrites;
spillReads
.name(name() + ".spill_reads")
.desc("Number of reads to the spill segment")
;
spillWrites
.name(name() + ".spill_writes")
.desc("Number of writes to the spill segment")
;
spillMemInsts
.name(name() + ".spill_mem_insts")
.desc("Number of memory instructions sent to the spill segment")
;
spillMemInsts = spillReads + spillWrites;
groupReads
.name(name() + ".group_reads")
.desc("Number of reads to the group segment")
;
groupWrites
.name(name() + ".group_writes")
.desc("Number of writes to the group segment")
;
groupMemInsts
.name(name() + ".group_mem_insts")
.desc("Number of memory instructions sent to the group segment")
;
groupMemInsts = groupReads + groupWrites;
privReads
.name(name() + ".private_reads")
.desc("Number of reads to the private segment")
;
privWrites
.name(name() + ".private_writes")
.desc("Number of writes to the private segment")
;
privMemInsts
.name(name() + ".private_mem_insts")
.desc("Number of memory instructions sent to the private segment")
;
privMemInsts = privReads + privWrites;
readonlyReads
.name(name() + ".readonly_reads")
.desc("Number of reads to the readonly segment")
;
readonlyWrites
.name(name() + ".readonly_writes")
.desc("Number of memory instructions sent to the readonly segment")
;
readonlyMemInsts
.name(name() + ".readonly_mem_insts")
.desc("Number of memory instructions sent to the readonly segment")
;
readonlyMemInsts = readonlyReads + readonlyWrites;
kernargReads
.name(name() + ".kernarg_reads")
.desc("Number of reads sent to the kernarg segment")
;
kernargWrites
.name(name() + ".kernarg_writes")
.desc("Number of memory instructions sent to the kernarg segment")
;
kernargMemInsts
.name(name() + ".kernarg_mem_insts")
.desc("Number of memory instructions sent to the kernarg segment")
;
kernargMemInsts = kernargReads + kernargWrites;
tlbCycles
.name(name() + ".tlb_cycles")
.desc("total number of cycles for all uncoalesced requests")
;
tlbRequests
.name(name() + ".tlb_requests")
.desc("number of uncoalesced requests")
;
tlbLatency
.name(name() + ".avg_translation_latency")
.desc("Avg. translation latency for data translations")
;
tlbLatency = tlbCycles / tlbRequests;
hitsPerTLBLevel
.init(4)
.name(name() + ".TLB_hits_distribution")
.desc("TLB hits distribution (0 for page table, x for Lx-TLB")
;
// fixed number of TLB levels
for (int i = 0; i < 4; ++i) {
if (!i)
hitsPerTLBLevel.subname(i,"page_table");
else
hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
}
execRateDist
.init(0, 10, 2)
.name(name() + ".inst_exec_rate")
.desc("Instruction Execution Rate: Number of executed vector "
"instructions per cycle")
;
ldsBankConflictDist
.init(0, wfSize(), 2)
.name(name() + ".lds_bank_conflicts")
.desc("Number of bank conflicts per LDS memory packet")
;
ldsBankAccesses
.name(name() + ".lds_bank_access_cnt")
.desc("Total number of LDS bank accesses")
;
pageDivergenceDist
// A wavefront can touch up to N pages per memory instruction where
// N is equal to the wavefront size
// The number of pages per bin can be configured (here it's 4).
.init(1, wfSize(), 4)
.name(name() + ".page_divergence_dist")
.desc("pages touched per wf (over all mem. instr.)")
;
controlFlowDivergenceDist
.init(1, wfSize(), 4)
.name(name() + ".warp_execution_dist")
.desc("number of lanes active per instruction (oval all instructions)")
;
activeLanesPerGMemInstrDist
.init(1, wfSize(), 4)
.name(name() + ".gmem_lanes_execution_dist")
.desc("number of active lanes per global memory instruction")
;
activeLanesPerLMemInstrDist
.init(1, wfSize(), 4)
.name(name() + ".lmem_lanes_execution_dist")
.desc("number of active lanes per local memory instruction")
;
numInstrExecuted
.name(name() + ".num_instr_executed")
.desc("number of instructions executed")
;
numVecOpsExecuted
.name(name() + ".num_vec_ops_executed")
.desc("number of vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedF16
.name(name() + ".num_vec_ops_f16_executed")
.desc("number of f16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedF32
.name(name() + ".num_vec_ops_f32_executed")
.desc("number of f32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedF64
.name(name() + ".num_vec_ops_f64_executed")
.desc("number of f64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedFMA16
.name(name() + ".num_vec_ops_fma16_executed")
.desc("number of fma16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedFMA32
.name(name() + ".num_vec_ops_fma32_executed")
.desc("number of fma32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedFMA64
.name(name() + ".num_vec_ops_fma64_executed")
.desc("number of fma64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAD16
.name(name() + ".num_vec_ops_mad16_executed")
.desc("number of mad16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAD32
.name(name() + ".num_vec_ops_mad32_executed")
.desc("number of mad32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAD64
.name(name() + ".num_vec_ops_mad64_executed")
.desc("number of mad64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAC16
.name(name() + ".num_vec_ops_mac16_executed")
.desc("number of mac16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAC32
.name(name() + ".num_vec_ops_mac32_executed")
.desc("number of mac32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAC64
.name(name() + ".num_vec_ops_mac64_executed")
.desc("number of mac64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedTwoOpFP
.name(name() + ".num_vec_ops_two_op_fp_executed")
.desc("number of two op FP vec ops executed (e.g. WF size/inst)")
;
totalCycles
.name(name() + ".num_total_cycles")
.desc("number of cycles the CU ran for")
;
ipc
.name(name() + ".ipc")
.desc("Instructions per cycle (this CU only)")
;
vpc
.name(name() + ".vpc")
.desc("Vector Operations per cycle (this CU only)")
;
vpc_f16
.name(name() + ".vpc_f16")
.desc("F16 Vector Operations per cycle (this CU only)")
;
vpc_f32
.name(name() + ".vpc_f32")
.desc("F32 Vector Operations per cycle (this CU only)")
;
vpc_f64
.name(name() + ".vpc_f64")
.desc("F64 Vector Operations per cycle (this CU only)")
;
numALUInstsExecuted
.name(name() + ".num_alu_insts_executed")
.desc("Number of dynamic non-GM memory insts executed")
;
wgBlockedDueBarrierAllocation
.name(name() + ".wg_blocked_due_barrier_alloc")
.desc("WG dispatch was blocked due to lack of barrier resources")
;
wgBlockedDueLdsAllocation
.name(name() + ".wg_blocked_due_lds_alloc")
.desc("Workgroup blocked due to LDS capacity")
;
ipc = numInstrExecuted / totalCycles;
vpc = numVecOpsExecuted / totalCycles;
vpc_f16 = numVecOpsExecutedF16 / totalCycles;
vpc_f32 = numVecOpsExecutedF32 / totalCycles;
vpc_f64 = numVecOpsExecutedF64 / totalCycles;
numTimesWgBlockedDueVgprAlloc
.name(name() + ".times_wg_blocked_due_vgpr_alloc")
.desc("Number of times WGs are blocked due to VGPR allocation per "
"SIMD")
;
numTimesWgBlockedDueSgprAlloc
.name(name() + ".times_wg_blocked_due_sgpr_alloc")
.desc("Number of times WGs are blocked due to SGPR allocation per "
"SIMD")
;
dynamicGMemInstrCnt
.name(name() + ".global_mem_instr_cnt")
.desc("dynamic non-flat global memory instruction count")
;
dynamicFlatMemInstrCnt
.name(name() + ".flat_global_mem_instr_cnt")
.desc("dynamic flat global memory instruction count")
;
dynamicLMemInstrCnt
.name(name() + ".local_mem_instr_cnt")
.desc("dynamic local memory intruction count")
;
numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
dynamicLMemInstrCnt;
completedWfs
.name(name() + ".num_completed_wfs")
.desc("number of completed wavefronts")
;
completedWGs
.name(name() + ".num_completed_wgs")
.desc("number of completed workgroups")
;
numCASOps
.name(name() + ".num_CAS_ops")
.desc("number of compare and swap operations")
;
numFailedCASOps
.name(name() + ".num_failed_CAS_ops")
.desc("number of compare and swap operations that failed")
;
headTailLatency
.init(0, 1000000, 10000)
.name(name() + ".head_tail_latency")
.desc("ticks between first and last cache block arrival at coalescer")
.flags(Stats::pdf | Stats::oneline)
;
waveLevelParallelism
.init(0, shader->n_wf * numVectorALUs, 1)
.name(name() + ".wlp")
.desc("wave level parallelism: count of active waves at wave launch")
;
instInterleave
.init(numVectorALUs, 0, 20, 1)
.name(name() + ".interleaving")
.desc("Measure of instruction interleaving per SIMD")
;
// register stats of pipeline stages
fetchStage.regStats();
scoreboardCheckStage.regStats();
scheduleStage.regStats();
execStage.regStats();
// register stats of memory pipelines
globalMemoryPipe.regStats();
localMemoryPipe.regStats();
scalarMemoryPipe.regStats();
registerManager->regStats();
}
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
if (gpuDynInst->isScalar()) {
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
sALUInsts++;
instCyclesSALU++;
stats.sALUInsts++;
stats.instCyclesSALU++;
} else if (gpuDynInst->isLoad()) {
scalarMemReads++;
stats.scalarMemReads++;
} else if (gpuDynInst->isStore()) {
scalarMemWrites++;
stats.scalarMemWrites++;
}
} else {
if (gpuDynInst->isALU()) {
@@ -2350,45 +1807,46 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
if (shader->total_valu_insts == shader->max_valu_insts) {
exitSimLoop("max vALU insts");
}
vALUInsts++;
instCyclesVALU++;
threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
stats.vALUInsts++;
stats.instCyclesVALU++;
stats.threadCyclesVALU
+= gpuDynInst->wavefront()->execMask().count();
} else if (gpuDynInst->isFlat()) {
if (gpuDynInst->isLocalMem()) {
flatLDSInsts++;
stats.flatLDSInsts++;
} else {
flatVMemInsts++;
stats.flatVMemInsts++;
}
} else if (gpuDynInst->isLocalMem()) {
ldsNoFlatInsts++;
stats.ldsNoFlatInsts++;
} else if (gpuDynInst->isLoad()) {
vectorMemReads++;
stats.vectorMemReads++;
} else if (gpuDynInst->isStore()) {
vectorMemWrites++;
stats.vectorMemWrites++;
}
if (gpuDynInst->isLoad()) {
switch (gpuDynInst->executedAs()) {
case Enums::SC_SPILL:
spillReads++;
stats.spillReads++;
break;
case Enums::SC_GLOBAL:
globalReads++;
stats.globalReads++;
break;
case Enums::SC_GROUP:
groupReads++;
stats.groupReads++;
break;
case Enums::SC_PRIVATE:
privReads++;
stats.privReads++;
break;
case Enums::SC_READONLY:
readonlyReads++;
stats.readonlyReads++;
break;
case Enums::SC_KERNARG:
kernargReads++;
stats.kernargReads++;
break;
case Enums::SC_ARG:
argReads++;
stats.argReads++;
break;
case Enums::SC_NONE:
/**
@@ -2403,25 +1861,25 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
} else if (gpuDynInst->isStore()) {
switch (gpuDynInst->executedAs()) {
case Enums::SC_SPILL:
spillWrites++;
stats.spillWrites++;
break;
case Enums::SC_GLOBAL:
globalWrites++;
stats.globalWrites++;
break;
case Enums::SC_GROUP:
groupWrites++;
stats.groupWrites++;
break;
case Enums::SC_PRIVATE:
privWrites++;
stats.privWrites++;
break;
case Enums::SC_READONLY:
readonlyWrites++;
stats.readonlyWrites++;
break;
case Enums::SC_KERNARG:
kernargWrites++;
stats.kernargWrites++;
break;
case Enums::SC_ARG:
argWrites++;
stats.argWrites++;
break;
case Enums::SC_NONE:
/**
@@ -2636,3 +2094,241 @@ ComputeUnit::LDSPort::recvReqRetry()
}
}
}
/**
 * Build the statistics group of a compute unit.
 *
 * Every stat is registered with the group through ADD_STAT in the member
 * initializer list; the body then sizes the vector and distribution stats
 * and binds the formula stats to the counters they are derived from.
 *
 * @param parent Stats group that owns this one. It is down-cast to
 *               ComputeUnit, so it must be the owning ComputeUnit.
 * @param n_wf   Multiplied by the CU's vector ALU count to give the upper
 *               bound of the waveLevelParallelism distribution.
 */
ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
    : Stats::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "per-wavefront."),
      ADD_STAT(instCyclesVALU,
               "Number of cycles needed to execute VALU insts."),
      ADD_STAT(instCyclesSALU,
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
      ADD_STAT(vALUUtilization,
               "Percentage of active vector ALU threads in a wave."),
      ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
               " accesses that resolve to LDS."),
      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
               "including FLAT accesses that resolve to LDS) per-wavefront."),
      ADD_STAT(flatVMemInsts,
               "The number of FLAT insts that resolve to vmem issued."),
      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
               "resolve to vmem issued per-wavefront."),
      ADD_STAT(flatLDSInsts,
               "The number of FLAT insts that resolve to LDS issued."),
      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
               "resolve to LDS issued per-wavefront."),
      ADD_STAT(vectorMemWrites,
               "Number of vector mem write insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
               "insts (excluding FLAT insts) per-wavefront."),
      ADD_STAT(vectorMemReads,
               "Number of vector mem read insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
               "(excluding FLAT insts) per-wavefront."),
      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
      ADD_STAT(scalarMemWritesPerWF,
               "The average number of scalar mem write insts per-wavefront."),
      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
      ADD_STAT(scalarMemReadsPerWF,
               "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
      ADD_STAT(globalMemInsts,
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
      ADD_STAT(argMemInsts,
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
      ADD_STAT(spillMemInsts,
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
      ADD_STAT(groupMemInsts,
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
      ADD_STAT(privMemInsts,
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
      ADD_STAT(readonlyWrites, "Number of writes to the readonly segment"),
      ADD_STAT(readonlyMemInsts,
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
      ADD_STAT(kernargWrites, "Number of writes to the kernarg segment"),
      ADD_STAT(kernargMemInsts,
               "Number of memory instructions sent to the kernarg segment"),
      ADD_STAT(waveLevelParallelism,
               "wave level parallelism: count of active waves at wave launch"),
      ADD_STAT(tlbRequests, "number of uncoalesced requests"),
      ADD_STAT(tlbCycles,
               "total number of cycles for all uncoalesced requests"),
      ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
      ADD_STAT(hitsPerTLBLevel,
               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
      ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
      ADD_STAT(ldsBankConflictDist,
               "Number of bank conflicts per LDS memory packet"),
      ADD_STAT(pageDivergenceDist,
               "pages touched per wf (over all mem. instr.)"),
      ADD_STAT(dynamicGMemInstrCnt,
               "dynamic non-flat global memory instruction count"),
      ADD_STAT(dynamicFlatMemInstrCnt,
               "dynamic flat global memory instruction count"),
      ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
      ADD_STAT(wgBlockedDueBarrierAllocation,
               "WG dispatch was blocked due to lack of barrier resources"),
      ADD_STAT(wgBlockedDueLdsAllocation,
               "Workgroup blocked due to LDS capacity"),
      ADD_STAT(numInstrExecuted, "number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
      ADD_STAT(numVecOpsExecuted,
               "number of vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF16,
               "number of f16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF32,
               "number of f32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF64,
               "number of f64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA16,
               "number of fma16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA32,
               "number of fma32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA64,
               "number of fma64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC16,
               "number of mac16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC32,
               "number of mac32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC64,
               "number of mac64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD16,
               "number of mad16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD32,
               "number of mad32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD64,
               "number of mad64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedTwoOpFP,
               "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
      ADD_STAT(numALUInstsExecuted,
               "Number of dynamic non-GM memory insts executed"),
      ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
               "blocked due to VGPR allocation per SIMD"),
      ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
               "blocked due to SGPR allocation per SIMD"),
      ADD_STAT(numCASOps, "number of compare and swap operations"),
      ADD_STAT(numFailedCASOps,
               "number of compare and swap operations that failed"),
      ADD_STAT(completedWfs, "number of completed wavefronts"),
      ADD_STAT(completedWGs, "number of completed workgroups"),
      ADD_STAT(headTailLatency, "ticks between first and last cache block "
               "arrival at coalescer"),
      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
{
    // The sizes below come from the owning compute unit.
    ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
    const auto num_valus = cu->numVectorALUs;
    const auto wf_size = cu->wfSize();

    // Index 0 of hitsPerTLBLevel is the page table, index x is the Lx TLB.
    const int num_tlb_levels = 4;

    // Per-SIMD vectors: one entry per vector ALU.
    instCyclesVMemPerSimd.init(num_valus);
    instCyclesScMemPerSimd.init(num_valus);
    instCyclesLdsPerSimd.init(num_valus);

    hitsPerTLBLevel.init(num_tlb_levels);
    execRateDist.init(0, 10, 2);
    ldsBankConflictDist.init(0, wf_size, 2);

    // Lane/page distributions range from 1 up to the wavefront size.
    pageDivergenceDist.init(1, wf_size, 4);
    controlFlowDivergenceDist.init(1, wf_size, 4);
    activeLanesPerGMemInstrDist.init(1, wf_size, 4);
    activeLanesPerLMemInstrDist.init(1, wf_size, 4);

    headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline);
    waveLevelParallelism.init(0, n_wf * num_valus, 1);
    instInterleave.init(num_valus, 0, 20, 1);

    // Per-wavefront averages are normalised by the completed wavefronts.
    vALUInstsPerWF = vALUInsts / completedWfs;
    sALUInstsPerWF = sALUInsts / completedWfs;
    // Normalise by the CU's wavefront size instead of a hard-coded 64 so
    // the percentage is relative to the lanes a wavefront actually has.
    vALUUtilization = (threadCyclesVALU / (wf_size * instCyclesVALU)) * 100;
    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
    vectorMemReadsPerWF = vectorMemReads / completedWfs;
    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
    scalarMemReadsPerWF = scalarMemReads / completedWfs;

    // Memory instruction mix per thousand executed instructions.
    vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
    vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
    vectorMemInstsPerKiloInst =
        ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
    scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
    scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
    scalarMemInstsPerKiloInst =
        ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;

    // Per-segment totals are the sum of that segment's reads and writes.
    globalMemInsts = globalReads + globalWrites;
    argMemInsts = argReads + argWrites;
    spillMemInsts = spillReads + spillWrites;
    groupMemInsts = groupReads + groupWrites;
    privMemInsts = privReads + privWrites;
    readonlyMemInsts = readonlyReads + readonlyWrites;
    kernargMemInsts = kernargReads + kernargWrites;

    tlbLatency = tlbCycles / tlbRequests;

    // fixed number of TLB levels
    for (int i = 0; i < num_tlb_levels; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i,"page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
    }

    // Throughput formulas, all normalised by the cycles this CU ran for.
    ipc = numInstrExecuted / totalCycles;
    vpc = numVecOpsExecuted / totalCycles;
    vpc_f16 = numVecOpsExecutedF16 / totalCycles;
    vpc_f32 = numVecOpsExecutedF32 / totalCycles;
    vpc_f64 = numVecOpsExecutedF64 / totalCycles;

    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
        dynamicLMemInstrCnt;
}

View File

@@ -42,6 +42,7 @@
#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
@@ -320,12 +321,6 @@ class ComputeUnit : public ClockedObject
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number of
// instructions that have been executed on a SIMD between a WF executing
// two successive instructions.
Stats::VectorDistribution instInterleave;
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
@@ -472,148 +467,6 @@ class ComputeUnit : public ClockedObject
LdsState &lds;
public:
Stats::Scalar vALUInsts;
Stats::Formula vALUInstsPerWF;
Stats::Scalar sALUInsts;
Stats::Formula sALUInstsPerWF;
Stats::Scalar instCyclesVALU;
Stats::Scalar instCyclesSALU;
Stats::Scalar threadCyclesVALU;
Stats::Formula vALUUtilization;
Stats::Scalar ldsNoFlatInsts;
Stats::Formula ldsNoFlatInstsPerWF;
Stats::Scalar flatVMemInsts;
Stats::Formula flatVMemInstsPerWF;
Stats::Scalar flatLDSInsts;
Stats::Formula flatLDSInstsPerWF;
Stats::Scalar vectorMemWrites;
Stats::Formula vectorMemWritesPerWF;
Stats::Scalar vectorMemReads;
Stats::Formula vectorMemReadsPerWF;
Stats::Scalar scalarMemWrites;
Stats::Formula scalarMemWritesPerWF;
Stats::Scalar scalarMemReads;
Stats::Formula scalarMemReadsPerWF;
Stats::Formula vectorMemReadsPerKiloInst;
Stats::Formula vectorMemWritesPerKiloInst;
Stats::Formula vectorMemInstsPerKiloInst;
Stats::Formula scalarMemReadsPerKiloInst;
Stats::Formula scalarMemWritesPerKiloInst;
Stats::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
Stats::Vector instCyclesVMemPerSimd;
Stats::Vector instCyclesScMemPerSimd;
Stats::Vector instCyclesLdsPerSimd;
Stats::Scalar globalReads;
Stats::Scalar globalWrites;
Stats::Formula globalMemInsts;
Stats::Scalar argReads;
Stats::Scalar argWrites;
Stats::Formula argMemInsts;
Stats::Scalar spillReads;
Stats::Scalar spillWrites;
Stats::Formula spillMemInsts;
Stats::Scalar groupReads;
Stats::Scalar groupWrites;
Stats::Formula groupMemInsts;
Stats::Scalar privReads;
Stats::Scalar privWrites;
Stats::Formula privMemInsts;
Stats::Scalar readonlyReads;
Stats::Scalar readonlyWrites;
Stats::Formula readonlyMemInsts;
Stats::Scalar kernargReads;
Stats::Scalar kernargWrites;
Stats::Formula kernargMemInsts;
int activeWaves;
Stats::Distribution waveLevelParallelism;
void updateInstStats(GPUDynInstPtr gpuDynInst);
// the following stats compute the avg. TLB access latency per
// uncoalesced request (only for data)
Stats::Scalar tlbRequests;
Stats::Scalar tlbCycles;
Stats::Formula tlbLatency;
// hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
Stats::Vector hitsPerTLBLevel;
Stats::Scalar ldsBankAccesses;
Stats::Distribution ldsBankConflictDist;
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
Stats::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
Stats::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
Stats::Scalar dynamicFlatMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
Stats::Scalar wgBlockedDueBarrierAllocation;
Stats::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
Stats::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
Stats::Distribution execRateDist;
// number of individual vector operations executed
Stats::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
Stats::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
Stats::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
Stats::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedFMA16;
Stats::Scalar numVecOpsExecutedFMA32;
Stats::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAC16;
Stats::Scalar numVecOpsExecutedMAC32;
Stats::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAD16;
Stats::Scalar numVecOpsExecutedMAD32;
Stats::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
Stats::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
Stats::Scalar totalCycles;
Stats::Formula vpc; // vector ops per cycle
Stats::Formula vpc_f16; // vector ops per cycle
Stats::Formula vpc_f32; // vector ops per cycle
Stats::Formula vpc_f64; // vector ops per cycle
Stats::Formula ipc; // vector instructions per cycle
Stats::Distribution controlFlowDivergenceDist;
Stats::Distribution activeLanesPerGMemInstrDist;
Stats::Distribution activeLanesPerLMemInstrDist;
// number of vector ALU instructions received
Stats::Formula numALUInstsExecuted;
// number of times a WG can not start due to lack of free VGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG can not start due to lack of free SGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueSgprAlloc;
Stats::Scalar numCASOps;
Stats::Scalar numFailedCASOps;
Stats::Scalar completedWfs;
Stats::Scalar completedWGs;
// distribution of the latency difference between first and last cache
// block arrival ticks
Stats::Distribution headTailLatency;
void
regStats() override;
LdsState &
getLds() const
{
@@ -1081,6 +934,158 @@ class ComputeUnit : public ClockedObject
// a particular GPUDynInst. This is used to calculate the difference
// between the first and last cache block arrival times.
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
public:
void updateInstStats(GPUDynInstPtr gpuDynInst);
int activeWaves;
// All statistics of one ComputeUnit, gathered in a single Stats::Group.
// The constructor static_casts parent to ComputeUnit* to size the per-SIMD
// vectors and the wavefront-size-bounded distributions, so parent must be
// the owning ComputeUnit. n_wf (multiplied by the number of vector ALUs)
// is the upper bound of waveLevelParallelism.
struct ComputeUnitStats : public Stats::Group
{
ComputeUnitStats(Stats::Group *parent, int n_wf);
// Every *PerWF formula below is the matching counter divided by
// completedWfs.
Stats::Scalar vALUInsts;
Stats::Formula vALUInstsPerWF;
Stats::Scalar sALUInsts;
Stats::Formula sALUInstsPerWF;
Stats::Scalar instCyclesVALU;
Stats::Scalar instCyclesSALU;
Stats::Scalar threadCyclesVALU;
// (threadCyclesVALU / (64 * instCyclesVALU)) * 100
Stats::Formula vALUUtilization;
Stats::Scalar ldsNoFlatInsts;
Stats::Formula ldsNoFlatInstsPerWF;
Stats::Scalar flatVMemInsts;
Stats::Formula flatVMemInstsPerWF;
Stats::Scalar flatLDSInsts;
Stats::Formula flatLDSInstsPerWF;
Stats::Scalar vectorMemWrites;
Stats::Formula vectorMemWritesPerWF;
Stats::Scalar vectorMemReads;
Stats::Formula vectorMemReadsPerWF;
Stats::Scalar scalarMemWrites;
Stats::Formula scalarMemWritesPerWF;
Stats::Scalar scalarMemReads;
Stats::Formula scalarMemReadsPerWF;
// Every *PerKiloInst formula is (count / numInstrExecuted) * 1000; the
// *MemInsts variants use reads + writes as the count.
Stats::Formula vectorMemReadsPerKiloInst;
Stats::Formula vectorMemWritesPerKiloInst;
Stats::Formula vectorMemInstsPerKiloInst;
Stats::Formula scalarMemReadsPerKiloInst;
Stats::Formula scalarMemWritesPerKiloInst;
Stats::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
Stats::Vector instCyclesVMemPerSimd;
Stats::Vector instCyclesScMemPerSimd;
Stats::Vector instCyclesLdsPerSimd;
// Per-segment read/write counters; each *MemInsts formula is the sum of
// the reads and writes counters declared just before it.
Stats::Scalar globalReads;
Stats::Scalar globalWrites;
Stats::Formula globalMemInsts;
Stats::Scalar argReads;
Stats::Scalar argWrites;
Stats::Formula argMemInsts;
Stats::Scalar spillReads;
Stats::Scalar spillWrites;
Stats::Formula spillMemInsts;
Stats::Scalar groupReads;
Stats::Scalar groupWrites;
Stats::Formula groupMemInsts;
Stats::Scalar privReads;
Stats::Scalar privWrites;
Stats::Formula privMemInsts;
Stats::Scalar readonlyReads;
Stats::Scalar readonlyWrites;
Stats::Formula readonlyMemInsts;
Stats::Scalar kernargReads;
Stats::Scalar kernargWrites;
Stats::Formula kernargMemInsts;
// distribution over [0, n_wf * number of vector ALUs], bucket size 1
Stats::Distribution waveLevelParallelism;
// the following stats compute the avg. TLB access latency per
// uncoalesced request (only for data); tlbLatency is
// tlbCycles / tlbRequests
Stats::Scalar tlbRequests;
Stats::Scalar tlbCycles;
Stats::Formula tlbLatency;
// hitsPerTLBLevel[x] are the hits in Level x TLB.
// x = 0 is the page table.
Stats::Vector hitsPerTLBLevel;
Stats::Scalar ldsBankAccesses;
Stats::Distribution ldsBankConflictDist;
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
Stats::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
Stats::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
Stats::Scalar dynamicFlatMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
Stats::Scalar wgBlockedDueBarrierAllocation;
Stats::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
Stats::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
Stats::Distribution execRateDist;
// number of individual vector operations executed
Stats::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
Stats::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
Stats::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
Stats::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedFMA16;
Stats::Scalar numVecOpsExecutedFMA32;
Stats::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAC16;
Stats::Scalar numVecOpsExecutedMAC32;
Stats::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAD16;
Stats::Scalar numVecOpsExecutedMAD32;
Stats::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
Stats::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
Stats::Scalar totalCycles;
Stats::Formula vpc; // vector ops per cycle (all types)
Stats::Formula vpc_f16; // f16 vector ops per cycle
Stats::Formula vpc_f32; // f32 vector ops per cycle
Stats::Formula vpc_f64; // f64 vector ops per cycle
Stats::Formula ipc; // instructions per cycle
Stats::Distribution controlFlowDivergenceDist;
Stats::Distribution activeLanesPerGMemInstrDist;
Stats::Distribution activeLanesPerLMemInstrDist;
// number of ALU instructions: numInstrExecuted minus dynamicGMemInstrCnt
// and dynamicLMemInstrCnt
Stats::Formula numALUInstsExecuted;
// number of times a WG cannot start due to lack of free VGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG cannot start due to lack of free SGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueSgprAlloc;
Stats::Scalar numCASOps;
Stats::Scalar numFailedCASOps;
Stats::Scalar completedWfs;
Stats::Scalar completedWGs;
// distribution of the latency difference between first and last cache
// block arrival ticks
Stats::Distribution headTailLatency;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number
// of instructions that have been executed on a SIMD between a WF
// executing two successive instructions.
Stats::VectorDistribution instInterleave;
} stats;
};
#endif // __COMPUTE_UNIT_HH__

View File

@@ -49,7 +49,7 @@ GPUDispatcher::GPUDispatcher(const Params &p)
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
tickEvent([this]{ exec(); },
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
dispatchActive(false)
dispatchActive(false), stats(this)
{
schedule(&tickEvent, 0);
}
@@ -58,21 +58,6 @@ GPUDispatcher::~GPUDispatcher()
{
}
void
GPUDispatcher::regStats()
{
numKernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
cyclesWaitingForDispatch
.name(name() + ".cycles_wait_dispatch")
.desc("number of cycles with outstanding wavefronts "
"that are waiting to be dispatched")
;
}
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
@@ -127,7 +112,7 @@ GPUDispatcher::unserialize(CheckpointIn &cp)
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
++numKernelLaunched;
++stats.numKernelLaunched;
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
task->kernelName(), task->dispatchId());
@@ -158,7 +143,7 @@ GPUDispatcher::exec()
DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
if (execIds.size() > 0) {
++cyclesWaitingForDispatch;
++stats.cyclesWaitingForDispatch;
}
/**
@@ -368,3 +353,11 @@ GPUDispatcher::scheduleDispatch()
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
// Builds the dispatcher statistics group under parent (no sub-group name
// is passed to Stats::Group) and registers both counters together with
// their descriptions. The body is empty because neither stat needs any
// further initialization.
GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent)
: Stats::Group(parent),
ADD_STAT(numKernelLaunched, "number of kernel launched"),
ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding "
"wavefronts that are waiting to be dispatched")
{
}

View File

@@ -48,6 +48,7 @@
#include <vector>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "dev/hsa/hsa_packet.hh"
#include "params/GPUDispatcher.hh"
#include "sim/sim_object.hh"
@@ -67,7 +68,6 @@ class GPUDispatcher : public SimObject
void serialize(CheckpointOut &cp) const override;
void unserialize(CheckpointIn &cp) override;
void regStats() override;
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
void setShader(Shader *new_shader);
void exec();
@@ -91,9 +91,15 @@ class GPUDispatcher : public SimObject
std::queue<int> doneIds;
// is there a kernel in execution?
bool dispatchActive;
/*statistics*/
Stats::Scalar numKernelLaunched;
Stats::Scalar cyclesWaitingForDispatch;
protected:
// Statistics of the GPU dispatcher, grouped under the group passed as
// parent.
struct GPUDispatcherStats : public Stats::Group
{
GPUDispatcherStats(Stats::Group *parent);
// number of kernels launched; incremented once per dispatch() call
Stats::Scalar numKernelLaunched;
// number of cycles with outstanding wavefronts that are waiting to be
// dispatched; incremented in exec() while execIds is non-empty
Stats::Scalar cyclesWaitingForDispatch;
} stats;
};
#endif // __GPU_COMPUTE_DISPATCHER_HH__

View File

@@ -46,10 +46,11 @@ ExecStage::ExecStage(const ComputeUnitParams &p, ComputeUnit &cu,
: computeUnit(cu), fromSchedule(from_schedule),
lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
executionResourcesUsed(0), _name(cu.name() + ".ExecStage"),
stats(&cu)
{
numTransActiveIdle = 0;
stats.numTransActiveIdle = 0;
idle_dur = 0;
}
@@ -64,22 +65,22 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
if (stage == IdleExec) {
// count cycles when no instruction to a specific execution resource
// is executed
numCyclesWithNoInstrTypeIssued[unitId]++;
stats.numCyclesWithNoInstrTypeIssued[unitId]++;
} else if (stage == BusyExec) {
// count the number of cycles an instruction to a specific execution
// resource type was issued
numCyclesWithInstrTypeIssued[unitId]++;
stats.numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
++executionResourcesUsed;
} else if (stage == PostExec) {
// count the number of transitions from active to idle
if (lastTimeInstExecuted && !thisTimeInstExecuted) {
++numTransActiveIdle;
++stats.numTransActiveIdle;
}
if (!lastTimeInstExecuted && thisTimeInstExecuted) {
idleDur.sample(idle_dur);
stats.idleDur.sample(idle_dur);
idle_dur = 0;
} else if (!thisTimeInstExecuted) {
idle_dur++;
@@ -89,11 +90,11 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
// track the number of cycles we either issued at least
// instruction or issued no instructions at all
if (instrExecuted) {
numCyclesWithInstrIssued++;
stats.numCyclesWithInstrIssued++;
} else {
numCyclesWithNoIssue++;
stats.numCyclesWithNoIssue++;
}
spc.sample(executionResourcesUsed);
stats.spc.sample(executionResourcesUsed);
}
}
@@ -196,57 +197,35 @@ ExecStage::exec()
collectStatistics(PostExec, 0);
}
void
ExecStage::regStats()
ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent)
: Stats::Group(parent, "ExecStage"),
ADD_STAT(numTransActiveIdle,
"number of CU transitions from active to idle"),
ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"),
ADD_STAT(numCyclesWithInstrIssued,
"number of cycles the CU issued at least one instruction"),
ADD_STAT(spc,
"Execution units active per cycle (Exec unit=SIMD,MemPipe)"),
ADD_STAT(idleDur, "duration of idle periods in cycles"),
ADD_STAT(numCyclesWithInstrTypeIssued, "Number of cycles at least one "
"instruction issued to execution resource type"),
ADD_STAT(numCyclesWithNoInstrTypeIssued, "Number of clks no instructions"
" issued to execution resource type")
{
numTransActiveIdle
.name(name() + ".num_transitions_active_to_idle")
.desc("number of CU transitions from active to idle")
;
ComputeUnit *compute_unit = static_cast<ComputeUnit*>(parent);
numCyclesWithNoIssue
.name(name() + ".num_cycles_with_no_issue")
.desc("number of cycles the CU issues nothing")
;
numCyclesWithInstrIssued
.name(name() + ".num_cycles_with_instr_issued")
.desc("number of cycles the CU issued at least one instruction")
;
spc
.init(0, computeUnit.numExeUnits(), 1)
.name(name() + ".spc")
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
;
idleDur
.init(0,75,5)
.name(name() + ".idle_duration_in_cycles")
.desc("duration of idle periods in cycles")
;
numCyclesWithInstrTypeIssued
.init(computeUnit.numExeUnits())
.name(name() + ".num_cycles_issue_exec_rsrc")
.desc("Number of cycles at least one instruction issued to "
"execution resource type")
;
numCyclesWithNoInstrTypeIssued
.init(computeUnit.numExeUnits())
.name(name() + ".num_cycles_no_issue_exec_rsrc")
.desc("Number of clks no instructions issued to execution "
"resource type")
;
spc.init(0, compute_unit->numExeUnits(), 1);
idleDur.init(0, 75, 5);
numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits());
numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits());
int c = 0;
for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
for (int i = 0; i < compute_unit->numVectorALUs; i++,c++) {
std::string s = "VectorALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
for (int i = 0; i < compute_unit->numScalarALUs; i++,c++) {
std::string s = "ScalarALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
@@ -256,7 +235,4 @@ ExecStage::regStats()
numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}

View File

@@ -39,7 +39,8 @@
#include <utility>
#include <vector>
#include "sim/stats.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
class ComputeUnit;
class ScheduleToExecute;
@@ -81,20 +82,6 @@ class ExecStage
void dumpDispList();
const std::string& name() const { return _name; }
void regStats();
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
// number of busy cycles
Stats::Scalar numCyclesWithInstrIssued;
// number of cycles during which at least one
// instruction was issued to an execution resource type
Stats::Vector numCyclesWithInstrTypeIssued;
// number of idle cycles during which the scheduler
// issued no instructions targeting a specific
// execution resource type
Stats::Vector numCyclesWithNoInstrTypeIssued;
// SIMDs active per cycle
Stats::Distribution spc;
private:
void collectStatistics(enum STAT_STATUS stage, int unitId);
@@ -105,11 +92,33 @@ class ExecStage
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
Stats::Scalar numTransActiveIdle;
Stats::Distribution idleDur;
int executionResourcesUsed;
uint64_t idle_dur;
const std::string _name;
protected:
// Statistics of the execute stage, registered as the "ExecStage" sub-group
// of parent. The constructor static_casts parent to ComputeUnit* in order
// to size and label the per-execution-resource vectors, so parent must be
// the owning ComputeUnit.
struct ExecStageStats : public Stats::Group
{
ExecStageStats(Stats::Group *parent);
// number of transitions from active to idle
Stats::Scalar numTransActiveIdle;
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
// number of busy cycles
Stats::Scalar numCyclesWithInstrIssued;
// execution resources (SIMDs and memory pipelines) active per cycle
Stats::Distribution spc;
// duration of idle periods in cycles
Stats::Distribution idleDur;
// number of cycles during which at least one
// instruction was issued to an execution resource type
Stats::Vector numCyclesWithInstrTypeIssued;
// number of idle cycles during which the scheduler
// issued no instructions targeting a specific
// execution resource type
Stats::Vector numCyclesWithNoInstrTypeIssued;
} stats;
};
#endif // __EXEC_STAGE_HH__

View File

@@ -38,7 +38,7 @@
FetchStage::FetchStage(const ComputeUnitParams &p, ComputeUnit &cu)
: numVectorALUs(p.num_SIMDs), computeUnit(cu),
_name(cu.name() + ".FetchStage")
_name(cu.name() + ".FetchStage"), stats(&cu)
{
for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p, cu);
@@ -79,7 +79,7 @@ FetchStage::processFetchReturn(PacketPtr pkt)
const unsigned num_instructions = pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst);
instFetchInstReturned.sample(num_instructions);
stats.instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
_fetchUnit[simdId].processFetchReturn(pkt);
}
@@ -90,13 +90,10 @@ FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
_fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
void
FetchStage::regStats()
FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent)
: Stats::Group(parent, "FetchStage"),
ADD_STAT(instFetchInstReturned, "For each instruction fetch request "
"received record how many instructions you got from it")
{
instFetchInstReturned
.init(1, 32, 1)
.name(name() + ".inst_fetch_instr_returned")
.desc("For each instruction fetch request recieved record how many "
"instructions you got from it")
;
instFetchInstReturned.init(1, 32, 1);
}

View File

@@ -38,6 +38,7 @@
#include <vector>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/fetch_unit.hh"
// Instruction fetch stage.
@@ -61,8 +62,6 @@ class FetchStage
// Stats related variables and methods
const std::string& name() const { return _name; }
void regStats();
Stats::Distribution instFetchInstReturned;
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
@@ -73,6 +72,14 @@ class FetchStage
// instantiated per VALU/SIMD
std::vector<FetchUnit> _fetchUnit;
const std::string _name;
protected:
// Statistics of the fetch stage, registered as the "FetchStage" sub-group
// of parent.
struct FetchStageStats : public Stats::Group
{
FetchStageStats(Stats::Group *parent);
// For each instruction fetch return, the number of instructions it
// carried (request size / sizeof(TheGpuISA::RawMachInst)); sampled in
// processFetchReturn().
Stats::Distribution instFetchInstReturned;
} stats;
};
#endif // __FETCH_STAGE_HH__

View File

@@ -48,7 +48,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
: computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
gmQueueSize(p.global_mem_queue_size),
maxWaveRequests(p.max_wave_requests), inflightStores(0),
inflightLoads(0)
inflightLoads(0), stats(&cu)
{
}
@@ -293,12 +293,10 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
mem_req->second.second = true;
}
void
GlobalMemPipeline::regStats()
GlobalMemPipeline::
GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
: Stats::Group(parent, "GlobalMemPipeline"),
ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
"are delayed before updating the VRF")
{
loadVrfBankConflictCycles
.name(name() + ".load_vrf_bank_conflict_cycles")
.desc("total number of cycles GM data are delayed before updating "
"the VRF")
;
}

View File

@@ -37,6 +37,8 @@
#include <queue>
#include <string>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
@@ -95,11 +97,10 @@ class GlobalMemPipeline
}
const std::string &name() const { return _name; }
void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
loadVrfBankConflictCycles += num_cycles;
stats.loadVrfBankConflictCycles += num_cycles;
}
bool coalescerReady(GPUDynInstPtr mp) const;
@@ -113,10 +114,6 @@ class GlobalMemPipeline
int gmQueueSize;
int maxWaveRequests;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
// The delay is due to VRF bank conflicts
Stats::Scalar loadVrfBankConflictCycles;
// Counters to track the inflight loads and stores
// so that we can provide the proper backpressure
// on the number of inflight memory operations.
@@ -144,6 +141,17 @@ class GlobalMemPipeline
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
protected:
// Statistics of the global memory pipeline, registered as the
// "GlobalMemPipeline" sub-group of parent.
struct GlobalMemPipelineStats : public Stats::Group
{
GlobalMemPipelineStats(Stats::Group *parent);
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
// The delay is due to VRF bank conflicts
// (accumulated through incLoadVRFBankConflictCycles())
Stats::Scalar loadVrfBankConflictCycles;
} stats;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__

View File

@@ -930,16 +930,16 @@ GPUDynInst::updateStats()
{
if (_staticInst->isLocalMem()) {
// access to LDS (shared) memory
cu->dynamicLMemInstrCnt++;
cu->stats.dynamicLMemInstrCnt++;
} else if (_staticInst->isFlat()) {
cu->dynamicFlatMemInstrCnt++;
cu->stats.dynamicFlatMemInstrCnt++;
} else {
// access to global memory
// update PageDivergence histogram
int number_pages_touched = cu->pagesTouched.size();
assert(number_pages_touched);
cu->pageDivergenceDist.sample(number_pages_touched);
cu->stats.pageDivergenceDist.sample(number_pages_touched);
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
@@ -962,7 +962,7 @@ GPUDynInst::updateStats()
// total number of memory instructions (dynamic)
// Atomics are counted as a single memory instruction.
// this is # memory instructions per wavefronts, not per workitem
cu->dynamicGMemInstrCnt++;
cu->stats.dynamicGMemInstrCnt++;
}
}

View File

@@ -63,12 +63,12 @@ class AtomicOpCAS : public TypedAtomicOpFunctor<T>
void
execute(T *b)
{
computeUnit->numCASOps++;
computeUnit->stats.numCASOps++;
if (*b == c) {
*b = s;
} else {
computeUnit->numFailedCASOps++;
computeUnit->stats.numFailedCASOps++;
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }

View File

@@ -67,7 +67,7 @@ namespace X86ISA
: ClockedObject(p), configAddress(0), size(p.size),
cleanupEvent([this]{ cleanup(); }, name(), false,
Event::Maximum_Pri),
exitEvent([this]{ exitCallback(); }, name())
exitEvent([this]{ exitCallback(); }, name()), stats(this)
{
assoc = p.assoc;
assert(assoc <= size);
@@ -402,12 +402,12 @@ namespace X86ISA
return tlb_hit;
}
localNumTLBAccesses++;
stats.localNumTLBAccesses++;
if (!entry) {
localNumTLBMisses++;
stats.localNumTLBMisses++;
} else {
localNumTLBHits++;
stats.localNumTLBHits++;
}
}
}
@@ -499,10 +499,10 @@ namespace X86ISA
DPRINTF(GPUTLB, "Paging enabled.\n");
// The vaddr already has the segment base applied.
TlbEntry *entry = lookup(vaddr);
localNumTLBAccesses++;
stats.localNumTLBAccesses++;
if (!entry) {
localNumTLBMisses++;
stats.localNumTLBMisses++;
if (timing) {
latency = missLatency1;
}
@@ -544,7 +544,7 @@ namespace X86ISA
DPRINTF(GPUTLB, "Miss was serviced.\n");
}
} else {
localNumTLBHits++;
stats.localNumTLBHits++;
if (timing) {
latency = hitLatency;
@@ -659,89 +659,6 @@ namespace X86ISA
{
}
/**
 * Name and describe every statistic of this TLB and define the derived
 * formulas (miss rates and average latency).
 */
void
GpuTLB::regStats()
{
    // Base-class statistics are registered before our own.
    ClockedObject::regStats();

    // Raw counters kept per lookup on this TLB.
    localNumTLBAccesses.name(name() + ".local_TLB_accesses");
    localNumTLBAccesses.desc("Number of TLB accesses");

    localNumTLBHits.name(name() + ".local_TLB_hits");
    localNumTLBHits.desc("Number of TLB hits");

    localNumTLBMisses.name(name() + ".local_TLB_misses");
    localNumTLBMisses.desc("Number of TLB misses");

    // Local miss rate, as a percentage of local accesses.
    localTLBMissRate.name(name() + ".local_TLB_miss_rate");
    localTLBMissRate.desc("TLB miss rate");
    localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;

    // Cycle accounting.
    accessCycles.name(name() + ".access_cycles");
    accessCycles.desc("Cycles spent accessing this TLB level");

    pageTableCycles.name(name() + ".page_table_cycles");
    pageTableCycles.desc("Cycles spent accessing the page table");

    numUniquePages.name(name() + ".unique_pages");
    numUniquePages.desc("Number of unique pages touched");

    localCycles.name(name() + ".local_cycles");
    localCycles.desc("Number of cycles spent in queue for all incoming reqs");

    // Average queueing latency: localCycles per local access.
    localLatency.name(name() + ".local_latency");
    localLatency.desc("Avg. latency over incoming coalesced reqs");
    localLatency = localCycles / localNumTLBAccesses;

    // Counters that are advanced by the request count of each access.
    globalNumTLBAccesses.name(name() + ".global_TLB_accesses");
    globalNumTLBAccesses.desc("Number of TLB accesses");

    globalNumTLBHits.name(name() + ".global_TLB_hits");
    globalNumTLBHits.desc("Number of TLB hits");

    globalNumTLBMisses.name(name() + ".global_TLB_misses");
    globalNumTLBMisses.desc("Number of TLB misses");

    // Global miss rate, as a percentage of global accesses.
    globalTLBMissRate.name(name() + ".global_TLB_miss_rate");
    globalTLBMissRate.desc("TLB miss rate");
    globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;

    avgReuseDistance.name(name() + ".avg_reuse_distance");
    avgReuseDistance.desc("avg. reuse distance over all pages (in ticks)");
}
/**
* Do the TLB lookup for this coalesced request and schedule
* another event <TLB access latency> cycles later.
@@ -768,10 +685,10 @@ namespace X86ISA
int req_cnt = sender_state->reqCnt.back();
if (update_stats) {
accessCycles -= (curTick() * req_cnt);
localCycles -= curTick();
stats.accessCycles -= (curTick() * req_cnt);
stats.localCycles -= curTick();
updatePageFootprint(virt_page_addr);
globalNumTLBAccesses += req_cnt;
stats.globalNumTLBAccesses += req_cnt;
}
tlbOutcome lookup_outcome = TLB_MISS;
@@ -795,11 +712,11 @@ namespace X86ISA
// the reqCnt has an entry per level, so its size tells us
// which level we are in
sender_state->hitLevel = sender_state->reqCnt.size();
globalNumTLBHits += req_cnt;
stats.globalNumTLBHits += req_cnt;
}
} else {
if (update_stats)
globalNumTLBMisses += req_cnt;
stats.globalNumTLBMisses += req_cnt;
}
/*
@@ -981,16 +898,16 @@ namespace X86ISA
handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
if (update_stats) {
accessCycles += (req_cnt * curTick());
localCycles += curTick();
stats.accessCycles += (req_cnt * curTick());
stats.localCycles += curTick();
}
} else if (outcome == TLB_MISS) {
DPRINTF(GPUTLB, "This is a TLB miss\n");
if (update_stats) {
accessCycles += (req_cnt*curTick());
localCycles += curTick();
stats.accessCycles += (req_cnt*curTick());
stats.localCycles += curTick();
}
if (hasMemSidePort) {
@@ -998,8 +915,8 @@ namespace X86ISA
// the reply back till when we propagate it to the coalescer
// above.
if (update_stats) {
accessCycles += (req_cnt * 1);
localCycles += 1;
stats.accessCycles += (req_cnt * 1);
stats.localCycles += 1;
}
/**
@@ -1022,7 +939,7 @@ namespace X86ISA
"addr %#x\n", virtPageAddr);
if (update_stats)
pageTableCycles -= (req_cnt*curTick());
stats.pageTableCycles -= (req_cnt*curTick());
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
@@ -1032,7 +949,7 @@ namespace X86ISA
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
pageTableCycles += (req_cnt*curTick());
stats.pageTableCycles += (req_cnt*curTick());
// Need to access the page table and update the TLB
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
@@ -1222,17 +1139,17 @@ namespace X86ISA
// functional mode means no coalescing
// global metrics are the same as the local metrics
if (update_stats) {
tlb->globalNumTLBAccesses++;
tlb->stats.globalNumTLBAccesses++;
if (success) {
sender_state->hitLevel = sender_state->reqCnt.size();
tlb->globalNumTLBHits++;
tlb->stats.globalNumTLBHits++;
}
}
if (!success) {
if (update_stats)
tlb->globalNumTLBMisses++;
tlb->stats.globalNumTLBMisses++;
if (tlb->hasMemSidePort) {
// there is a TLB below -> propagate down the TLB hierarchy
tlb->memSidePort[0]->sendFunctional(pkt);
@@ -1405,7 +1322,7 @@ namespace X86ISA
bool first_page_access = ret.second;
if (first_page_access) {
numUniquePages++;
stats.numUniquePages++;
} else {
int accessed_before;
accessed_before = curTick() - ret.first->second.lastTimeAccessed;
@@ -1417,7 +1334,7 @@ namespace X86ISA
if (accessDistance) {
ret.first->second.localTLBAccesses
.push_back(localNumTLBAccesses.value());
.push_back(stats.localNumTLBAccesses.value());
}
}
@@ -1506,11 +1423,36 @@ namespace X86ISA
}
if (!TLBFootprint.empty()) {
avgReuseDistance =
stats.avgReuseDistance =
sum_avg_reuse_distance_per_page / TLBFootprint.size();
}
//clear the TLBFootprint map
TLBFootprint.clear();
}
/**
 * Set up the statistics group of a GpuTLB.
 *
 * Each statistic is registered with this group through ADD_STAT in the
 * initializer list; the three Formula statistics are then bound to their
 * expressions in the constructor body.
 *
 * @param parent Forwarded unchanged to the Stats::Group base constructor.
 */
GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent)
    : Stats::Group(parent),
      // Local view of the TLB.
      ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"),
      ADD_STAT(localNumTLBHits, "Number of TLB hits"),
      ADD_STAT(localNumTLBMisses, "Number of TLB misses"),
      ADD_STAT(localTLBMissRate, "TLB miss rate"),
      // Global view of the TLB.
      ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"),
      ADD_STAT(globalNumTLBHits, "Number of TLB hits"),
      ADD_STAT(globalNumTLBMisses, "Number of TLB misses"),
      ADD_STAT(globalTLBMissRate, "TLB miss rate"),
      // Cycle accounting and page footprint.
      ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"),
      ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"),
      ADD_STAT(numUniquePages, "Number of unique pages touched"),
      ADD_STAT(localCycles,
               "Number of cycles spent in queue for all incoming reqs"),
      ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs"),
      ADD_STAT(avgReuseDistance,
               "avg. reuse distance over all pages (in ticks)")
{
    // Miss rates are expressed as a percentage of the matching access count.
    globalTLBMissRate = (100 * globalNumTLBMisses) / globalNumTLBAccesses;
    localTLBMissRate = (100 * localNumTLBMisses) / localNumTLBAccesses;

    // Latency is the locally accumulated cycles divided by local accesses.
    localLatency = localCycles / localNumTLBAccesses;
}
} // namespace X86ISA

View File

@@ -47,6 +47,7 @@
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/port.hh"
#include "mem/request.hh"
@@ -169,35 +170,6 @@ namespace X86ISA
int missLatency1;
int missLatency2;
// local_stats are as seen from the TLB
// without taking into account coalescing
Stats::Scalar localNumTLBAccesses;
Stats::Scalar localNumTLBHits;
Stats::Scalar localNumTLBMisses;
Stats::Formula localTLBMissRate;
// global_stats are as seen from the
// CU's perspective taking into account
// all coalesced requests.
Stats::Scalar globalNumTLBAccesses;
Stats::Scalar globalNumTLBHits;
Stats::Scalar globalNumTLBMisses;
Stats::Formula globalTLBMissRate;
// from the CU perspective (global)
Stats::Scalar accessCycles;
// from the CU perspective (global)
Stats::Scalar pageTableCycles;
Stats::Scalar numUniquePages;
// from the perspective of this TLB
Stats::Scalar localCycles;
// from the perspective of this TLB
Stats::Formula localLatency;
// I take the avg. per page and then
// the avg. over all pages.
Stats::Scalar avgReuseDistance;
void regStats() override;
void updatePageFootprint(Addr virt_page_addr);
void printAccessPattern();
@@ -426,6 +398,40 @@ namespace X86ISA
void exitCallback();
EventFunctionWrapper exitEvent;
protected:
struct GpuTLBStats : public Stats::Group
{
    GpuTLBStats(Stats::Group *parent);

    // Local statistics: what this TLB itself observes, with no
    // adjustment for coalescing.
    Stats::Scalar localNumTLBAccesses;
    Stats::Scalar localNumTLBHits;
    Stats::Scalar localNumTLBMisses;
    Stats::Formula localTLBMissRate;

    // Global statistics: the CU's point of view, which accounts for
    // every coalesced request.
    Stats::Scalar globalNumTLBAccesses;
    Stats::Scalar globalNumTLBHits;
    Stats::Scalar globalNumTLBMisses;
    Stats::Formula globalTLBMissRate;

    // Measured from the CU's (global) perspective.
    Stats::Scalar accessCycles;
    // Measured from the CU's (global) perspective.
    Stats::Scalar pageTableCycles;
    Stats::Scalar numUniquePages;
    // Measured from this TLB's perspective.
    Stats::Scalar localCycles;
    // Measured from this TLB's perspective.
    Stats::Formula localLatency;
    // Average of averages: first averaged per page, then averaged
    // across all pages.
    Stats::Scalar avgReuseDistance;
} stats;
};
}

View File

@@ -189,10 +189,10 @@ LdsState::processPacket(PacketPtr packet)
// the number of conflicts this packet will have when accessing the LDS
unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
// count the total number of physical LDS bank accessed
parent->ldsBankAccesses += bankAccesses;
parent->stats.ldsBankAccesses += bankAccesses;
// count the LDS bank conflicts. A number set to 1 indicates one
// access per bank maximum so there are no bank conflicts
parent->ldsBankConflictDist.sample(bankConflicts-1);
parent->stats.ldsBankConflictDist.sample(bankConflicts-1);
GPUDynInstPtr dynInst = getDynInstr(packet);
// account for the LDS bank conflict overhead

View File

@@ -43,7 +43,7 @@
// Constructor: keeps a reference to the owning compute unit, builds the
// pipeline name as cu.name() + ".LocalMemPipeline", and reads the queue size
// from the local_mem_queue_size parameter.
// NOTE: the two consecutive lmQueueSize initializer lines appear to be the
// replaced and replacing forms of a single line (this view carries no +/-
// markers); the second form also constructs the stats member from &cu.
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
: computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
lmQueueSize(p.local_mem_queue_size)
lmQueueSize(p.local_mem_queue_size), stats(&cu)
{
}
@@ -124,12 +124,11 @@ LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
lmIssuedRequests.push(gpuDynInst);
}
// Statistics setup for LocalMemPipeline, present here in two forms back to
// back because this view lists replaced and replacing lines without +/-
// markers:
//   - `void LocalMemPipeline::regStats()` together with the .name()/.desc()
//     chain in the body, which names the counter
//     name() + ".load_vrf_bank_conflict_cycles";
//   - the LocalMemPipelineStats constructor, which passes `parent` and the
//     group name "LocalMemPipeline" to Stats::Group and registers
//     loadVrfBankConflictCycles through ADD_STAT.
// Both forms attach the same description text to the counter.
void
LocalMemPipeline::regStats()
LocalMemPipeline::
LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent)
: Stats::Group(parent, "LocalMemPipeline"),
ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data "
"are delayed before updating the VRF")
{
loadVrfBankConflictCycles
.name(name() + ".load_vrf_bank_conflict_cycles")
.desc("total number of cycles LDS data are delayed before updating "
"the VRF")
;
}

View File

@@ -37,9 +37,10 @@
#include <queue>
#include <string>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
/*
* @file local_memory_pipeline.hh
@@ -75,19 +76,18 @@ class LocalMemPipeline
}
// Returns a const reference to the stored _name string.
const std::string& name() const { return _name; }
void regStats();
// Adds num_cycles to the loadVrfBankConflictCycles counter.
// NOTE: the two `+=` lines appear to be the replaced and replacing forms of
// the same statement (no +/- markers in this view): the first updates a
// counter held directly by the pipeline, the second the one inside `stats`.
void
incLoadVRFBankConflictCycles(int num_cycles)
{
loadVrfBankConflictCycles += num_cycles;
stats.loadVrfBankConflictCycles += num_cycles;
}
private:
ComputeUnit &computeUnit;
const std::string _name;
int lmQueueSize;
Stats::Scalar loadVrfBankConflictCycles;
// Local Memory Request Fifo: all shared memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> lmIssuedRequests;
@@ -95,6 +95,14 @@ class LocalMemPipeline
// Local Memory Response Fifo: all responses of shared memory
// requests are sent to this FIFO from LDS
std::queue<GPUDynInstPtr> lmReturnedRequests;
protected:
struct LocalMemPipelineStats : public Stats::Group
{
    LocalMemPipelineStats(Stats::Group *parent);

    // Total number of cycles LDS data are delayed before updating the
    // VRF.
    Stats::Scalar loadVrfBankConflictCycles;
} stats;
};
#endif // __LOCAL_MEMORY_PIPELINE_HH__

View File

@@ -49,7 +49,7 @@
#include "params/RegisterFile.hh"
RegisterFile::RegisterFile(const RegisterFileParams &p)
: SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs)
: SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs), stats(this)
{
fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
@@ -192,26 +192,15 @@ RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
{
}
// Statistics setup for RegisterFile, present here in two forms back to back
// because this view lists replaced and replacing lines without +/- markers:
//   - `void RegisterFile::regStats()` together with the .name()/.desc()
//     chains in the body, which name each counter name() + ".<suffix>";
//   - the RegisterFileStats constructor, which passes `parent` to
//     Stats::Group and registers the four counters through ADD_STAT.
// The description strings are the same in both forms.
void
RegisterFile::regStats()
RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent)
: Stats::Group(parent),
ADD_STAT(registerReads,
"Total number of DWORDs read from register file"),
ADD_STAT(registerWrites,
"Total number of DWORDS written to register file"),
ADD_STAT(sramReads,
"Total number of register file bank SRAM activations for reads"),
ADD_STAT(sramWrites,
"Total number of register file bank SRAM activations for writes")
{
// DWORD read/write counters.
registerReads
.name(name() + ".register_reads")
.desc("Total number of DWORDs read from register file")
;
registerWrites
.name(name() + ".register_writes")
.desc("Total number of DWORDS written to register file")
;
// Bank SRAM activation counters.
sramReads
.name(name() + ".sram_reads")
.desc("Total number of register file bank SRAM activations for reads")
;
sramWrites
.name(name() + ".sram_writes")
.desc("Total number of register file bank SRAM activations for writes")
;
}

View File

@@ -62,7 +62,6 @@ class RegisterFile : public SimObject
virtual ~RegisterFile();
virtual void setParent(ComputeUnit *_computeUnit);
// Returns the value of _numRegs.
int numRegs() const { return _numRegs; }
virtual void regStats() override;
// State functions
@@ -154,18 +153,23 @@ class RegisterFile : public SimObject
// number of registers in this register file
int _numRegs;
// Stats
// Total number of register reads, incremented once per DWORD per thread
Stats::Scalar registerReads;
// Total number of register writes, incremented once per DWORD per thread
Stats::Scalar registerWrites;
// Number of register file SRAM activations for reads.
// The register file may be implemented with multiple SRAMs. This stat
// tracks how many times the SRAMs are accessed for reads.
Stats::Scalar sramReads;
// Number of register file SRAM activations for writes
Stats::Scalar sramWrites;
struct RegisterFileStats : public Stats::Group
{
    RegisterFileStats(Stats::Group *parent);

    // Register reads in total, counted per DWORD per thread.
    Stats::Scalar registerReads;
    // Register writes in total, counted per DWORD per thread.
    Stats::Scalar registerWrites;

    // Read activations of the register file SRAM. Because the register
    // file may be built from several SRAMs, this stat records how many
    // times those SRAMs are accessed for reads.
    Stats::Scalar sramReads;
    // Write activations of the register file SRAM.
    Stats::Scalar sramWrites;
} stats;
};
#endif // __REGISTER_FILE_HH__

View File

@@ -129,9 +129,3 @@ RegisterManager::freeRegisters(Wavefront* w)
{
policy->freeRegisters(w);
}
// Forwards the regStats() call to the policy object; does nothing else.
void
RegisterManager::regStats()
{
policy->regStats();
}

View File

@@ -63,9 +63,6 @@ class RegisterManager : public SimObject
void setParent(ComputeUnit *cu);
void exec();
// Stats related variables and methods
void regStats();
// lookup virtual to physical register translation
int mapVgpr(Wavefront* w, int vgprIndex);
int mapSgpr(Wavefront* w, int sgprIndex);

View File

@@ -76,9 +76,6 @@ class RegisterManagerPolicy
// free all remaining registers held by specified WF
virtual void freeRegisters(Wavefront *w) = 0;
// stats
virtual void regStats() = 0;
protected:
ComputeUnit *cu;
};

View File

@@ -142,8 +142,3 @@ ScalarMemPipeline::exec()
computeUnit.cu_id, mp->simdId, mp->wfSlotId);
}
}
// Empty body: this function registers no statistics.
void
ScalarMemPipeline::regStats()
{
}

View File

@@ -85,7 +85,6 @@ class ScalarMemPipeline
}
// Returns a const reference to the stored _name string.
const std::string& name() const { return _name; }
void regStats();
private:
ComputeUnit &computeUnit;

View File

@@ -66,11 +66,11 @@ ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
if (regBusy(pSgpr)) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
w->stats.numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pSgpr);
w->numTimesBlockedDueRAWDependencies++;
w->stats.numTimesBlockedDueRAWDependencies++;
}
return false;
}
@@ -109,7 +109,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
int DWORDs = ii->getOperandSize(i) <= 4 ? 1
: ii->getOperandSize(i) / 4;
registerReads += DWORDs;
stats.registerReads += DWORDs;
}
}
@@ -128,7 +128,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
enqRegFreeEvent(physReg, tickDelay);
}
registerWrites += nRegs;
stats.registerWrites += nRegs;
}
}
}
@@ -152,7 +152,7 @@ ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
}
registerWrites += nRegs;
stats.registerWrites += nRegs;
}
}
}

View File

@@ -51,7 +51,7 @@ ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
_name(cu.name() + ".ScheduleStage"),
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
locMemBusRdy(false), locMemIssueRdy(false)
locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
for (int j = 0; j < cu.numExeUnits(); ++j) {
scheduler.emplace_back(p);
@@ -121,10 +121,10 @@ ScheduleStage::exec()
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListEmpty[j]++;
stats.rdyListEmpty[j]++;
continue;
}
rdyListNotEmpty[j]++;
stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
@@ -133,8 +133,8 @@ ScheduleStage::exec()
if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
wf->schCycles++;
addToSchListStalls[j]++;
wf->stats.schCycles++;
stats.addToSchListStalls[j]++;
} else {
if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
wf->incLGKMInstsIssued();
@@ -160,10 +160,10 @@ ScheduleStage::exec()
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListEmpty[j]++;
stats.rdyListEmpty[j]++;
continue;
}
rdyListNotEmpty[j]++;
stats.rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *wf = scheduler[j].chooseWave();
@@ -172,8 +172,8 @@ ScheduleStage::exec()
if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
wf->schCycles++;
addToSchListStalls[j]++;
wf->stats.schCycles++;
stats.addToSchListStalls[j]++;
}
}
@@ -241,17 +241,17 @@ ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
return true;
} else {
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
if (!accessSrfWr) {
rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
}
if (!accessVrfWr) {
rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
}
// Increment stall counts for WF
wf->schStalls++;
wf->schRfAccessStalls++;
wf->stats.schStalls++;
wf->stats.schRfAccessStalls++;
}
return false;
}
@@ -329,19 +329,19 @@ ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
return true;
} else {
// Number of stall cycles due to RF access denied
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
// Count number of denials due to each reason
// Multiple items may contribute to the denied request
if (!accessVrf) {
rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
}
if (!accessSrf) {
rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
}
// Increment stall counts for WF
wf->schStalls++;
wf->schRfAccessStalls++;
wf->stats.schStalls++;
wf->stats.schRfAccessStalls++;
DPRINTF(GPUSched, "schList[%d]: Could not add: "
"SIMD[%d] WV[%d]: %d: %s\n",
exeType, wf->simdId, wf->wfDynId,
@@ -424,26 +424,26 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
// TODO: Scalar NOP does not require SALU in hardware,
// and is executed out of IB directly.
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isEndOfKernel()) {
// EndPgm instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
}
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
|| gpu_dyn_inst->isALU()) {
// Barrier, Branch, or ALU instruction
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
@@ -451,19 +451,19 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
bool rdy = true;
if (!glbMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
}
if (!rdy) {
return false;
@@ -473,18 +473,18 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
bool rdy = true;
if (!scalarMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
}
if (!scalarMemBusRdy) {
rdy = false;
dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.scalarMemoryPipe
.isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+ wf->scalarWrGmReqsInPipe))
{
rdy = false;
dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
@@ -494,16 +494,16 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
bool rdy = true;
if (!locMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
}
if (!locMemBusRdy) {
rdy = false;
dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
@@ -513,24 +513,24 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
bool rdy = true;
if (!glbMemIssueRdy || !locMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
}
if (!glbMemBusRdy || !locMemBusRdy) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
}
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
@@ -540,7 +540,7 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
gpu_dyn_inst->disassemble());
return false;
}
dispNrdyStalls[SCH_RDY]++;
stats.dispNrdyStalls[SCH_RDY]++;
return true;
}
@@ -584,10 +584,10 @@ ScheduleStage::fillDispatchList()
} else {
// Either another wave has been dispatched, or this wave
// was not ready, so it is stalled this cycle
schIter->first->wavefront()->schStalls++;
schIter->first->wavefront()->stats.schStalls++;
if (!dispRdy) {
// not ready for dispatch, increment stall stat
schIter->first->wavefront()->schResourceStalls++;
schIter->first->wavefront()->stats.schResourceStalls++;
}
// Examine next wave for this resource
schIter++;
@@ -601,9 +601,9 @@ ScheduleStage::fillDispatchList()
// Increment stall count if no wave sent to dispatchList for
// current execution resource
if (!dispatched) {
schListToDispListStalls[j]++;
stats.schListToDispListStalls[j]++;
} else {
schListToDispList[j]++;
stats.schListToDispList[j]++;
}
}
}
@@ -635,9 +635,9 @@ ScheduleStage::arbitrateVrfToLdsBus()
reinsertToSchList(wf->localMem, toExecute
.readyInst(wf->localMem));
// Increment stall stats for LDS-VRF arbitration
ldsBusArbStalls++;
stats.ldsBusArbStalls++;
toExecute.readyInst(wf->localMem)
->wavefront()->schLdsArbStalls++;
->wavefront()->stats.schLdsArbStalls++;
}
// With arbitration of LM pipe complete, transition the
// LM pipe to SKIP state in the dispatchList to inform EX stage
@@ -663,7 +663,7 @@ ScheduleStage::checkRfOperandReadComplete()
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
wf->schCycles++;
wf->stats.schCycles++;
bool vrfRdy = true;
if (!gpu_dyn_inst->isScalar()) {
@@ -690,15 +690,15 @@ ScheduleStage::checkRfOperandReadComplete()
p.second = RFBUSY;
// Increment stall stats
wf->schStalls++;
wf->schOpdNrdyStalls++;
wf->stats.schStalls++;
wf->stats.schOpdNrdyStalls++;
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
}
if (!srfRdy) {
opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
}
}
}
@@ -777,60 +777,40 @@ ScheduleStage::deleteFromSch(Wavefront *w)
wavesInSch.erase(w->wfDynId);
}
void
ScheduleStage::regStats()
ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent,
int num_exec_units)
: Stats::Group(parent, "ScheduleStage"),
ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
"execution resource"),
ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
"list per execution resource"),
ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
"schList per execution resource when ready list is not empty"),
ADD_STAT(schListToDispList, "number of cycles a wave is added to "
"dispatchList per execution resource"),
ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
" dispatchList per execution resource"),
ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
"conflicts"),
ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
"ready"),
ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
"ready")
{
rdyListNotEmpty
.init(computeUnit.numExeUnits())
.name(name() + ".rdy_list_not_empty")
.desc("number of cycles one or more wave on ready list per "
"execution resource")
;
rdyListNotEmpty.init(num_exec_units);
rdyListEmpty.init(num_exec_units);
addToSchListStalls.init(num_exec_units);
schListToDispList.init(num_exec_units);
schListToDispListStalls.init(num_exec_units);
opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
rdyListEmpty
.init(computeUnit.numExeUnits())
.name(name() + ".rdy_list_empty")
.desc("number of cycles no wave on ready list per "
"execution resource")
;
addToSchListStalls
.init(computeUnit.numExeUnits())
.name(name() + ".sch_list_add_stalls")
.desc("number of cycles a wave is not added to schList per "
"execution resource when ready list is not empty")
;
schListToDispList
.init(computeUnit.numExeUnits())
.name(name() + ".sch_list_to_disp_list")
.desc("number of cycles a wave is added to dispatchList per "
"execution resource")
;
schListToDispListStalls
.init(computeUnit.numExeUnits())
.name(name() + ".sch_list_to_disp_list_stalls")
.desc("number of cycles no wave is added to dispatchList per "
"execution resource")
;
// Operand Readiness Stall Cycles
opdNrdyStalls
.init(SCH_RF_OPD_NRDY_CONDITIONS)
.name(name() + ".opd_nrdy_stalls")
.desc("number of stalls in SCH due to operands not ready")
;
opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
// dispatchReady Stall Cycles
dispNrdyStalls
.init(SCH_NRDY_CONDITIONS)
.name(name() + ".disp_nrdy_stalls")
.desc("number of stalls in SCH due to resource not ready")
;
dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
@@ -862,21 +842,9 @@ ScheduleStage::regStats()
csprintf("FlatMemFIFO"));
dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
// RF Access Stall Cycles
rfAccessStalls
.init(SCH_RF_ACCESS_NRDY_CONDITIONS)
.name(name() + ".rf_access_stalls")
.desc("number of stalls due to RF access denied")
;
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
// Stall cycles due to wave losing LDS bus arbitration
ldsBusArbStalls
.name(name() + ".lds_bus_arb_stalls")
.desc("number of stalls due to VRF->LDS bus conflicts")
;
}

View File

@@ -40,6 +40,8 @@
#include <utility>
#include <vector>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/scheduler.hh"
@@ -105,8 +107,6 @@ class ScheduleStage
SCH_RF_ACCESS_NRDY_CONDITIONS
};
void regStats();
// Called by ExecStage to inform SCH of instruction execution
void deleteFromSch(Wavefront *w);
@@ -126,48 +126,6 @@ class ScheduleStage
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
// Stats
// Number of cycles with empty (or not empty) readyList, per execution
// resource, when the CU is active (not sleeping)
Stats::Vector rdyListEmpty;
Stats::Vector rdyListNotEmpty;
// Number of cycles, per execution resource, when at least one wave
// was on the readyList and picked by scheduler, but was unable to be
// added to the schList, when the CU is active (not sleeping)
Stats::Vector addToSchListStalls;
// Number of cycles, per execution resource, when a wave is selected
// as candidate for dispatchList from schList
// Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
Stats::Vector schListToDispList;
// Per execution resource stat, incremented once per cycle if no wave
// was selected as candidate for dispatch and moved to dispatchList
Stats::Vector schListToDispListStalls;
// Number of times a wave is selected by the scheduler but cannot
// be added to the schList due to register files not being able to
// support reads or writes of operands. RF_ACCESS_NRDY condition is always
// incremented if at least one read/write not supported, other
// conditions are incremented independently from each other.
Stats::Vector rfAccessStalls;
// Number of times a wave is executing FLAT instruction and
// forces another wave occupying its required local memory resource
// to be deselected for execution, and placed back on schList
Stats::Scalar ldsBusArbStalls;
// Count of times VRF and/or SRF blocks waves on schList from
// performing RFBUSY->RFREADY transition
Stats::Vector opdNrdyStalls;
// Count of times resource required for dispatch is not ready and
// blocks wave in RFREADY state on schList from potentially moving
// to dispatchList
Stats::Vector dispNrdyStalls;
const std::string _name;
// called by exec() to add a wave to schList if the RFs can support it
@@ -221,6 +179,52 @@ class ScheduleStage
// the VRF/SRF availability or limits imposed by parameters (to be added)
// of the SCH stage or CU.
std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
protected:
struct ScheduleStageStats : public Stats::Group
{
    ScheduleStageStats(Stats::Group *parent, int num_exec_units);

    // Per-execution-resource cycle counts, taken while the CU is active
    // (not sleeping): cycles in which the readyList was empty, and
    // cycles in which it was not.
    Stats::Vector rdyListEmpty;
    Stats::Vector rdyListNotEmpty;

    // Per-execution-resource cycles, while the CU is active (not
    // sleeping), in which the scheduler picked a wave from a non-empty
    // readyList but that wave could not be added to the schList.
    Stats::Vector addToSchListStalls;

    // Per-execution-resource cycles in which a wave from the schList was
    // chosen as the dispatchList candidate.
    // Note: the wave may still be arbitrated off the dispatchList
    // (e.g., LDS arbitration).
    Stats::Vector schListToDispList;

    // Per-execution-resource stat, bumped once per cycle when no wave
    // was chosen as a dispatch candidate and moved to dispatchList.
    Stats::Vector schListToDispListStalls;

    // How often a scheduler-selected wave could not join the schList
    // because the register files could not support its operand reads or
    // writes. The RF_ACCESS_NRDY condition is incremented whenever at
    // least one read/write is not supported; the other conditions are
    // incremented independently of one another.
    Stats::Vector rfAccessStalls;

    // How often a wave executing a FLAT instruction forced another wave,
    // which occupied the local memory resource it required, to be
    // deselected for execution and placed back on the schList.
    Stats::Scalar ldsBusArbStalls;

    // How often the VRF and/or SRF kept waves on the schList from making
    // the RFBUSY->RFREADY transition.
    Stats::Vector opdNrdyStalls;

    // How often a resource required for dispatch was not ready, blocking
    // a wave in RFREADY state on the schList from potentially moving to
    // the dispatchList.
    Stats::Vector dispNrdyStalls;
} stats;
};
#endif // __SCHEDULE_STAGE_HH__

View File

@@ -49,7 +49,7 @@ ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams &p,
ScoreboardCheckToSchedule
&to_schedule)
: computeUnit(cu), toSchedule(to_schedule),
_name(cu.name() + ".ScoreboardCheckStage")
_name(cu.name() + ".ScoreboardCheckStage"), stats(&cu)
{
}
@@ -62,7 +62,7 @@ ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
{
panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
"Instruction ready status %d is illegal!!!", rdyStatus);
stallCycles[rdyStatus]++;
stats.stallCycles[rdyStatus]++;
}
// Return true if this wavefront is ready
@@ -266,14 +266,13 @@ ScoreboardCheckStage::exec()
}
}
void
ScoreboardCheckStage::regStats()
ScoreboardCheckStage::
ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent)
: Stats::Group(parent, "ScoreboardCheckStage"),
ADD_STAT(stallCycles, "number of cycles wave stalled in SCB")
{
stallCycles
.init(NRDY_CONDITIONS)
.name(name() + ".stall_cycles")
.desc("number of cycles wave stalled in SCB")
;
stallCycles.init(NRDY_CONDITIONS);
stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));

View File

@@ -40,7 +40,8 @@
#include <utility>
#include <vector>
#include "sim/stats.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
class ComputeUnit;
class ScoreboardCheckToSchedule;
@@ -78,7 +79,6 @@ class ScoreboardCheckStage
// Stats related variables and methods
// Returns a const reference to the stored _name string.
const std::string& name() const { return _name; }
void regStats();
private:
void collectStatistics(nonrdytype_e rdyStatus);
@@ -94,10 +94,15 @@ class ScoreboardCheckStage
*/
ScoreboardCheckToSchedule &toSchedule;
// Stats
Stats::Vector stallCycles;
const std::string _name;
protected:
struct ScoreboardCheckStageStats : public Stats::Group
{
    ScoreboardCheckStageStats(Stats::Group *parent);

    // Number of cycles a wave stalled in SCB.
    Stats::Vector stallCycles;
} stats;
};
#endif // __SCOREBOARD_CHECK_STAGE_HH__

View File

@@ -65,7 +65,8 @@ Shader::Shader(const Params &p) : ClockedObject(p),
globalMemSize(p.globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
_dispatcher(*p.dispatcher),
max_valu_insts(p.max_valu_insts), total_valu_insts(0)
max_valu_insts(p.max_valu_insts), total_valu_insts(0),
stats(this, p.CUs[0]->wfSize())
{
gpuCmdProc.setShader(this);
_dispatcher.setShader(this);
@@ -278,86 +279,6 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
return scheduledSomething;
}
void
Shader::regStats()
{
ClockedObject::regStats();
shaderActiveTicks
.name(name() + ".shader_active_ticks")
.desc("Total ticks that any CU attached to this shader is active")
;
allLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".allLatencyDist")
.desc("delay distribution for all")
.flags(Stats::pdf | Stats::oneline);
loadLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".loadLatencyDist")
.desc("delay distribution for loads")
.flags(Stats::pdf | Stats::oneline);
storeLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".storeLatencyDist")
.desc("delay distribution for stores")
.flags(Stats::pdf | Stats::oneline);
vectorInstSrcOperand
.init(4)
.name(name() + ".vec_inst_src_operand")
.desc("vector instruction source operand distribution");
vectorInstDstOperand
.init(4)
.name(name() + ".vec_inst_dst_operand")
.desc("vector instruction destination operand distribution");
initToCoalesceLatency
.init(0, 1600000, 10000)
.name(name() + ".initToCoalesceLatency")
.desc("Ticks from vmem inst initiateAcc to coalescer issue")
.flags(Stats::pdf | Stats::oneline);
rubyNetworkLatency
.init(0, 1600000, 10000)
.name(name() + ".rubyNetworkLatency")
.desc("Ticks from coalescer issue to coalescer hit callback")
.flags(Stats::pdf | Stats::oneline);
gmEnqueueLatency
.init(0, 1600000, 10000)
.name(name() + ".gmEnqueueLatency")
.desc("Ticks from coalescer hit callback to GM pipe enqueue")
.flags(Stats::pdf | Stats::oneline);
gmToCompleteLatency
.init(0, 1600000, 10000)
.name(name() + ".gmToCompleteLatency")
.desc("Ticks queued in GM pipes ordered response buffer")
.flags(Stats::pdf | Stats::oneline);
coalsrLineAddresses
.init(0, 20, 1)
.name(name() + ".coalsrLineAddresses")
.desc("Number of cache lines for coalesced request")
.flags(Stats::pdf | Stats::oneline);
int wfSize = cuList[0]->wfSize();
cacheBlockRoundTrip = new Stats::Distribution[wfSize];
for (int idx = 0; idx < wfSize; ++idx) {
std::stringstream namestr;
ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
cacheBlockRoundTrip[idx]
.init(0, 1600000, 10000)
.name(namestr.str())
.desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
.flags(Stats::pdf | Stats::oneline);
}
}
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id)
@@ -528,8 +449,8 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
void
Shader::sampleStore(const Tick accessTime)
{
storeLatencyDist.sample(accessTime);
allLatencyDist.sample(accessTime);
stats.storeLatencyDist.sample(accessTime);
stats.allLatencyDist.sample(accessTime);
}
/*
@@ -538,8 +459,8 @@ Shader::sampleStore(const Tick accessTime)
void
Shader::sampleLoad(const Tick accessTime)
{
loadLatencyDist.sample(accessTime);
allLatencyDist.sample(accessTime);
stats.loadLatencyDist.sample(accessTime);
stats.allLatencyDist.sample(accessTime);
}
void
@@ -556,16 +477,16 @@ Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
Tick t4 = roundTripTime[3];
Tick t5 = roundTripTime[4];
initToCoalesceLatency.sample(t2-t1);
rubyNetworkLatency.sample(t3-t2);
gmEnqueueLatency.sample(t4-t3);
gmToCompleteLatency.sample(t5-t4);
stats.initToCoalesceLatency.sample(t2-t1);
stats.rubyNetworkLatency.sample(t3-t2);
stats.gmEnqueueLatency.sample(t4-t3);
stats.gmToCompleteLatency.sample(t5-t4);
}
void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
{
coalsrLineAddresses.sample(lineMap.size());
stats.coalsrLineAddresses.sample(lineMap.size());
std::vector<Tick> netTimes;
// For each cache block address generated by a vmem inst, calculate
@@ -586,7 +507,7 @@ Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
// Nth distribution.
int idx = 0;
for (auto& time : netTimes) {
cacheBlockRoundTrip[idx].sample(time);
stats.cacheBlockRoundTrip[idx].sample(time);
++idx;
}
}
@@ -598,5 +519,75 @@ Shader::notifyCuSleep() {
"Invalid activeCu size\n");
_activeCus--;
if (!_activeCus)
shaderActiveTicks += curTick() - _lastInactiveTick;
stats.shaderActiveTicks += curTick() - _lastInactiveTick;
}
/**
 * Build the Shader statistics group.
 *
 * @param parent  Stats group this group is attached to. The Shader
 *                constructor passes its own `this`, and the body relies on
 *                that when it casts parent back to Shader* below.
 * @param wf_size Number of per-cache-block round-trip distributions to
 *                allocate (the Shader constructor passes the wavefront size
 *                of its first CU).
 */
Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
: Stats::Group(parent),
ADD_STAT(allLatencyDist, "delay distribution for all"),
ADD_STAT(loadLatencyDist, "delay distribution for loads"),
ADD_STAT(storeLatencyDist, "delay distribution for stores"),
ADD_STAT(initToCoalesceLatency,
"Ticks from vmem inst initiateAcc to coalescer issue"),
ADD_STAT(rubyNetworkLatency,
"Ticks from coalescer issue to coalescer hit callback"),
ADD_STAT(gmEnqueueLatency,
"Ticks from coalescer hit callback to GM pipe enqueue"),
ADD_STAT(gmToCompleteLatency,
"Ticks queued in GM pipes ordered response buffer"),
ADD_STAT(coalsrLineAddresses,
"Number of cache lines for coalesced request"),
ADD_STAT(shaderActiveTicks,
"Total ticks that any CU attached to this shader is active"),
ADD_STAT(vectorInstSrcOperand,
"vector instruction source operand distribution"),
ADD_STAT(vectorInstDstOperand,
"vector instruction destination operand distribution")
{
// All tick-latency distributions share the same init arguments
// (0, 1600000, 10000) -- presumably min, max and bucket size; confirm
// against Stats::Distribution::init -- and are printed as one-line PDFs.
allLatencyDist
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
loadLatencyDist
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
storeLatencyDist
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
initToCoalesceLatency
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
rubyNetworkLatency
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
gmEnqueueLatency
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
gmToCompleteLatency
.init(0, 1600000, 10000)
.flags(Stats::pdf | Stats::oneline);
// Cache-line count per coalesced request uses different init arguments
// (0, 20, 1) from the tick distributions above.
coalsrLineAddresses
.init(0, 20, 1)
.flags(Stats::pdf | Stats::oneline);
// Operand-count vectors have 4 entries each; they are indexed by the
// operand count passed to incVectorInst{Src,Dst}Operand().
vectorInstSrcOperand.init(4);
vectorInstDstOperand.init(4);
// The per-cache-block distributions are not declared through ADD_STAT:
// they are default-constructed in a heap array, so each one gets its
// name and description set explicitly below.
// NOTE(review): no matching delete[] is visible in this chunk -- confirm
// who owns this array.
cacheBlockRoundTrip = new Stats::Distribution[wf_size];
for (int idx = 0; idx < wf_size; ++idx) {
std::stringstream namestr;
// The name is built from the owning Shader's name; the cast assumes
// parent really is a Shader (true for the only construction site shown).
ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
static_cast<Shader*>(parent)->name(), idx);
cacheBlockRoundTrip[idx]
.init(0, 1600000, 10000)
.name(namestr.str())
.desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
.flags(Stats::pdf | Stats::oneline);
}
}

View File

@@ -40,6 +40,8 @@
#include <string>
#include "arch/isa.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
@@ -98,26 +100,6 @@ class Shader : public ClockedObject
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
// some stats for measuring latency
Stats::Distribution allLatencyDist;
Stats::Distribution loadLatencyDist;
Stats::Distribution storeLatencyDist;
// average ticks from vmem inst initiateAcc to coalescer issue,
// average ticks from coalescer issue to coalescer hit callback,
// average ticks from coalescer hit callback to GM pipe enqueue,
// and average ticks spent in GM pipe's ordered resp buffer.
Stats::Distribution initToCoalesceLatency;
Stats::Distribution rubyNetworkLatency;
Stats::Distribution gmEnqueueLatency;
Stats::Distribution gmToCompleteLatency;
// average number of cache blocks requested by vmem inst, and
// average ticks for cache blocks to main memory for the Nth
// cache block generated by a vmem inst.
Stats::Distribution coalsrLineAddresses;
Stats::Distribution *cacheBlockRoundTrip;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -249,14 +231,6 @@ class Shader : public ClockedObject
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
/**
* Statistics
*/
Stats::Scalar shaderActiveTicks;
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
void regStats();
int64_t max_valu_insts;
int64_t total_valu_insts;
@@ -301,6 +275,52 @@ class Shader : public ClockedObject
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void notifyCuSleep();
// Count one vector instruction that has num_operands source vector
// operands. num_operands is used directly as the index into
// stats.vectorInstSrcOperand, which the ShaderStats constructor sizes to 4
// entries; no range check is performed here.
void
incVectorInstSrcOperand(int num_operands)
{
stats.vectorInstSrcOperand[num_operands]++;
}
// Count one vector instruction that has num_operands destination vector
// operands. num_operands is used directly as the index into
// stats.vectorInstDstOperand, which the ShaderStats constructor sizes to 4
// entries; no range check is performed here.
void
incVectorInstDstOperand(int num_operands)
{
stats.vectorInstDstOperand[num_operands]++;
}
protected:
// Statistics group for the Shader. All members are registered and
// initialized in the ShaderStats constructor.
struct ShaderStats : public Stats::Group
{
ShaderStats(Stats::Group *parent, int wf_size);
// latency distributions sampled by sampleLoad()/sampleStore(): loads
// only, stores only, and both combined
Stats::Distribution allLatencyDist;
Stats::Distribution loadLatencyDist;
Stats::Distribution storeLatencyDist;
// distribution of ticks from vmem inst initiateAcc to coalescer issue
Stats::Distribution initToCoalesceLatency;
// distribution of ticks from coalescer issue to coalescer hit callback
Stats::Distribution rubyNetworkLatency;
// distribution of ticks from coalescer hit callback to GM pipe enqueue
Stats::Distribution gmEnqueueLatency;
// distribution of ticks spent in GM pipe's ordered resp buffer
Stats::Distribution gmToCompleteLatency;
// distribution of the number of cache lines per coalesced request
Stats::Distribution coalsrLineAddresses;
// round-trip ticks for the Nth cache block generated by a vmem inst;
// array of wf_size distributions allocated with new[] in the
// constructor and indexed by N
Stats::Distribution *cacheBlockRoundTrip;
// total ticks that any CU attached to this shader is active
Stats::Scalar shaderActiveTicks;
// per-operand-count tallies of vector instruction source and
// destination operands (4 entries each)
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
} stats;
};
#endif // __SHADER_HH__

View File

@@ -180,8 +180,3 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
w->reservedScalarRegs = 0;
w->startSgprIndex = 0;
}
void
StaticRegisterManagerPolicy::regStats()
{
}

View File

@@ -58,8 +58,6 @@ class StaticRegisterManagerPolicy : public RegisterManagerPolicy
int scalarDemand) override;
void freeRegisters(Wavefront *w) override;
void regStats() override;
};
#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__

View File

@@ -50,7 +50,8 @@ TLBCoalescer::TLBCoalescer(const Params &p)
false, Event::CPU_Tick_Pri),
cleanupEvent([this]{ processCleanupEvent(); },
"Cleanup issuedTranslationsTable hashmap",
false, Event::Maximum_Pri)
false, Event::Maximum_Pri),
stats(this)
{
// create the response ports based on the number of connected ports
for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
@@ -256,11 +257,11 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
sender_state->reqCnt.push_back(req_cnt);
// update statistics
coalescer->uncoalescedAccesses++;
coalescer->stats.uncoalescedAccesses++;
req_cnt = sender_state->reqCnt.back();
DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
coalescer->queuingCycles -= (curTick() * req_cnt);
coalescer->localqueuingCycles -= curTick();
coalescer->stats.queuingCycles -= (curTick() * req_cnt);
coalescer->stats.localqueuingCycles -= curTick();
}
// FIXME if you want to coalesce not based on the issueTime
@@ -302,7 +303,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
// and make necessary allocations.
if (!coalescedReq_cnt || !didCoalesce) {
if (update_stats)
coalescer->coalescedAccesses++;
coalescer->stats.coalescedAccesses++;
std::vector<PacketPtr> new_array;
new_array.push_back(pkt);
@@ -339,7 +340,7 @@ TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
bool update_stats = !sender_state->prefetch;
if (update_stats)
coalescer->uncoalescedAccesses++;
coalescer->stats.uncoalescedAccesses++;
// If there is a pending timing request for this virtual address
// print a warning message. This is a temporary caveat of
@@ -467,7 +468,7 @@ TLBCoalescer::processProbeTLBEvent()
// by the one we just sent counting all the way from
// the top of TLB hiearchy (i.e., from the CU)
int req_cnt = tmp_sender_state->reqCnt.back();
queuingCycles += (curTick() * req_cnt);
stats.queuingCycles += (curTick() * req_cnt);
DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
name(), req_cnt);
@@ -475,7 +476,7 @@ TLBCoalescer::processProbeTLBEvent()
// pkt_cnt is number of packets we coalesced into the one
// we just sent but only at this coalescer level
int pkt_cnt = iter->second[vector_index].size();
localqueuingCycles += (curTick() * pkt_cnt);
stats.localqueuingCycles += (curTick() * pkt_cnt);
}
DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
@@ -520,35 +521,14 @@ TLBCoalescer::processCleanupEvent()
}
}
void
TLBCoalescer::regStats()
TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
: Stats::Group(parent),
ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
ADD_STAT(localqueuingCycles,
"Number of cycles spent in queue for all incoming reqs"),
ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
{
ClockedObject::regStats();
uncoalescedAccesses
.name(name() + ".uncoalesced_accesses")
.desc("Number of uncoalesced TLB accesses")
;
coalescedAccesses
.name(name() + ".coalesced_accesses")
.desc("Number of coalesced TLB accesses")
;
queuingCycles
.name(name() + ".queuing_cycles")
.desc("Number of cycles spent in queue")
;
localqueuingCycles
.name(name() + ".local_queuing_cycles")
.desc("Number of cycles spent in queue for all incoming reqs")
;
localLatency
.name(name() + ".local_latency")
.desc("Avg. latency over all incoming pkts")
;
localLatency = localqueuingCycles / uncoalescedAccesses;
}

View File

@@ -115,26 +115,8 @@ class TLBCoalescer : public ClockedObject
CoalescingTable issuedTranslationsTable;
// number of packets the coalescer receives
Stats::Scalar uncoalescedAccesses;
// number packets the coalescer send to the TLB
Stats::Scalar coalescedAccesses;
// Number of cycles the coalesced requests spend waiting in
// coalescerFIFO. For each packet the coalescer receives we take into
// account the number of all uncoalesced requests this pkt "represents"
Stats::Scalar queuingCycles;
// On average how much time a request from the
// uncoalescedAccesses that reaches the TLB
// spends waiting?
Stats::Scalar localqueuingCycles;
// localqueuingCycles/uncoalescedAccesses
Stats::Formula localLatency;
bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
void updatePhysAddresses(PacketPtr pkt);
void regStats() override;
class CpuSidePort : public ResponsePort
{
@@ -211,6 +193,29 @@ class TLBCoalescer : public ClockedObject
// this FIFO queue keeps track of the virt. page
// addresses that are pending cleanup
std::queue<Addr> cleanupQueue;
protected:
// Statistics group for the TLB coalescer. Members are registered in the
// TLBCoalescerStats constructor.
struct TLBCoalescerStats : public Stats::Group
{
TLBCoalescerStats(Stats::Group *parent);
// number of packets the coalescer receives
Stats::Scalar uncoalescedAccesses;
// number of packets the coalescer sends to the TLB
Stats::Scalar coalescedAccesses;
// Number of cycles the coalesced requests spend waiting in
// coalescerFIFO. For each packet the coalescer receives we take into
// account the number of all uncoalesced requests this pkt "represents"
Stats::Scalar queuingCycles;
// Total time spent waiting in the queue by the incoming
// (uncoalesced) requests at this coalescer level; this is the
// numerator of localLatency
Stats::Scalar localqueuingCycles;
// average latency over all incoming packets:
// localqueuingCycles / uncoalescedAccesses
Stats::Formula localLatency;
} stats;
};
#endif // __TLB_COALESCER_HH__

View File

@@ -69,11 +69,11 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
->mapVgpr(w, vgprIdx + j);
if (regBusy(pVgpr)) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
w->stats.numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pVgpr);
w->numTimesBlockedDueRAWDependencies++;
w->stats.numTimesBlockedDueRAWDependencies++;
}
return false;
}
@@ -125,13 +125,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
// increment count of number of DWORDs read from VRF
int DWORDs = ii->numSrcVecDWORDs();
registerReads += (DWORDs * w->execMask().count());
stats.registerReads += (DWORDs * w->execMask().count());
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramReads += DWORDs;
stats.sramReads += DWORDs;
}
mask = mask >> 4;
}
@@ -163,13 +163,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
// increment count of number of DWORDs written to VRF
DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * w->execMask().count());
stats.registerWrites += (DWORDs * w->execMask().count());
mask = w->execMask().to_ullong();
srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
stats.sramWrites += DWORDs;
}
mask = mask >> 4;
}
@@ -196,13 +196,13 @@ VectorRegisterFile::scheduleWriteOperandsFromLoad(
}
// increment count of number of DWORDs written to VRF
int DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * ii->exec_mask.count());
stats.registerWrites += (DWORDs * ii->exec_mask.count());
uint64_t mask = ii->exec_mask.to_ullong();
int srams = ii->exec_mask.size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
stats.sramWrites += DWORDs;
}
mask = mask >> 4;
}

View File

@@ -49,7 +49,7 @@ Wavefront::Wavefront(const Params &p)
maxIbSize(p.max_ib_size), _gpuISA(*this),
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
barId(WFBarrier::InvalidID)
barId(WFBarrier::InvalidID), stats(this)
{
lastTrace = 0;
execUnitId = -1;
@@ -97,75 +97,6 @@ Wavefront::Wavefront(const Params &p)
vecReads.clear();
}
void
Wavefront::regStats()
{
SimObject::regStats();
// FIXME: the name of the WF needs to be unique
numTimesBlockedDueWAXDependencies
.name(name() + ".timesBlockedDueWAXDependencies")
.desc("number of times the wf's instructions are blocked due to WAW "
"or WAR dependencies")
;
// FIXME: the name of the WF needs to be unique
numTimesBlockedDueRAWDependencies
.name(name() + ".timesBlockedDueRAWDependencies")
.desc("number of times the wf's instructions are blocked due to RAW "
"dependencies")
;
numInstrExecuted
.name(name() + ".num_instr_executed")
.desc("number of instructions executed by this WF slot")
;
schCycles
.name(name() + ".sch_cycles")
.desc("number of cycles spent in schedule stage")
;
schStalls
.name(name() + ".sch_stalls")
.desc("number of cycles WF is stalled in SCH stage")
;
schRfAccessStalls
.name(name() + ".sch_rf_access_stalls")
.desc("number of cycles wave selected in SCH but RF denied adding "
"instruction")
;
schResourceStalls
.name(name() + ".sch_resource_stalls")
.desc("number of cycles stalled in sch by resource not available")
;
schOpdNrdyStalls
.name(name() + ".sch_opd_nrdy_stalls")
.desc("number of cycles stalled in sch waiting for RF reads to "
"complete")
;
schLdsArbStalls
.name(name() + ".sch_lds_arb_stalls")
.desc("number of cycles wave stalled due to LDS-VRF arbitration")
;
vecRawDistance
.init(0,20,1)
.name(name() + ".vec_raw_distance")
.desc("Count of RAW distance in dynamic instructions for this WF")
;
readsPerWrite
.init(0,4,1)
.name(name() + ".vec_reads_per_write")
.desc("Count of Vector reads per write for this WF")
;
}
void
Wavefront::init()
{
@@ -959,17 +890,19 @@ Wavefront::exec()
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
computeUnit->numInstrExecuted++;
numInstrExecuted++;
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands());
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands());
computeUnit->stats.numInstrExecuted++;
stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
computeUnit->stats.execRateDist.sample(
computeUnit->stats.totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] =
computeUnit->stats.totalCycles.value();
if (lastInstExec) {
computeUnit->instInterleave[simdId].
computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
@@ -987,8 +920,8 @@ Wavefront::exec()
if (ii->isSrcOperand(i)) {
// This check should never fail, but to be safe we check
if (rawDist.find(vgpr+n) != rawDist.end()) {
vecRawDistance.
sample(numInstrExecuted.value() - rawDist[vgpr+n]);
stats.vecRawDistance.sample(
stats.numInstrExecuted.value() - rawDist[vgpr+n]);
}
// increment number of reads to this register
vecReads[vgpr+n]++;
@@ -997,12 +930,12 @@ Wavefront::exec()
// for the first write to each physical register
if (rawDist.find(vgpr+n) != rawDist.end()) {
// sample the number of reads that were performed
readsPerWrite.sample(vecReads[vgpr+n]);
stats.readsPerWrite.sample(vecReads[vgpr+n]);
}
// on a write, reset count of reads to 0
vecReads[vgpr+n] = 0;
rawDist[vgpr+n] = numInstrExecuted.value();
rawDist[vgpr+n] = stats.numInstrExecuted.value();
}
}
}
@@ -1023,26 +956,29 @@ Wavefront::exec()
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->numVecOpsExecuted += num_active_lanes;
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
computeUnit->numVecOpsExecutedF16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
@@ -1050,18 +986,21 @@ Wavefront::exec()
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
computeUnit->numVecOpsExecutedF32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
@@ -1069,24 +1008,29 @@ Wavefront::exec()
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
computeUnit->numVecOpsExecutedF64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
num_active_lanes);
}
}
@@ -1133,14 +1077,14 @@ Wavefront::exec()
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesVMemPerSimd[simdId] +=
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesScMemPerSimd[simdId] +=
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
@@ -1150,14 +1094,14 @@ Wavefront::exec()
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesVMemPerSimd[simdId] +=
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesScMemPerSimd[simdId] +=
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
@@ -1167,14 +1111,14 @@ Wavefront::exec()
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesVMemPerSimd[simdId] +=
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesScMemPerSimd[simdId] +=
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
@@ -1183,7 +1127,7 @@ Wavefront::exec()
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesLdsPerSimd[simdId] +=
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
@@ -1191,7 +1135,7 @@ Wavefront::exec()
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesLdsPerSimd[simdId] +=
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
@@ -1200,7 +1144,7 @@ Wavefront::exec()
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->instCyclesLdsPerSimd[simdId] +=
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
@@ -1453,3 +1397,31 @@ Wavefront::releaseBarrier()
{
barId = WFBarrier::InvalidID;
}
/**
 * Build the per-wavefront-slot statistics group.
 *
 * @param parent Stats group this group is attached to; the Wavefront
 *               constructor passes its own `this`.
 */
Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
: Stats::Group(parent),
ADD_STAT(numInstrExecuted,
"number of instructions executed by this WF slot"),
ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
"RF denied adding instruction"),
ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
" not available"),
ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
"RF reads to complete"),
ADD_STAT(schLdsArbStalls,
"number of cycles wave stalled due to LDS-VRF arbitration"),
// FIXME: the name of the WF needs to be unique
ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
"instructions are blocked due to WAW or WAR dependencies"),
// FIXME: the name of the WF needs to be unique
ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
"instructions are blocked due to RAW dependencies"),
ADD_STAT(vecRawDistance,
"Count of RAW distance in dynamic instructions for this WF"),
ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
// Only the two distributions need explicit initialization; the scalar
// stats above are fully set up by ADD_STAT.
// init arguments are presumably (min, max, bucket size) -- confirm
// against Stats::Distribution::init.
vecRawDistance.init(0, 20, 1);
readsPerWrite.init(0, 4, 1);
}

View File

@@ -43,6 +43,8 @@
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/compute_unit.hh"
@@ -217,52 +219,13 @@ class Wavefront : public SimObject
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
// Wavefront slot stats
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encounterd by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
@@ -289,7 +252,6 @@ class Wavefront : public SimObject
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
Addr pc() const;
void pc(Addr new_pc);
@@ -357,6 +319,52 @@ class Wavefront : public SimObject
Addr _pc;
VectorMask _execMask;
int barId;
public:
// Statistics group for a wavefront slot. Members are registered in the
// WavefrontStats constructor.
struct WavefrontStats : public Stats::Group
{
WavefrontStats(Stats::Group *parent);
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encountered by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support
// instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and
// forces this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to RAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
} stats;
};
#endif // __GPU_COMPUTE_WAVEFRONT_HH__