arch-gcn3,gpu-compute: Update stats style for GPU
Convert all gpu-compute stats to Stats::Group style. Change-Id: I29116f1de53ae379210c6cfb5bed3fc74f50cca5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39135 Reviewed-by: Matthew Poremba <matthew.poremba@amd.com> Maintainer: Matthew Poremba <matthew.poremba@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
@@ -3800,7 +3800,7 @@ namespace Gcn3ISA
|
||||
wf->computeUnit->cu_id, wf->wgId, refCount);
|
||||
|
||||
wf->computeUnit->registerManager->freeRegisters(wf);
|
||||
wf->computeUnit->completedWfs++;
|
||||
wf->computeUnit->stats.completedWfs++;
|
||||
wf->computeUnit->activeWaves--;
|
||||
|
||||
panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
|
||||
@@ -3811,7 +3811,7 @@ namespace Gcn3ISA
|
||||
|
||||
for (int i = 0; i < wf->vecReads.size(); i++) {
|
||||
if (wf->rawDist.find(i) != wf->rawDist.end()) {
|
||||
wf->readsPerWrite.sample(wf->vecReads.at(i));
|
||||
wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
|
||||
}
|
||||
}
|
||||
wf->vecReads.clear();
|
||||
@@ -3853,7 +3853,7 @@ namespace Gcn3ISA
|
||||
if (!kernelEnd || !relNeeded) {
|
||||
wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
|
||||
wf->setStatus(Wavefront::S_STOPPED);
|
||||
wf->computeUnit->completedWGs++;
|
||||
wf->computeUnit->stats.completedWGs++;
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -3877,7 +3877,7 @@ namespace Gcn3ISA
|
||||
// call shader to prepare the flush operations
|
||||
wf->computeUnit->shader->prepareFlush(gpuDynInst);
|
||||
|
||||
wf->computeUnit->completedWGs++;
|
||||
wf->computeUnit->stats.completedWGs++;
|
||||
} else {
|
||||
wf->computeUnit->shader->dispatcher().scheduleDispatch();
|
||||
}
|
||||
|
||||
@@ -106,7 +106,8 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
|
||||
_numBarrierSlots(p.num_barrier_slots),
|
||||
globalSeqNum(0), wavefrontSize(p.wf_size),
|
||||
scoreboardCheckToSchedule(p),
|
||||
scheduleToExecute(p)
|
||||
scheduleToExecute(p),
|
||||
stats(this, p.n_wf)
|
||||
{
|
||||
/**
|
||||
* This check is necessary because std::bitset only provides conversion
|
||||
@@ -367,7 +368,7 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
|
||||
w->initRegState(task, w->actualWgSzTotal);
|
||||
w->start(_n_wave++, task->codeAddr());
|
||||
|
||||
waveLevelParallelism.sample(activeWaves);
|
||||
stats.waveLevelParallelism.sample(activeWaves);
|
||||
activeWaves++;
|
||||
}
|
||||
|
||||
@@ -612,22 +613,22 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
|
||||
freeWfSlots, numMappedWfs, vregAvail, sregAvail);
|
||||
|
||||
if (!vregAvail) {
|
||||
++numTimesWgBlockedDueVgprAlloc;
|
||||
++stats.numTimesWgBlockedDueVgprAlloc;
|
||||
}
|
||||
|
||||
if (!sregAvail) {
|
||||
++numTimesWgBlockedDueSgprAlloc;
|
||||
++stats.numTimesWgBlockedDueSgprAlloc;
|
||||
}
|
||||
|
||||
// Return true if enough WF slots to submit workgroup and if there are
|
||||
// enough VGPRs to schedule all WFs to their SIMD units
|
||||
bool ldsAvail = lds.canReserve(task->ldsSize());
|
||||
if (!ldsAvail) {
|
||||
wgBlockedDueLdsAllocation++;
|
||||
stats.wgBlockedDueLdsAllocation++;
|
||||
}
|
||||
|
||||
if (!barrier_avail) {
|
||||
wgBlockedDueBarrierAllocation++;
|
||||
stats.wgBlockedDueBarrierAllocation++;
|
||||
}
|
||||
|
||||
// Return true if the following are all true:
|
||||
@@ -734,7 +735,7 @@ ComputeUnit::exec()
|
||||
scoreboardCheckStage.exec();
|
||||
fetchStage.exec();
|
||||
|
||||
totalCycles++;
|
||||
stats.totalCycles++;
|
||||
|
||||
// Put this CU to sleep if there is no more work to be done.
|
||||
if (!isDone()) {
|
||||
@@ -1032,8 +1033,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
|
||||
fatal("pkt is not a read nor a write\n");
|
||||
}
|
||||
|
||||
tlbCycles -= curTick();
|
||||
++tlbRequests;
|
||||
stats.tlbCycles -= curTick();
|
||||
++stats.tlbRequests;
|
||||
|
||||
PortID tlbPort_index = perLaneTLB ? index : 0;
|
||||
|
||||
@@ -1075,7 +1076,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
|
||||
// update the hitLevel distribution
|
||||
int hit_level = translation_state->hitLevel;
|
||||
assert(hit_level != -1);
|
||||
hitsPerTLBLevel[hit_level]++;
|
||||
stats.hitsPerTLBLevel[hit_level]++;
|
||||
|
||||
// New SenderState for the memory access
|
||||
X86ISA::GpuTLB::TranslationState *sender_state =
|
||||
@@ -1346,7 +1347,7 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
|
||||
// for the first cache block.
|
||||
if (compute_unit->headTailMap.count(gpuDynInst)) {
|
||||
Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
|
||||
compute_unit->headTailLatency.sample(curTick() - headTick);
|
||||
compute_unit->stats.headTailLatency.sample(curTick() - headTick);
|
||||
compute_unit->headTailMap.erase(gpuDynInst);
|
||||
}
|
||||
|
||||
@@ -1381,7 +1382,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
|
||||
pkt->req->getVaddr(), line);
|
||||
|
||||
assert(pkt->senderState);
|
||||
computeUnit->tlbCycles += curTick();
|
||||
computeUnit->stats.tlbCycles += curTick();
|
||||
|
||||
// pop off the TLB translation state
|
||||
X86ISA::GpuTLB::TranslationState *translation_state =
|
||||
@@ -1402,7 +1403,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
|
||||
|
||||
// update the hitLevel distribution
|
||||
int hit_level = translation_state->hitLevel;
|
||||
computeUnit->hitsPerTLBLevel[hit_level]++;
|
||||
computeUnit->stats.hitsPerTLBLevel[hit_level]++;
|
||||
|
||||
delete translation_state->tlbEntry;
|
||||
assert(!translation_state->ports.size());
|
||||
@@ -1788,561 +1789,17 @@ ComputeUnit::ITLBPort::recvReqRetry()
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::regStats()
|
||||
{
|
||||
ClockedObject::regStats();
|
||||
|
||||
vALUInsts
|
||||
.name(name() + ".valu_insts")
|
||||
.desc("Number of vector ALU insts issued.")
|
||||
;
|
||||
vALUInstsPerWF
|
||||
.name(name() + ".valu_insts_per_wf")
|
||||
.desc("The avg. number of vector ALU insts issued per-wavefront.")
|
||||
;
|
||||
sALUInsts
|
||||
.name(name() + ".salu_insts")
|
||||
.desc("Number of scalar ALU insts issued.")
|
||||
;
|
||||
sALUInstsPerWF
|
||||
.name(name() + ".salu_insts_per_wf")
|
||||
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
|
||||
;
|
||||
instCyclesVALU
|
||||
.name(name() + ".inst_cycles_valu")
|
||||
.desc("Number of cycles needed to execute VALU insts.")
|
||||
;
|
||||
instCyclesSALU
|
||||
.name(name() + ".inst_cycles_salu")
|
||||
.desc("Number of cycles needed to execute SALU insts.")
|
||||
;
|
||||
threadCyclesVALU
|
||||
.name(name() + ".thread_cycles_valu")
|
||||
.desc("Number of thread cycles used to execute vector ALU ops. "
|
||||
"Similar to instCyclesVALU but multiplied by the number of "
|
||||
"active threads.")
|
||||
;
|
||||
vALUUtilization
|
||||
.name(name() + ".valu_utilization")
|
||||
.desc("Percentage of active vector ALU threads in a wave.")
|
||||
;
|
||||
ldsNoFlatInsts
|
||||
.name(name() + ".lds_no_flat_insts")
|
||||
.desc("Number of LDS insts issued, not including FLAT "
|
||||
"accesses that resolve to LDS.")
|
||||
;
|
||||
ldsNoFlatInstsPerWF
|
||||
.name(name() + ".lds_no_flat_insts_per_wf")
|
||||
.desc("The avg. number of LDS insts (not including FLAT "
|
||||
"accesses that resolve to LDS) per-wavefront.")
|
||||
;
|
||||
flatVMemInsts
|
||||
.name(name() + ".flat_vmem_insts")
|
||||
.desc("The number of FLAT insts that resolve to vmem issued.")
|
||||
;
|
||||
flatVMemInstsPerWF
|
||||
.name(name() + ".flat_vmem_insts_per_wf")
|
||||
.desc("The average number of FLAT insts that resolve to vmem "
|
||||
"issued per-wavefront.")
|
||||
;
|
||||
flatLDSInsts
|
||||
.name(name() + ".flat_lds_insts")
|
||||
.desc("The number of FLAT insts that resolve to LDS issued.")
|
||||
;
|
||||
flatLDSInstsPerWF
|
||||
.name(name() + ".flat_lds_insts_per_wf")
|
||||
.desc("The average number of FLAT insts that resolve to LDS "
|
||||
"issued per-wavefront.")
|
||||
;
|
||||
vectorMemWrites
|
||||
.name(name() + ".vector_mem_writes")
|
||||
.desc("Number of vector mem write insts (excluding FLAT insts).")
|
||||
;
|
||||
vectorMemWritesPerWF
|
||||
.name(name() + ".vector_mem_writes_per_wf")
|
||||
.desc("The average number of vector mem write insts "
|
||||
"(excluding FLAT insts) per-wavefront.")
|
||||
;
|
||||
vectorMemReads
|
||||
.name(name() + ".vector_mem_reads")
|
||||
.desc("Number of vector mem read insts (excluding FLAT insts).")
|
||||
;
|
||||
vectorMemReadsPerWF
|
||||
.name(name() + ".vector_mem_reads_per_wf")
|
||||
.desc("The avg. number of vector mem read insts (excluding "
|
||||
"FLAT insts) per-wavefront.")
|
||||
;
|
||||
scalarMemWrites
|
||||
.name(name() + ".scalar_mem_writes")
|
||||
.desc("Number of scalar mem write insts.")
|
||||
;
|
||||
scalarMemWritesPerWF
|
||||
.name(name() + ".scalar_mem_writes_per_wf")
|
||||
.desc("The average number of scalar mem write insts per-wavefront.")
|
||||
;
|
||||
scalarMemReads
|
||||
.name(name() + ".scalar_mem_reads")
|
||||
.desc("Number of scalar mem read insts.")
|
||||
;
|
||||
scalarMemReadsPerWF
|
||||
.name(name() + ".scalar_mem_reads_per_wf")
|
||||
.desc("The average number of scalar mem read insts per-wavefront.")
|
||||
;
|
||||
|
||||
vALUInstsPerWF = vALUInsts / completedWfs;
|
||||
sALUInstsPerWF = sALUInsts / completedWfs;
|
||||
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
|
||||
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
|
||||
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
|
||||
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
|
||||
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
|
||||
vectorMemReadsPerWF = vectorMemReads / completedWfs;
|
||||
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
|
||||
scalarMemReadsPerWF = scalarMemReads / completedWfs;
|
||||
|
||||
vectorMemReadsPerKiloInst
|
||||
.name(name() + ".vector_mem_reads_per_kilo_inst")
|
||||
.desc("Number of vector mem reads per kilo-instruction")
|
||||
;
|
||||
vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
|
||||
vectorMemWritesPerKiloInst
|
||||
.name(name() + ".vector_mem_writes_per_kilo_inst")
|
||||
.desc("Number of vector mem writes per kilo-instruction")
|
||||
;
|
||||
vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
|
||||
vectorMemInstsPerKiloInst
|
||||
.name(name() + ".vector_mem_insts_per_kilo_inst")
|
||||
.desc("Number of vector mem insts per kilo-instruction")
|
||||
;
|
||||
vectorMemInstsPerKiloInst =
|
||||
((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
|
||||
scalarMemReadsPerKiloInst
|
||||
.name(name() + ".scalar_mem_reads_per_kilo_inst")
|
||||
.desc("Number of scalar mem reads per kilo-instruction")
|
||||
;
|
||||
scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
|
||||
scalarMemWritesPerKiloInst
|
||||
.name(name() + ".scalar_mem_writes_per_kilo_inst")
|
||||
.desc("Number of scalar mem writes per kilo-instruction")
|
||||
;
|
||||
scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
|
||||
scalarMemInstsPerKiloInst
|
||||
.name(name() + ".scalar_mem_insts_per_kilo_inst")
|
||||
.desc("Number of scalar mem insts per kilo-instruction")
|
||||
;
|
||||
scalarMemInstsPerKiloInst =
|
||||
((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
|
||||
|
||||
instCyclesVMemPerSimd
|
||||
.init(numVectorALUs)
|
||||
.name(name() + ".inst_cycles_vector_memory")
|
||||
.desc("Number of cycles to send address, command, data from VRF to "
|
||||
"vector memory unit, per SIMD")
|
||||
;
|
||||
|
||||
instCyclesScMemPerSimd
|
||||
.init(numVectorALUs)
|
||||
.name(name() + ".inst_cycles_scalar_memory")
|
||||
.desc("Number of cycles to send address, command, data from SRF to "
|
||||
"scalar memory unit, per SIMD")
|
||||
;
|
||||
|
||||
instCyclesLdsPerSimd
|
||||
.init(numVectorALUs)
|
||||
.name(name() + ".inst_cycles_lds")
|
||||
.desc("Number of cycles to send address, command, data from VRF to "
|
||||
"LDS unit, per SIMD")
|
||||
;
|
||||
|
||||
globalReads
|
||||
.name(name() + ".global_mem_reads")
|
||||
.desc("Number of reads to the global segment")
|
||||
;
|
||||
globalWrites
|
||||
.name(name() + ".global_mem_writes")
|
||||
.desc("Number of writes to the global segment")
|
||||
;
|
||||
globalMemInsts
|
||||
.name(name() + ".global_mem_insts")
|
||||
.desc("Number of memory instructions sent to the global segment")
|
||||
;
|
||||
globalMemInsts = globalReads + globalWrites;
|
||||
argReads
|
||||
.name(name() + ".arg_reads")
|
||||
.desc("Number of reads to the arg segment")
|
||||
;
|
||||
argWrites
|
||||
.name(name() + ".arg_writes")
|
||||
.desc("NUmber of writes to the arg segment")
|
||||
;
|
||||
argMemInsts
|
||||
.name(name() + ".arg_mem_insts")
|
||||
.desc("Number of memory instructions sent to the arg segment")
|
||||
;
|
||||
argMemInsts = argReads + argWrites;
|
||||
spillReads
|
||||
.name(name() + ".spill_reads")
|
||||
.desc("Number of reads to the spill segment")
|
||||
;
|
||||
spillWrites
|
||||
.name(name() + ".spill_writes")
|
||||
.desc("Number of writes to the spill segment")
|
||||
;
|
||||
spillMemInsts
|
||||
.name(name() + ".spill_mem_insts")
|
||||
.desc("Number of memory instructions sent to the spill segment")
|
||||
;
|
||||
spillMemInsts = spillReads + spillWrites;
|
||||
groupReads
|
||||
.name(name() + ".group_reads")
|
||||
.desc("Number of reads to the group segment")
|
||||
;
|
||||
groupWrites
|
||||
.name(name() + ".group_writes")
|
||||
.desc("Number of writes to the group segment")
|
||||
;
|
||||
groupMemInsts
|
||||
.name(name() + ".group_mem_insts")
|
||||
.desc("Number of memory instructions sent to the group segment")
|
||||
;
|
||||
groupMemInsts = groupReads + groupWrites;
|
||||
privReads
|
||||
.name(name() + ".private_reads")
|
||||
.desc("Number of reads to the private segment")
|
||||
;
|
||||
privWrites
|
||||
.name(name() + ".private_writes")
|
||||
.desc("Number of writes to the private segment")
|
||||
;
|
||||
privMemInsts
|
||||
.name(name() + ".private_mem_insts")
|
||||
.desc("Number of memory instructions sent to the private segment")
|
||||
;
|
||||
privMemInsts = privReads + privWrites;
|
||||
readonlyReads
|
||||
.name(name() + ".readonly_reads")
|
||||
.desc("Number of reads to the readonly segment")
|
||||
;
|
||||
readonlyWrites
|
||||
.name(name() + ".readonly_writes")
|
||||
.desc("Number of memory instructions sent to the readonly segment")
|
||||
;
|
||||
readonlyMemInsts
|
||||
.name(name() + ".readonly_mem_insts")
|
||||
.desc("Number of memory instructions sent to the readonly segment")
|
||||
;
|
||||
readonlyMemInsts = readonlyReads + readonlyWrites;
|
||||
kernargReads
|
||||
.name(name() + ".kernarg_reads")
|
||||
.desc("Number of reads sent to the kernarg segment")
|
||||
;
|
||||
kernargWrites
|
||||
.name(name() + ".kernarg_writes")
|
||||
.desc("Number of memory instructions sent to the kernarg segment")
|
||||
;
|
||||
kernargMemInsts
|
||||
.name(name() + ".kernarg_mem_insts")
|
||||
.desc("Number of memory instructions sent to the kernarg segment")
|
||||
;
|
||||
kernargMemInsts = kernargReads + kernargWrites;
|
||||
|
||||
tlbCycles
|
||||
.name(name() + ".tlb_cycles")
|
||||
.desc("total number of cycles for all uncoalesced requests")
|
||||
;
|
||||
|
||||
tlbRequests
|
||||
.name(name() + ".tlb_requests")
|
||||
.desc("number of uncoalesced requests")
|
||||
;
|
||||
|
||||
tlbLatency
|
||||
.name(name() + ".avg_translation_latency")
|
||||
.desc("Avg. translation latency for data translations")
|
||||
;
|
||||
|
||||
tlbLatency = tlbCycles / tlbRequests;
|
||||
|
||||
hitsPerTLBLevel
|
||||
.init(4)
|
||||
.name(name() + ".TLB_hits_distribution")
|
||||
.desc("TLB hits distribution (0 for page table, x for Lx-TLB")
|
||||
;
|
||||
|
||||
// fixed number of TLB levels
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
if (!i)
|
||||
hitsPerTLBLevel.subname(i,"page_table");
|
||||
else
|
||||
hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
|
||||
}
|
||||
|
||||
execRateDist
|
||||
.init(0, 10, 2)
|
||||
.name(name() + ".inst_exec_rate")
|
||||
.desc("Instruction Execution Rate: Number of executed vector "
|
||||
"instructions per cycle")
|
||||
;
|
||||
|
||||
ldsBankConflictDist
|
||||
.init(0, wfSize(), 2)
|
||||
.name(name() + ".lds_bank_conflicts")
|
||||
.desc("Number of bank conflicts per LDS memory packet")
|
||||
;
|
||||
|
||||
ldsBankAccesses
|
||||
.name(name() + ".lds_bank_access_cnt")
|
||||
.desc("Total number of LDS bank accesses")
|
||||
;
|
||||
|
||||
pageDivergenceDist
|
||||
// A wavefront can touch up to N pages per memory instruction where
|
||||
// N is equal to the wavefront size
|
||||
// The number of pages per bin can be configured (here it's 4).
|
||||
.init(1, wfSize(), 4)
|
||||
.name(name() + ".page_divergence_dist")
|
||||
.desc("pages touched per wf (over all mem. instr.)")
|
||||
;
|
||||
|
||||
controlFlowDivergenceDist
|
||||
.init(1, wfSize(), 4)
|
||||
.name(name() + ".warp_execution_dist")
|
||||
.desc("number of lanes active per instruction (oval all instructions)")
|
||||
;
|
||||
|
||||
activeLanesPerGMemInstrDist
|
||||
.init(1, wfSize(), 4)
|
||||
.name(name() + ".gmem_lanes_execution_dist")
|
||||
.desc("number of active lanes per global memory instruction")
|
||||
;
|
||||
|
||||
activeLanesPerLMemInstrDist
|
||||
.init(1, wfSize(), 4)
|
||||
.name(name() + ".lmem_lanes_execution_dist")
|
||||
.desc("number of active lanes per local memory instruction")
|
||||
;
|
||||
|
||||
numInstrExecuted
|
||||
.name(name() + ".num_instr_executed")
|
||||
.desc("number of instructions executed")
|
||||
;
|
||||
|
||||
numVecOpsExecuted
|
||||
.name(name() + ".num_vec_ops_executed")
|
||||
.desc("number of vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedF16
|
||||
.name(name() + ".num_vec_ops_f16_executed")
|
||||
.desc("number of f16 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedF32
|
||||
.name(name() + ".num_vec_ops_f32_executed")
|
||||
.desc("number of f32 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedF64
|
||||
.name(name() + ".num_vec_ops_f64_executed")
|
||||
.desc("number of f64 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedFMA16
|
||||
.name(name() + ".num_vec_ops_fma16_executed")
|
||||
.desc("number of fma16 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedFMA32
|
||||
.name(name() + ".num_vec_ops_fma32_executed")
|
||||
.desc("number of fma32 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedFMA64
|
||||
.name(name() + ".num_vec_ops_fma64_executed")
|
||||
.desc("number of fma64 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedMAD16
|
||||
.name(name() + ".num_vec_ops_mad16_executed")
|
||||
.desc("number of mad16 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedMAD32
|
||||
.name(name() + ".num_vec_ops_mad32_executed")
|
||||
.desc("number of mad32 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedMAD64
|
||||
.name(name() + ".num_vec_ops_mad64_executed")
|
||||
.desc("number of mad64 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedMAC16
|
||||
.name(name() + ".num_vec_ops_mac16_executed")
|
||||
.desc("number of mac16 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedMAC32
|
||||
.name(name() + ".num_vec_ops_mac32_executed")
|
||||
.desc("number of mac32 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedMAC64
|
||||
.name(name() + ".num_vec_ops_mac64_executed")
|
||||
.desc("number of mac64 vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
numVecOpsExecutedTwoOpFP
|
||||
.name(name() + ".num_vec_ops_two_op_fp_executed")
|
||||
.desc("number of two op FP vec ops executed (e.g. WF size/inst)")
|
||||
;
|
||||
|
||||
totalCycles
|
||||
.name(name() + ".num_total_cycles")
|
||||
.desc("number of cycles the CU ran for")
|
||||
;
|
||||
|
||||
ipc
|
||||
.name(name() + ".ipc")
|
||||
.desc("Instructions per cycle (this CU only)")
|
||||
;
|
||||
|
||||
vpc
|
||||
.name(name() + ".vpc")
|
||||
.desc("Vector Operations per cycle (this CU only)")
|
||||
;
|
||||
|
||||
vpc_f16
|
||||
.name(name() + ".vpc_f16")
|
||||
.desc("F16 Vector Operations per cycle (this CU only)")
|
||||
;
|
||||
|
||||
vpc_f32
|
||||
.name(name() + ".vpc_f32")
|
||||
.desc("F32 Vector Operations per cycle (this CU only)")
|
||||
;
|
||||
|
||||
vpc_f64
|
||||
.name(name() + ".vpc_f64")
|
||||
.desc("F64 Vector Operations per cycle (this CU only)")
|
||||
;
|
||||
|
||||
numALUInstsExecuted
|
||||
.name(name() + ".num_alu_insts_executed")
|
||||
.desc("Number of dynamic non-GM memory insts executed")
|
||||
;
|
||||
|
||||
wgBlockedDueBarrierAllocation
|
||||
.name(name() + ".wg_blocked_due_barrier_alloc")
|
||||
.desc("WG dispatch was blocked due to lack of barrier resources")
|
||||
;
|
||||
|
||||
wgBlockedDueLdsAllocation
|
||||
.name(name() + ".wg_blocked_due_lds_alloc")
|
||||
.desc("Workgroup blocked due to LDS capacity")
|
||||
;
|
||||
|
||||
ipc = numInstrExecuted / totalCycles;
|
||||
vpc = numVecOpsExecuted / totalCycles;
|
||||
vpc_f16 = numVecOpsExecutedF16 / totalCycles;
|
||||
vpc_f32 = numVecOpsExecutedF32 / totalCycles;
|
||||
vpc_f64 = numVecOpsExecutedF64 / totalCycles;
|
||||
|
||||
numTimesWgBlockedDueVgprAlloc
|
||||
.name(name() + ".times_wg_blocked_due_vgpr_alloc")
|
||||
.desc("Number of times WGs are blocked due to VGPR allocation per "
|
||||
"SIMD")
|
||||
;
|
||||
|
||||
numTimesWgBlockedDueSgprAlloc
|
||||
.name(name() + ".times_wg_blocked_due_sgpr_alloc")
|
||||
.desc("Number of times WGs are blocked due to SGPR allocation per "
|
||||
"SIMD")
|
||||
;
|
||||
|
||||
dynamicGMemInstrCnt
|
||||
.name(name() + ".global_mem_instr_cnt")
|
||||
.desc("dynamic non-flat global memory instruction count")
|
||||
;
|
||||
|
||||
dynamicFlatMemInstrCnt
|
||||
.name(name() + ".flat_global_mem_instr_cnt")
|
||||
.desc("dynamic flat global memory instruction count")
|
||||
;
|
||||
|
||||
dynamicLMemInstrCnt
|
||||
.name(name() + ".local_mem_instr_cnt")
|
||||
.desc("dynamic local memory intruction count")
|
||||
;
|
||||
|
||||
numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
|
||||
dynamicLMemInstrCnt;
|
||||
|
||||
completedWfs
|
||||
.name(name() + ".num_completed_wfs")
|
||||
.desc("number of completed wavefronts")
|
||||
;
|
||||
|
||||
completedWGs
|
||||
.name(name() + ".num_completed_wgs")
|
||||
.desc("number of completed workgroups")
|
||||
;
|
||||
|
||||
numCASOps
|
||||
.name(name() + ".num_CAS_ops")
|
||||
.desc("number of compare and swap operations")
|
||||
;
|
||||
|
||||
numFailedCASOps
|
||||
.name(name() + ".num_failed_CAS_ops")
|
||||
.desc("number of compare and swap operations that failed")
|
||||
;
|
||||
|
||||
headTailLatency
|
||||
.init(0, 1000000, 10000)
|
||||
.name(name() + ".head_tail_latency")
|
||||
.desc("ticks between first and last cache block arrival at coalescer")
|
||||
.flags(Stats::pdf | Stats::oneline)
|
||||
;
|
||||
|
||||
waveLevelParallelism
|
||||
.init(0, shader->n_wf * numVectorALUs, 1)
|
||||
.name(name() + ".wlp")
|
||||
.desc("wave level parallelism: count of active waves at wave launch")
|
||||
;
|
||||
|
||||
instInterleave
|
||||
.init(numVectorALUs, 0, 20, 1)
|
||||
.name(name() + ".interleaving")
|
||||
.desc("Measure of instruction interleaving per SIMD")
|
||||
;
|
||||
|
||||
// register stats of pipeline stages
|
||||
fetchStage.regStats();
|
||||
scoreboardCheckStage.regStats();
|
||||
scheduleStage.regStats();
|
||||
execStage.regStats();
|
||||
|
||||
// register stats of memory pipelines
|
||||
globalMemoryPipe.regStats();
|
||||
localMemoryPipe.regStats();
|
||||
scalarMemoryPipe.regStats();
|
||||
|
||||
registerManager->regStats();
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (gpuDynInst->isScalar()) {
|
||||
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
|
||||
sALUInsts++;
|
||||
instCyclesSALU++;
|
||||
stats.sALUInsts++;
|
||||
stats.instCyclesSALU++;
|
||||
} else if (gpuDynInst->isLoad()) {
|
||||
scalarMemReads++;
|
||||
stats.scalarMemReads++;
|
||||
} else if (gpuDynInst->isStore()) {
|
||||
scalarMemWrites++;
|
||||
stats.scalarMemWrites++;
|
||||
}
|
||||
} else {
|
||||
if (gpuDynInst->isALU()) {
|
||||
@@ -2350,45 +1807,46 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
||||
if (shader->total_valu_insts == shader->max_valu_insts) {
|
||||
exitSimLoop("max vALU insts");
|
||||
}
|
||||
vALUInsts++;
|
||||
instCyclesVALU++;
|
||||
threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
|
||||
stats.vALUInsts++;
|
||||
stats.instCyclesVALU++;
|
||||
stats.threadCyclesVALU
|
||||
+= gpuDynInst->wavefront()->execMask().count();
|
||||
} else if (gpuDynInst->isFlat()) {
|
||||
if (gpuDynInst->isLocalMem()) {
|
||||
flatLDSInsts++;
|
||||
stats.flatLDSInsts++;
|
||||
} else {
|
||||
flatVMemInsts++;
|
||||
stats.flatVMemInsts++;
|
||||
}
|
||||
} else if (gpuDynInst->isLocalMem()) {
|
||||
ldsNoFlatInsts++;
|
||||
stats.ldsNoFlatInsts++;
|
||||
} else if (gpuDynInst->isLoad()) {
|
||||
vectorMemReads++;
|
||||
stats.vectorMemReads++;
|
||||
} else if (gpuDynInst->isStore()) {
|
||||
vectorMemWrites++;
|
||||
stats.vectorMemWrites++;
|
||||
}
|
||||
|
||||
if (gpuDynInst->isLoad()) {
|
||||
switch (gpuDynInst->executedAs()) {
|
||||
case Enums::SC_SPILL:
|
||||
spillReads++;
|
||||
stats.spillReads++;
|
||||
break;
|
||||
case Enums::SC_GLOBAL:
|
||||
globalReads++;
|
||||
stats.globalReads++;
|
||||
break;
|
||||
case Enums::SC_GROUP:
|
||||
groupReads++;
|
||||
stats.groupReads++;
|
||||
break;
|
||||
case Enums::SC_PRIVATE:
|
||||
privReads++;
|
||||
stats.privReads++;
|
||||
break;
|
||||
case Enums::SC_READONLY:
|
||||
readonlyReads++;
|
||||
stats.readonlyReads++;
|
||||
break;
|
||||
case Enums::SC_KERNARG:
|
||||
kernargReads++;
|
||||
stats.kernargReads++;
|
||||
break;
|
||||
case Enums::SC_ARG:
|
||||
argReads++;
|
||||
stats.argReads++;
|
||||
break;
|
||||
case Enums::SC_NONE:
|
||||
/**
|
||||
@@ -2403,25 +1861,25 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
||||
} else if (gpuDynInst->isStore()) {
|
||||
switch (gpuDynInst->executedAs()) {
|
||||
case Enums::SC_SPILL:
|
||||
spillWrites++;
|
||||
stats.spillWrites++;
|
||||
break;
|
||||
case Enums::SC_GLOBAL:
|
||||
globalWrites++;
|
||||
stats.globalWrites++;
|
||||
break;
|
||||
case Enums::SC_GROUP:
|
||||
groupWrites++;
|
||||
stats.groupWrites++;
|
||||
break;
|
||||
case Enums::SC_PRIVATE:
|
||||
privWrites++;
|
||||
stats.privWrites++;
|
||||
break;
|
||||
case Enums::SC_READONLY:
|
||||
readonlyWrites++;
|
||||
stats.readonlyWrites++;
|
||||
break;
|
||||
case Enums::SC_KERNARG:
|
||||
kernargWrites++;
|
||||
stats.kernargWrites++;
|
||||
break;
|
||||
case Enums::SC_ARG:
|
||||
argWrites++;
|
||||
stats.argWrites++;
|
||||
break;
|
||||
case Enums::SC_NONE:
|
||||
/**
|
||||
@@ -2636,3 +2094,241 @@ ComputeUnit::LDSPort::recvReqRetry()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Construct the statistics group of a compute unit.
 *
 * Every stat is registered with its description in the initializer list;
 * the body then sizes the vector and distribution stats and wires up the
 * derived formulas.
 *
 * @param parent Stats group owning this group. It is downcast to
 *               ComputeUnit below, so it must point at the enclosing CU.
 * @param n_wf   Wavefront count used, multiplied by the number of vector
 *               ALUs, as the upper bound of waveLevelParallelism.
 */
ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
    : Stats::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "per-wavefront."),
      ADD_STAT(instCyclesVALU,
               "Number of cycles needed to execute VALU insts."),
      ADD_STAT(instCyclesSALU,
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
      ADD_STAT(vALUUtilization,
               "Percentage of active vector ALU threads in a wave."),
      ADD_STAT(ldsNoFlatInsts,
               "Number of LDS insts issued, not including FLAT"
               " accesses that resolve to LDS."),
      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
               "including FLAT accesses that resolve to LDS) per-wavefront."),
      ADD_STAT(flatVMemInsts,
               "The number of FLAT insts that resolve to vmem issued."),
      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
               "resolve to vmem issued per-wavefront."),
      ADD_STAT(flatLDSInsts,
               "The number of FLAT insts that resolve to LDS issued."),
      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
               "resolve to LDS issued per-wavefront."),
      ADD_STAT(vectorMemWrites,
               "Number of vector mem write insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
               "insts (excluding FLAT insts) per-wavefront."),
      ADD_STAT(vectorMemReads,
               "Number of vector mem read insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemReadsPerWF,
               "The avg. number of vector mem read insts "
               "(excluding FLAT insts) per-wavefront."),
      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
      ADD_STAT(scalarMemWritesPerWF,
               "The average number of scalar mem write insts per-wavefront."),
      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
      ADD_STAT(scalarMemReadsPerWF,
               "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
      ADD_STAT(globalMemInsts,
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
      ADD_STAT(argMemInsts,
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
      ADD_STAT(spillMemInsts,
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
      ADD_STAT(groupMemInsts,
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
      ADD_STAT(privMemInsts,
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
      // Writes get their own description so they are not confused with
      // the combined readonlyMemInsts formula below.
      ADD_STAT(readonlyWrites, "Number of writes to the readonly segment"),
      ADD_STAT(readonlyMemInsts,
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
      // Same reasoning as readonlyWrites: describe the writes, not the sum.
      ADD_STAT(kernargWrites, "Number of writes to the kernarg segment"),
      ADD_STAT(kernargMemInsts,
               "Number of memory instructions sent to the kernarg segment"),
      ADD_STAT(waveLevelParallelism,
               "wave level parallelism: count of active waves at wave launch"),
      ADD_STAT(tlbRequests, "number of uncoalesced requests"),
      ADD_STAT(tlbCycles,
               "total number of cycles for all uncoalesced requests"),
      ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
      ADD_STAT(hitsPerTLBLevel,
               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
      ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
      ADD_STAT(ldsBankConflictDist,
               "Number of bank conflicts per LDS memory packet"),
      ADD_STAT(pageDivergenceDist,
               "pages touched per wf (over all mem. instr.)"),
      ADD_STAT(dynamicGMemInstrCnt,
               "dynamic non-flat global memory instruction count"),
      ADD_STAT(dynamicFlatMemInstrCnt,
               "dynamic flat global memory instruction count"),
      ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
      ADD_STAT(wgBlockedDueBarrierAllocation,
               "WG dispatch was blocked due to lack of barrier resources"),
      ADD_STAT(wgBlockedDueLdsAllocation,
               "Workgroup blocked due to LDS capacity"),
      ADD_STAT(numInstrExecuted, "number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
      ADD_STAT(numVecOpsExecuted,
               "number of vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF16,
               "number of f16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF32,
               "number of f32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF64,
               "number of f64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA16,
               "number of fma16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA32,
               "number of fma32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA64,
               "number of fma64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC16,
               "number of mac16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC32,
               "number of mac32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC64,
               "number of mac64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD16,
               "number of mad16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD32,
               "number of mad32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD64,
               "number of mad64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedTwoOpFP,
               "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
      ADD_STAT(numALUInstsExecuted,
               "Number of dynamic non-GM memory insts executed"),
      ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
               "blocked due to VGPR allocation per SIMD"),
      ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
               "blocked due to SGPR allocation per SIMD"),
      ADD_STAT(numCASOps, "number of compare and swap operations"),
      ADD_STAT(numFailedCASOps,
               "number of compare and swap operations that failed"),
      ADD_STAT(completedWfs, "number of completed wavefronts"),
      ADD_STAT(completedWGs, "number of completed workgroups"),
      ADD_STAT(headTailLatency, "ticks between first and last cache block "
               "arrival at coalescer"),
      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
{
    // The CU supplies the sizes (SIMD count, wavefront size) that the
    // vector and distribution stats below are dimensioned with.
    ComputeUnit *cu = static_cast<ComputeUnit*>(parent);

    // fixed number of TLB levels; entry 0 stands for the page table
    const int num_tlb_levels = 4;

    // Per-SIMD cycle counters: one entry per vector ALU.
    instCyclesVMemPerSimd.init(cu->numVectorALUs);
    instCyclesScMemPerSimd.init(cu->numVectorALUs);
    instCyclesLdsPerSimd.init(cu->numVectorALUs);

    hitsPerTLBLevel.init(num_tlb_levels);
    execRateDist.init(0, 10, 2);
    ldsBankConflictDist.init(0, cu->wfSize(), 2);

    // Distributions over 1..wfSize, in buckets of four.
    pageDivergenceDist.init(1, cu->wfSize(), 4);
    controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
    activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
    activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);

    headTailLatency.init(0, 1000000, 10000)
        .flags(Stats::pdf | Stats::oneline);
    waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
    instInterleave.init(cu->numVectorALUs, 0, 20, 1);

    // Per-wavefront averages.
    vALUInstsPerWF = vALUInsts / completedWfs;
    sALUInstsPerWF = sALUInsts / completedWfs;
    // Normalize by the configured wavefront size rather than a literal 64
    // so the percentage stays meaningful for other wavefront sizes.
    vALUUtilization =
        (threadCyclesVALU / (cu->wfSize() * instCyclesVALU)) * 100;
    ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
    flatVMemInstsPerWF = flatVMemInsts / completedWfs;
    flatLDSInstsPerWF = flatLDSInsts / completedWfs;
    vectorMemWritesPerWF = vectorMemWrites / completedWfs;
    vectorMemReadsPerWF = vectorMemReads / completedWfs;
    scalarMemWritesPerWF = scalarMemWrites / completedWfs;
    scalarMemReadsPerWF = scalarMemReads / completedWfs;

    // Memory instructions per thousand executed instructions.
    vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
    vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
    vectorMemInstsPerKiloInst =
        ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
    scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
    scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
    scalarMemInstsPerKiloInst =
        ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;

    // Per-segment totals: reads plus writes.
    globalMemInsts = globalReads + globalWrites;
    argMemInsts = argReads + argWrites;
    spillMemInsts = spillReads + spillWrites;
    groupMemInsts = groupReads + groupWrites;
    privMemInsts = privReads + privWrites;
    readonlyMemInsts = readonlyReads + readonlyWrites;
    kernargMemInsts = kernargReads + kernargWrites;

    tlbLatency = tlbCycles / tlbRequests;

    // Name the TLB-level buckets: index 0 is the page table, index x > 0
    // is the Lx TLB.
    for (int i = 0; i < num_tlb_levels; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i,"page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
    }

    // Throughput per CU cycle.
    ipc = numInstrExecuted / totalCycles;
    vpc = numVecOpsExecuted / totalCycles;
    vpc_f16 = numVecOpsExecutedF16 / totalCycles;
    vpc_f32 = numVecOpsExecutedF32 / totalCycles;
    vpc_f64 = numVecOpsExecutedF64 / totalCycles;

    // Everything executed that was not counted as a (non-flat) global or
    // local memory instruction.
    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
        dynamicLMemInstrCnt;
}
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#include "base/callback.hh"
|
||||
#include "base/compiler.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "base/types.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
#include "enums/PrefetchType.hh"
|
||||
@@ -320,12 +321,6 @@ class ComputeUnit : public ClockedObject
|
||||
// tracks the last cycle a vector instruction was executed on a SIMD
|
||||
std::vector<uint64_t> lastExecCycle;
|
||||
|
||||
// Track the amount of interleaving between wavefronts on each SIMD.
|
||||
// This stat is sampled using instExecPerSimd to compute the number of
|
||||
// instructions that have been executed on a SIMD between a WF executing
|
||||
// two successive instructions.
|
||||
Stats::VectorDistribution instInterleave;
|
||||
|
||||
// tracks the number of dyn inst executed per SIMD
|
||||
std::vector<uint64_t> instExecPerSimd;
|
||||
|
||||
@@ -472,148 +467,6 @@ class ComputeUnit : public ClockedObject
|
||||
LdsState &lds;
|
||||
|
||||
public:
|
||||
Stats::Scalar vALUInsts;
|
||||
Stats::Formula vALUInstsPerWF;
|
||||
Stats::Scalar sALUInsts;
|
||||
Stats::Formula sALUInstsPerWF;
|
||||
Stats::Scalar instCyclesVALU;
|
||||
Stats::Scalar instCyclesSALU;
|
||||
Stats::Scalar threadCyclesVALU;
|
||||
Stats::Formula vALUUtilization;
|
||||
Stats::Scalar ldsNoFlatInsts;
|
||||
Stats::Formula ldsNoFlatInstsPerWF;
|
||||
Stats::Scalar flatVMemInsts;
|
||||
Stats::Formula flatVMemInstsPerWF;
|
||||
Stats::Scalar flatLDSInsts;
|
||||
Stats::Formula flatLDSInstsPerWF;
|
||||
Stats::Scalar vectorMemWrites;
|
||||
Stats::Formula vectorMemWritesPerWF;
|
||||
Stats::Scalar vectorMemReads;
|
||||
Stats::Formula vectorMemReadsPerWF;
|
||||
Stats::Scalar scalarMemWrites;
|
||||
Stats::Formula scalarMemWritesPerWF;
|
||||
Stats::Scalar scalarMemReads;
|
||||
Stats::Formula scalarMemReadsPerWF;
|
||||
|
||||
Stats::Formula vectorMemReadsPerKiloInst;
|
||||
Stats::Formula vectorMemWritesPerKiloInst;
|
||||
Stats::Formula vectorMemInstsPerKiloInst;
|
||||
Stats::Formula scalarMemReadsPerKiloInst;
|
||||
Stats::Formula scalarMemWritesPerKiloInst;
|
||||
Stats::Formula scalarMemInstsPerKiloInst;
|
||||
|
||||
// Cycles required to send register source (addr and data) from
|
||||
// register files to memory pipeline, per SIMD.
|
||||
Stats::Vector instCyclesVMemPerSimd;
|
||||
Stats::Vector instCyclesScMemPerSimd;
|
||||
Stats::Vector instCyclesLdsPerSimd;
|
||||
|
||||
Stats::Scalar globalReads;
|
||||
Stats::Scalar globalWrites;
|
||||
Stats::Formula globalMemInsts;
|
||||
Stats::Scalar argReads;
|
||||
Stats::Scalar argWrites;
|
||||
Stats::Formula argMemInsts;
|
||||
Stats::Scalar spillReads;
|
||||
Stats::Scalar spillWrites;
|
||||
Stats::Formula spillMemInsts;
|
||||
Stats::Scalar groupReads;
|
||||
Stats::Scalar groupWrites;
|
||||
Stats::Formula groupMemInsts;
|
||||
Stats::Scalar privReads;
|
||||
Stats::Scalar privWrites;
|
||||
Stats::Formula privMemInsts;
|
||||
Stats::Scalar readonlyReads;
|
||||
Stats::Scalar readonlyWrites;
|
||||
Stats::Formula readonlyMemInsts;
|
||||
Stats::Scalar kernargReads;
|
||||
Stats::Scalar kernargWrites;
|
||||
Stats::Formula kernargMemInsts;
|
||||
|
||||
int activeWaves;
|
||||
Stats::Distribution waveLevelParallelism;
|
||||
|
||||
void updateInstStats(GPUDynInstPtr gpuDynInst);
|
||||
|
||||
// the following stats compute the avg. TLB accesslatency per
|
||||
// uncoalesced request (only for data)
|
||||
Stats::Scalar tlbRequests;
|
||||
Stats::Scalar tlbCycles;
|
||||
Stats::Formula tlbLatency;
|
||||
// hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
|
||||
Stats::Vector hitsPerTLBLevel;
|
||||
|
||||
Stats::Scalar ldsBankAccesses;
|
||||
Stats::Distribution ldsBankConflictDist;
|
||||
|
||||
// over all memory instructions executed over all wavefronts
|
||||
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
|
||||
Stats::Distribution pageDivergenceDist;
|
||||
// count of non-flat global memory vector instructions executed
|
||||
Stats::Scalar dynamicGMemInstrCnt;
|
||||
// count of flat global memory vector instructions executed
|
||||
Stats::Scalar dynamicFlatMemInstrCnt;
|
||||
Stats::Scalar dynamicLMemInstrCnt;
|
||||
|
||||
Stats::Scalar wgBlockedDueBarrierAllocation;
|
||||
Stats::Scalar wgBlockedDueLdsAllocation;
|
||||
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
|
||||
// active when the instruction is committed, this number is still
|
||||
// incremented by 1
|
||||
Stats::Scalar numInstrExecuted;
|
||||
// Number of cycles among successive instruction executions across all
|
||||
// wavefronts of the same CU
|
||||
Stats::Distribution execRateDist;
|
||||
// number of individual vector operations executed
|
||||
Stats::Scalar numVecOpsExecuted;
|
||||
// number of individual f16 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF16;
|
||||
// number of individual f32 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF32;
|
||||
// number of individual f64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF64;
|
||||
// number of individual FMA 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedFMA16;
|
||||
Stats::Scalar numVecOpsExecutedFMA32;
|
||||
Stats::Scalar numVecOpsExecutedFMA64;
|
||||
// number of individual MAC 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedMAC16;
|
||||
Stats::Scalar numVecOpsExecutedMAC32;
|
||||
Stats::Scalar numVecOpsExecutedMAC64;
|
||||
// number of individual MAD 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedMAD16;
|
||||
Stats::Scalar numVecOpsExecutedMAD32;
|
||||
Stats::Scalar numVecOpsExecutedMAD64;
|
||||
// total number of two op FP vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedTwoOpFP;
|
||||
// Total cycles that something is running on the GPU
|
||||
Stats::Scalar totalCycles;
|
||||
Stats::Formula vpc; // vector ops per cycle
|
||||
Stats::Formula vpc_f16; // vector ops per cycle
|
||||
Stats::Formula vpc_f32; // vector ops per cycle
|
||||
Stats::Formula vpc_f64; // vector ops per cycle
|
||||
Stats::Formula ipc; // vector instructions per cycle
|
||||
Stats::Distribution controlFlowDivergenceDist;
|
||||
Stats::Distribution activeLanesPerGMemInstrDist;
|
||||
Stats::Distribution activeLanesPerLMemInstrDist;
|
||||
// number of vector ALU instructions received
|
||||
Stats::Formula numALUInstsExecuted;
|
||||
// number of times a WG can not start due to lack of free VGPRs in SIMDs
|
||||
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
|
||||
// number of times a WG can not start due to lack of free SGPRs in SIMDs
|
||||
Stats::Scalar numTimesWgBlockedDueSgprAlloc;
|
||||
Stats::Scalar numCASOps;
|
||||
Stats::Scalar numFailedCASOps;
|
||||
Stats::Scalar completedWfs;
|
||||
Stats::Scalar completedWGs;
|
||||
|
||||
// distrubtion in latency difference between first and last cache block
|
||||
// arrival ticks
|
||||
Stats::Distribution headTailLatency;
|
||||
|
||||
void
|
||||
regStats() override;
|
||||
|
||||
LdsState &
|
||||
getLds() const
|
||||
{
|
||||
@@ -1081,6 +934,158 @@ class ComputeUnit : public ClockedObject
|
||||
// a particular GPUDynInst. This is used to calculate the difference
|
||||
// between the first and last cache block arrival times.
|
||||
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
|
||||
|
||||
public:
|
||||
void updateInstStats(GPUDynInstPtr gpuDynInst);
|
||||
int activeWaves;
|
||||
|
||||
struct ComputeUnitStats : public Stats::Group
|
||||
{
|
||||
ComputeUnitStats(Stats::Group *parent, int n_wf);
|
||||
|
||||
Stats::Scalar vALUInsts;
|
||||
Stats::Formula vALUInstsPerWF;
|
||||
Stats::Scalar sALUInsts;
|
||||
Stats::Formula sALUInstsPerWF;
|
||||
Stats::Scalar instCyclesVALU;
|
||||
Stats::Scalar instCyclesSALU;
|
||||
Stats::Scalar threadCyclesVALU;
|
||||
Stats::Formula vALUUtilization;
|
||||
Stats::Scalar ldsNoFlatInsts;
|
||||
Stats::Formula ldsNoFlatInstsPerWF;
|
||||
Stats::Scalar flatVMemInsts;
|
||||
Stats::Formula flatVMemInstsPerWF;
|
||||
Stats::Scalar flatLDSInsts;
|
||||
Stats::Formula flatLDSInstsPerWF;
|
||||
Stats::Scalar vectorMemWrites;
|
||||
Stats::Formula vectorMemWritesPerWF;
|
||||
Stats::Scalar vectorMemReads;
|
||||
Stats::Formula vectorMemReadsPerWF;
|
||||
Stats::Scalar scalarMemWrites;
|
||||
Stats::Formula scalarMemWritesPerWF;
|
||||
Stats::Scalar scalarMemReads;
|
||||
Stats::Formula scalarMemReadsPerWF;
|
||||
|
||||
Stats::Formula vectorMemReadsPerKiloInst;
|
||||
Stats::Formula vectorMemWritesPerKiloInst;
|
||||
Stats::Formula vectorMemInstsPerKiloInst;
|
||||
Stats::Formula scalarMemReadsPerKiloInst;
|
||||
Stats::Formula scalarMemWritesPerKiloInst;
|
||||
Stats::Formula scalarMemInstsPerKiloInst;
|
||||
|
||||
// Cycles required to send register source (addr and data) from
|
||||
// register files to memory pipeline, per SIMD.
|
||||
Stats::Vector instCyclesVMemPerSimd;
|
||||
Stats::Vector instCyclesScMemPerSimd;
|
||||
Stats::Vector instCyclesLdsPerSimd;
|
||||
|
||||
Stats::Scalar globalReads;
|
||||
Stats::Scalar globalWrites;
|
||||
Stats::Formula globalMemInsts;
|
||||
Stats::Scalar argReads;
|
||||
Stats::Scalar argWrites;
|
||||
Stats::Formula argMemInsts;
|
||||
Stats::Scalar spillReads;
|
||||
Stats::Scalar spillWrites;
|
||||
Stats::Formula spillMemInsts;
|
||||
Stats::Scalar groupReads;
|
||||
Stats::Scalar groupWrites;
|
||||
Stats::Formula groupMemInsts;
|
||||
Stats::Scalar privReads;
|
||||
Stats::Scalar privWrites;
|
||||
Stats::Formula privMemInsts;
|
||||
Stats::Scalar readonlyReads;
|
||||
Stats::Scalar readonlyWrites;
|
||||
Stats::Formula readonlyMemInsts;
|
||||
Stats::Scalar kernargReads;
|
||||
Stats::Scalar kernargWrites;
|
||||
Stats::Formula kernargMemInsts;
|
||||
|
||||
Stats::Distribution waveLevelParallelism;
|
||||
|
||||
// the following stats compute the avg. TLB access latency per
|
||||
// uncoalesced request (only for data)
|
||||
Stats::Scalar tlbRequests;
|
||||
Stats::Scalar tlbCycles;
|
||||
Stats::Formula tlbLatency;
|
||||
// hitsPerTLBLevel[x] are the hits in Level x TLB.
|
||||
// x = 0 is the page table.
|
||||
Stats::Vector hitsPerTLBLevel;
|
||||
|
||||
Stats::Scalar ldsBankAccesses;
|
||||
Stats::Distribution ldsBankConflictDist;
|
||||
|
||||
// over all memory instructions executed over all wavefronts
|
||||
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
|
||||
Stats::Distribution pageDivergenceDist;
|
||||
// count of non-flat global memory vector instructions executed
|
||||
Stats::Scalar dynamicGMemInstrCnt;
|
||||
// count of flat global memory vector instructions executed
|
||||
Stats::Scalar dynamicFlatMemInstrCnt;
|
||||
Stats::Scalar dynamicLMemInstrCnt;
|
||||
|
||||
Stats::Scalar wgBlockedDueBarrierAllocation;
|
||||
Stats::Scalar wgBlockedDueLdsAllocation;
|
||||
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
|
||||
// active when the instruction is committed, this number is still
|
||||
// incremented by 1
|
||||
Stats::Scalar numInstrExecuted;
|
||||
// Number of cycles among successive instruction executions across all
|
||||
// wavefronts of the same CU
|
||||
Stats::Distribution execRateDist;
|
||||
// number of individual vector operations executed
|
||||
Stats::Scalar numVecOpsExecuted;
|
||||
// number of individual f16 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF16;
|
||||
// number of individual f32 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF32;
|
||||
// number of individual f64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF64;
|
||||
// number of individual FMA 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedFMA16;
|
||||
Stats::Scalar numVecOpsExecutedFMA32;
|
||||
Stats::Scalar numVecOpsExecutedFMA64;
|
||||
// number of individual MAC 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedMAC16;
|
||||
Stats::Scalar numVecOpsExecutedMAC32;
|
||||
Stats::Scalar numVecOpsExecutedMAC64;
|
||||
// number of individual MAD 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedMAD16;
|
||||
Stats::Scalar numVecOpsExecutedMAD32;
|
||||
Stats::Scalar numVecOpsExecutedMAD64;
|
||||
// total number of two op FP vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedTwoOpFP;
|
||||
// Total cycles that something is running on the GPU
|
||||
Stats::Scalar totalCycles;
|
||||
Stats::Formula vpc; // vector ops per cycle
|
||||
Stats::Formula vpc_f16; // vector ops per cycle
|
||||
Stats::Formula vpc_f32; // vector ops per cycle
|
||||
Stats::Formula vpc_f64; // vector ops per cycle
|
||||
Stats::Formula ipc; // vector instructions per cycle
|
||||
Stats::Distribution controlFlowDivergenceDist;
|
||||
Stats::Distribution activeLanesPerGMemInstrDist;
|
||||
Stats::Distribution activeLanesPerLMemInstrDist;
|
||||
// number of vector ALU instructions received
|
||||
Stats::Formula numALUInstsExecuted;
|
||||
// number of times a WG cannot start due to lack of free VGPRs in SIMDs
|
||||
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
|
||||
// number of times a WG cannot start due to lack of free SGPRs in SIMDs
|
||||
Stats::Scalar numTimesWgBlockedDueSgprAlloc;
|
||||
Stats::Scalar numCASOps;
|
||||
Stats::Scalar numFailedCASOps;
|
||||
Stats::Scalar completedWfs;
|
||||
Stats::Scalar completedWGs;
|
||||
|
||||
// distribution in latency difference between first and last cache block
|
||||
// arrival ticks
|
||||
Stats::Distribution headTailLatency;
|
||||
|
||||
// Track the amount of interleaving between wavefronts on each SIMD.
|
||||
// This stat is sampled using instExecPerSimd to compute the number
|
||||
// of instructions that have been executed on a SIMD between a WF
|
||||
// executing two successive instructions.
|
||||
Stats::VectorDistribution instInterleave;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __COMPUTE_UNIT_HH__
|
||||
|
||||
@@ -49,7 +49,7 @@ GPUDispatcher::GPUDispatcher(const Params &p)
|
||||
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
|
||||
tickEvent([this]{ exec(); },
|
||||
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
|
||||
dispatchActive(false)
|
||||
dispatchActive(false), stats(this)
|
||||
{
|
||||
schedule(&tickEvent, 0);
|
||||
}
|
||||
@@ -58,21 +58,6 @@ GPUDispatcher::~GPUDispatcher()
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
GPUDispatcher::regStats()
|
||||
{
|
||||
numKernelLaunched
|
||||
.name(name() + ".num_kernel_launched")
|
||||
.desc("number of kernel launched")
|
||||
;
|
||||
|
||||
cyclesWaitingForDispatch
|
||||
.name(name() + ".cycles_wait_dispatch")
|
||||
.desc("number of cycles with outstanding wavefronts "
|
||||
"that are waiting to be dispatched")
|
||||
;
|
||||
}
|
||||
|
||||
HSAQueueEntry*
|
||||
GPUDispatcher::hsaTask(int disp_id)
|
||||
{
|
||||
@@ -127,7 +112,7 @@ GPUDispatcher::unserialize(CheckpointIn &cp)
|
||||
void
|
||||
GPUDispatcher::dispatch(HSAQueueEntry *task)
|
||||
{
|
||||
++numKernelLaunched;
|
||||
++stats.numKernelLaunched;
|
||||
|
||||
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
|
||||
task->kernelName(), task->dispatchId());
|
||||
@@ -158,7 +143,7 @@ GPUDispatcher::exec()
|
||||
DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
|
||||
|
||||
if (execIds.size() > 0) {
|
||||
++cyclesWaitingForDispatch;
|
||||
++stats.cyclesWaitingForDispatch;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -368,3 +353,11 @@ GPUDispatcher::scheduleDispatch()
|
||||
schedule(&tickEvent, curTick() + shader->clockPeriod());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Build the dispatcher's statistics group and attach it to the given
 * parent group.
 *
 * @param parent Stats group this group is registered under.
 */
GPUDispatcher::GPUDispatcherStats::GPUDispatcherStats(Stats::Group *parent)
    : Stats::Group(parent),
      ADD_STAT(numKernelLaunched, "number of kernel launched"),
      ADD_STAT(cyclesWaitingForDispatch,
               "number of cycles with outstanding wavefronts "
               "that are waiting to be dispatched")
{
    // Registration happens entirely in the initializer list.
}
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "dev/hsa/hsa_packet.hh"
|
||||
#include "params/GPUDispatcher.hh"
|
||||
#include "sim/sim_object.hh"
|
||||
@@ -67,7 +68,6 @@ class GPUDispatcher : public SimObject
|
||||
|
||||
void serialize(CheckpointOut &cp) const override;
|
||||
void unserialize(CheckpointIn &cp) override;
|
||||
void regStats() override;
|
||||
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
|
||||
void setShader(Shader *new_shader);
|
||||
void exec();
|
||||
@@ -91,9 +91,15 @@ class GPUDispatcher : public SimObject
|
||||
std::queue<int> doneIds;
|
||||
// is there a kernel in execution?
|
||||
bool dispatchActive;
|
||||
/*statistics*/
|
||||
Stats::Scalar numKernelLaunched;
|
||||
Stats::Scalar cyclesWaitingForDispatch;
|
||||
|
||||
protected:
|
||||
struct GPUDispatcherStats : public Stats::Group
|
||||
{
|
||||
GPUDispatcherStats(Stats::Group *parent);
|
||||
|
||||
Stats::Scalar numKernelLaunched;
|
||||
Stats::Scalar cyclesWaitingForDispatch;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __GPU_COMPUTE_DISPATCHER_HH__
|
||||
|
||||
@@ -46,10 +46,11 @@ ExecStage::ExecStage(const ComputeUnitParams &p, ComputeUnit &cu,
|
||||
: computeUnit(cu), fromSchedule(from_schedule),
|
||||
lastTimeInstExecuted(false),
|
||||
thisTimeInstExecuted(false), instrExecuted (false),
|
||||
executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
|
||||
executionResourcesUsed(0), _name(cu.name() + ".ExecStage"),
|
||||
stats(&cu)
|
||||
|
||||
{
|
||||
numTransActiveIdle = 0;
|
||||
stats.numTransActiveIdle = 0;
|
||||
idle_dur = 0;
|
||||
}
|
||||
|
||||
@@ -64,22 +65,22 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
|
||||
if (stage == IdleExec) {
|
||||
// count cycles when no instruction to a specific execution resource
|
||||
// is executed
|
||||
numCyclesWithNoInstrTypeIssued[unitId]++;
|
||||
stats.numCyclesWithNoInstrTypeIssued[unitId]++;
|
||||
} else if (stage == BusyExec) {
|
||||
// count the number of cycles an instruction to a specific execution
|
||||
// resource type was issued
|
||||
numCyclesWithInstrTypeIssued[unitId]++;
|
||||
stats.numCyclesWithInstrTypeIssued[unitId]++;
|
||||
thisTimeInstExecuted = true;
|
||||
instrExecuted = true;
|
||||
++executionResourcesUsed;
|
||||
} else if (stage == PostExec) {
|
||||
// count the number of transitions from active to idle
|
||||
if (lastTimeInstExecuted && !thisTimeInstExecuted) {
|
||||
++numTransActiveIdle;
|
||||
++stats.numTransActiveIdle;
|
||||
}
|
||||
|
||||
if (!lastTimeInstExecuted && thisTimeInstExecuted) {
|
||||
idleDur.sample(idle_dur);
|
||||
stats.idleDur.sample(idle_dur);
|
||||
idle_dur = 0;
|
||||
} else if (!thisTimeInstExecuted) {
|
||||
idle_dur++;
|
||||
@@ -89,11 +90,11 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
|
||||
// track the number of cycles we either issued at least
|
||||
// instruction or issued no instructions at all
|
||||
if (instrExecuted) {
|
||||
numCyclesWithInstrIssued++;
|
||||
stats.numCyclesWithInstrIssued++;
|
||||
} else {
|
||||
numCyclesWithNoIssue++;
|
||||
stats.numCyclesWithNoIssue++;
|
||||
}
|
||||
spc.sample(executionResourcesUsed);
|
||||
stats.spc.sample(executionResourcesUsed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -196,57 +197,35 @@ ExecStage::exec()
|
||||
collectStatistics(PostExec, 0);
|
||||
}
|
||||
|
||||
void
|
||||
ExecStage::regStats()
|
||||
ExecStage::ExecStageStats::ExecStageStats(Stats::Group *parent)
|
||||
: Stats::Group(parent, "ExecStage"),
|
||||
ADD_STAT(numTransActiveIdle,
|
||||
"number of CU transitions from active to idle"),
|
||||
ADD_STAT(numCyclesWithNoIssue, "number of cycles the CU issues nothing"),
|
||||
ADD_STAT(numCyclesWithInstrIssued,
|
||||
"number of cycles the CU issued at least one instruction"),
|
||||
ADD_STAT(spc,
|
||||
"Execution units active per cycle (Exec unit=SIMD,MemPipe)"),
|
||||
ADD_STAT(idleDur, "duration of idle periods in cycles"),
|
||||
ADD_STAT(numCyclesWithInstrTypeIssued, "Number of cycles at least one "
|
||||
"instruction issued to execution resource type"),
|
||||
ADD_STAT(numCyclesWithNoInstrTypeIssued, "Number of clks no instructions"
|
||||
" issued to execution resource type")
|
||||
{
|
||||
numTransActiveIdle
|
||||
.name(name() + ".num_transitions_active_to_idle")
|
||||
.desc("number of CU transitions from active to idle")
|
||||
;
|
||||
ComputeUnit *compute_unit = static_cast<ComputeUnit*>(parent);
|
||||
|
||||
numCyclesWithNoIssue
|
||||
.name(name() + ".num_cycles_with_no_issue")
|
||||
.desc("number of cycles the CU issues nothing")
|
||||
;
|
||||
|
||||
numCyclesWithInstrIssued
|
||||
.name(name() + ".num_cycles_with_instr_issued")
|
||||
.desc("number of cycles the CU issued at least one instruction")
|
||||
;
|
||||
|
||||
spc
|
||||
.init(0, computeUnit.numExeUnits(), 1)
|
||||
.name(name() + ".spc")
|
||||
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
|
||||
;
|
||||
|
||||
idleDur
|
||||
.init(0,75,5)
|
||||
.name(name() + ".idle_duration_in_cycles")
|
||||
.desc("duration of idle periods in cycles")
|
||||
;
|
||||
|
||||
numCyclesWithInstrTypeIssued
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".num_cycles_issue_exec_rsrc")
|
||||
.desc("Number of cycles at least one instruction issued to "
|
||||
"execution resource type")
|
||||
;
|
||||
|
||||
numCyclesWithNoInstrTypeIssued
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".num_cycles_no_issue_exec_rsrc")
|
||||
.desc("Number of clks no instructions issued to execution "
|
||||
"resource type")
|
||||
;
|
||||
spc.init(0, compute_unit->numExeUnits(), 1);
|
||||
idleDur.init(0, 75, 5);
|
||||
numCyclesWithInstrTypeIssued.init(compute_unit->numExeUnits());
|
||||
numCyclesWithNoInstrTypeIssued.init(compute_unit->numExeUnits());
|
||||
|
||||
int c = 0;
|
||||
for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
|
||||
for (int i = 0; i < compute_unit->numVectorALUs; i++,c++) {
|
||||
std::string s = "VectorALU" + std::to_string(i);
|
||||
numCyclesWithNoInstrTypeIssued.subname(c, s);
|
||||
numCyclesWithInstrTypeIssued.subname(c, s);
|
||||
}
|
||||
for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
|
||||
for (int i = 0; i < compute_unit->numScalarALUs; i++,c++) {
|
||||
std::string s = "ScalarALU" + std::to_string(i);
|
||||
numCyclesWithNoInstrTypeIssued.subname(c, s);
|
||||
numCyclesWithInstrTypeIssued.subname(c, s);
|
||||
@@ -256,7 +235,4 @@ ExecStage::regStats()
|
||||
|
||||
numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
|
||||
numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
|
||||
|
||||
numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
|
||||
numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
|
||||
}
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "sim/stats.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
class ScheduleToExecute;
|
||||
@@ -81,20 +82,6 @@ class ExecStage
|
||||
void dumpDispList();
|
||||
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
// number of idle cycles
|
||||
Stats::Scalar numCyclesWithNoIssue;
|
||||
// number of busy cycles
|
||||
Stats::Scalar numCyclesWithInstrIssued;
|
||||
// number of cycles during which at least one
|
||||
// instruction was issued to an execution resource type
|
||||
Stats::Vector numCyclesWithInstrTypeIssued;
|
||||
// number of idle cycles during which the scheduler
|
||||
// issued no instructions targeting a specific
|
||||
// execution resource type
|
||||
Stats::Vector numCyclesWithNoInstrTypeIssued;
|
||||
// SIMDs active per cycle
|
||||
Stats::Distribution spc;
|
||||
|
||||
private:
|
||||
void collectStatistics(enum STAT_STATUS stage, int unitId);
|
||||
@@ -105,11 +92,33 @@ class ExecStage
|
||||
bool lastTimeInstExecuted;
|
||||
bool thisTimeInstExecuted;
|
||||
bool instrExecuted;
|
||||
Stats::Scalar numTransActiveIdle;
|
||||
Stats::Distribution idleDur;
|
||||
int executionResourcesUsed;
|
||||
uint64_t idle_dur;
|
||||
const std::string _name;
|
||||
|
||||
protected:
|
||||
struct ExecStageStats : public Stats::Group
|
||||
{
|
||||
ExecStageStats(Stats::Group *parent);
|
||||
|
||||
// number of transitions from active to idle
|
||||
Stats::Scalar numTransActiveIdle;
|
||||
// number of idle cycles
|
||||
Stats::Scalar numCyclesWithNoIssue;
|
||||
// number of busy cycles
|
||||
Stats::Scalar numCyclesWithInstrIssued;
|
||||
// SIMDs active per cycle
|
||||
Stats::Distribution spc;
|
||||
// duration of idle periods in cycles
|
||||
Stats::Distribution idleDur;
|
||||
// number of cycles during which at least one
|
||||
// instruction was issued to an execution resource type
|
||||
Stats::Vector numCyclesWithInstrTypeIssued;
|
||||
// number of idle cycles during which the scheduler
|
||||
// issued no instructions targeting a specific
|
||||
// execution resource type
|
||||
Stats::Vector numCyclesWithNoInstrTypeIssued;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __EXEC_STAGE_HH__
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
FetchStage::FetchStage(const ComputeUnitParams &p, ComputeUnit &cu)
|
||||
: numVectorALUs(p.num_SIMDs), computeUnit(cu),
|
||||
_name(cu.name() + ".FetchStage")
|
||||
_name(cu.name() + ".FetchStage"), stats(&cu)
|
||||
{
|
||||
for (int j = 0; j < numVectorALUs; ++j) {
|
||||
FetchUnit newFetchUnit(p, cu);
|
||||
@@ -79,7 +79,7 @@ FetchStage::processFetchReturn(PacketPtr pkt)
|
||||
const unsigned num_instructions = pkt->req->getSize() /
|
||||
sizeof(TheGpuISA::RawMachInst);
|
||||
|
||||
instFetchInstReturned.sample(num_instructions);
|
||||
stats.instFetchInstReturned.sample(num_instructions);
|
||||
uint32_t simdId = wavefront->simdId;
|
||||
_fetchUnit[simdId].processFetchReturn(pkt);
|
||||
}
|
||||
@@ -90,13 +90,10 @@ FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
|
||||
_fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
|
||||
}
|
||||
|
||||
void
|
||||
FetchStage::regStats()
|
||||
FetchStage::FetchStageStats::FetchStageStats(Stats::Group *parent)
|
||||
: Stats::Group(parent, "FetchStage"),
|
||||
ADD_STAT(instFetchInstReturned, "For each instruction fetch request "
|
||||
"received record how many instructions you got from it")
|
||||
{
|
||||
instFetchInstReturned
|
||||
.init(1, 32, 1)
|
||||
.name(name() + ".inst_fetch_instr_returned")
|
||||
.desc("For each instruction fetch request recieved record how many "
|
||||
"instructions you got from it")
|
||||
;
|
||||
instFetchInstReturned.init(1, 32, 1);
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "gpu-compute/fetch_unit.hh"
|
||||
|
||||
// Instruction fetch stage.
|
||||
@@ -61,8 +62,6 @@ class FetchStage
|
||||
|
||||
// Stats related variables and methods
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
Stats::Distribution instFetchInstReturned;
|
||||
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
|
||||
|
||||
private:
|
||||
@@ -73,6 +72,14 @@ class FetchStage
|
||||
// instantiated per VALU/SIMD
|
||||
std::vector<FetchUnit> _fetchUnit;
|
||||
const std::string _name;
|
||||
|
||||
protected:
|
||||
struct FetchStageStats : public Stats::Group
|
||||
{
|
||||
FetchStageStats(Stats::Group *parent);
|
||||
|
||||
Stats::Distribution instFetchInstReturned;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __FETCH_STAGE_HH__
|
||||
|
||||
@@ -48,7 +48,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
|
||||
: computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
|
||||
gmQueueSize(p.global_mem_queue_size),
|
||||
maxWaveRequests(p.max_wave_requests), inflightStores(0),
|
||||
inflightLoads(0)
|
||||
inflightLoads(0), stats(&cu)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -293,12 +293,10 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
|
||||
mem_req->second.second = true;
|
||||
}
|
||||
|
||||
void
|
||||
GlobalMemPipeline::regStats()
|
||||
GlobalMemPipeline::
|
||||
GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
|
||||
: Stats::Group(parent, "GlobalMemPipeline"),
|
||||
ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
|
||||
"are delayed before updating the VRF")
|
||||
{
|
||||
loadVrfBankConflictCycles
|
||||
.name(name() + ".load_vrf_bank_conflict_cycles")
|
||||
.desc("total number of cycles GM data are delayed before updating "
|
||||
"the VRF")
|
||||
;
|
||||
}
|
||||
|
||||
@@ -37,6 +37,8 @@
|
||||
#include <queue>
|
||||
#include <string>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
#include "sim/stats.hh"
|
||||
@@ -95,11 +97,10 @@ class GlobalMemPipeline
|
||||
}
|
||||
|
||||
const std::string &name() const { return _name; }
|
||||
void regStats();
|
||||
void
|
||||
incLoadVRFBankConflictCycles(int num_cycles)
|
||||
{
|
||||
loadVrfBankConflictCycles += num_cycles;
|
||||
stats.loadVrfBankConflictCycles += num_cycles;
|
||||
}
|
||||
|
||||
bool coalescerReady(GPUDynInstPtr mp) const;
|
||||
@@ -113,10 +114,6 @@ class GlobalMemPipeline
|
||||
int gmQueueSize;
|
||||
int maxWaveRequests;
|
||||
|
||||
// number of cycles of delaying the update of a VGPR that is the
|
||||
// target of a load instruction (or the load component of an atomic)
|
||||
// The delay is due to VRF bank conflicts
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
// Counters to track the inflight loads and stores
|
||||
// so that we can provide the proper backpressure
|
||||
// on the number of inflight memory operations.
|
||||
@@ -144,6 +141,17 @@ class GlobalMemPipeline
|
||||
// Global Memory Request FIFO: all global memory requests
|
||||
// are issued to this FIFO from the memory pipelines
|
||||
std::queue<GPUDynInstPtr> gmIssuedRequests;
|
||||
|
||||
protected:
|
||||
struct GlobalMemPipelineStats : public Stats::Group
|
||||
{
|
||||
GlobalMemPipelineStats(Stats::Group *parent);
|
||||
|
||||
// number of cycles of delaying the update of a VGPR that is the
|
||||
// target of a load instruction (or the load component of an atomic)
|
||||
// The delay is due to VRF bank conflicts
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __GLOBAL_MEMORY_PIPELINE_HH__
|
||||
|
||||
@@ -930,16 +930,16 @@ GPUDynInst::updateStats()
|
||||
{
|
||||
if (_staticInst->isLocalMem()) {
|
||||
// access to LDS (shared) memory
|
||||
cu->dynamicLMemInstrCnt++;
|
||||
cu->stats.dynamicLMemInstrCnt++;
|
||||
} else if (_staticInst->isFlat()) {
|
||||
cu->dynamicFlatMemInstrCnt++;
|
||||
cu->stats.dynamicFlatMemInstrCnt++;
|
||||
} else {
|
||||
// access to global memory
|
||||
|
||||
// update PageDivergence histogram
|
||||
int number_pages_touched = cu->pagesTouched.size();
|
||||
assert(number_pages_touched);
|
||||
cu->pageDivergenceDist.sample(number_pages_touched);
|
||||
cu->stats.pageDivergenceDist.sample(number_pages_touched);
|
||||
|
||||
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
|
||||
|
||||
@@ -962,7 +962,7 @@ GPUDynInst::updateStats()
|
||||
// total number of memory instructions (dynamic)
|
||||
// Atomics are counted as a single memory instruction.
|
||||
// this is # memory instructions per wavefront, not per workitem
|
||||
cu->dynamicGMemInstrCnt++;
|
||||
cu->stats.dynamicGMemInstrCnt++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -63,12 +63,12 @@ class AtomicOpCAS : public TypedAtomicOpFunctor<T>
|
||||
void
|
||||
execute(T *b)
|
||||
{
|
||||
computeUnit->numCASOps++;
|
||||
computeUnit->stats.numCASOps++;
|
||||
|
||||
if (*b == c) {
|
||||
*b = s;
|
||||
} else {
|
||||
computeUnit->numFailedCASOps++;
|
||||
computeUnit->stats.numFailedCASOps++;
|
||||
}
|
||||
}
|
||||
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
|
||||
|
||||
@@ -67,7 +67,7 @@ namespace X86ISA
|
||||
: ClockedObject(p), configAddress(0), size(p.size),
|
||||
cleanupEvent([this]{ cleanup(); }, name(), false,
|
||||
Event::Maximum_Pri),
|
||||
exitEvent([this]{ exitCallback(); }, name())
|
||||
exitEvent([this]{ exitCallback(); }, name()), stats(this)
|
||||
{
|
||||
assoc = p.assoc;
|
||||
assert(assoc <= size);
|
||||
@@ -402,12 +402,12 @@ namespace X86ISA
|
||||
return tlb_hit;
|
||||
}
|
||||
|
||||
localNumTLBAccesses++;
|
||||
stats.localNumTLBAccesses++;
|
||||
|
||||
if (!entry) {
|
||||
localNumTLBMisses++;
|
||||
stats.localNumTLBMisses++;
|
||||
} else {
|
||||
localNumTLBHits++;
|
||||
stats.localNumTLBHits++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -499,10 +499,10 @@ namespace X86ISA
|
||||
DPRINTF(GPUTLB, "Paging enabled.\n");
|
||||
// The vaddr already has the segment base applied.
|
||||
TlbEntry *entry = lookup(vaddr);
|
||||
localNumTLBAccesses++;
|
||||
stats.localNumTLBAccesses++;
|
||||
|
||||
if (!entry) {
|
||||
localNumTLBMisses++;
|
||||
stats.localNumTLBMisses++;
|
||||
if (timing) {
|
||||
latency = missLatency1;
|
||||
}
|
||||
@@ -544,7 +544,7 @@ namespace X86ISA
|
||||
DPRINTF(GPUTLB, "Miss was serviced.\n");
|
||||
}
|
||||
} else {
|
||||
localNumTLBHits++;
|
||||
stats.localNumTLBHits++;
|
||||
|
||||
if (timing) {
|
||||
latency = hitLatency;
|
||||
@@ -659,89 +659,6 @@ namespace X86ISA
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
GpuTLB::regStats()
|
||||
{
|
||||
ClockedObject::regStats();
|
||||
|
||||
localNumTLBAccesses
|
||||
.name(name() + ".local_TLB_accesses")
|
||||
.desc("Number of TLB accesses")
|
||||
;
|
||||
|
||||
localNumTLBHits
|
||||
.name(name() + ".local_TLB_hits")
|
||||
.desc("Number of TLB hits")
|
||||
;
|
||||
|
||||
localNumTLBMisses
|
||||
.name(name() + ".local_TLB_misses")
|
||||
.desc("Number of TLB misses")
|
||||
;
|
||||
|
||||
localTLBMissRate
|
||||
.name(name() + ".local_TLB_miss_rate")
|
||||
.desc("TLB miss rate")
|
||||
;
|
||||
|
||||
accessCycles
|
||||
.name(name() + ".access_cycles")
|
||||
.desc("Cycles spent accessing this TLB level")
|
||||
;
|
||||
|
||||
pageTableCycles
|
||||
.name(name() + ".page_table_cycles")
|
||||
.desc("Cycles spent accessing the page table")
|
||||
;
|
||||
|
||||
localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
|
||||
|
||||
numUniquePages
|
||||
.name(name() + ".unique_pages")
|
||||
.desc("Number of unique pages touched")
|
||||
;
|
||||
|
||||
localCycles
|
||||
.name(name() + ".local_cycles")
|
||||
.desc("Number of cycles spent in queue for all incoming reqs")
|
||||
;
|
||||
|
||||
localLatency
|
||||
.name(name() + ".local_latency")
|
||||
.desc("Avg. latency over incoming coalesced reqs")
|
||||
;
|
||||
|
||||
localLatency = localCycles / localNumTLBAccesses;
|
||||
|
||||
globalNumTLBAccesses
|
||||
.name(name() + ".global_TLB_accesses")
|
||||
.desc("Number of TLB accesses")
|
||||
;
|
||||
|
||||
globalNumTLBHits
|
||||
.name(name() + ".global_TLB_hits")
|
||||
.desc("Number of TLB hits")
|
||||
;
|
||||
|
||||
globalNumTLBMisses
|
||||
.name(name() + ".global_TLB_misses")
|
||||
.desc("Number of TLB misses")
|
||||
;
|
||||
|
||||
globalTLBMissRate
|
||||
.name(name() + ".global_TLB_miss_rate")
|
||||
.desc("TLB miss rate")
|
||||
;
|
||||
|
||||
globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
|
||||
|
||||
avgReuseDistance
|
||||
.name(name() + ".avg_reuse_distance")
|
||||
.desc("avg. reuse distance over all pages (in ticks)")
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Do the TLB lookup for this coalesced request and schedule
|
||||
* another event <TLB access latency> cycles later.
|
||||
@@ -768,10 +685,10 @@ namespace X86ISA
|
||||
int req_cnt = sender_state->reqCnt.back();
|
||||
|
||||
if (update_stats) {
|
||||
accessCycles -= (curTick() * req_cnt);
|
||||
localCycles -= curTick();
|
||||
stats.accessCycles -= (curTick() * req_cnt);
|
||||
stats.localCycles -= curTick();
|
||||
updatePageFootprint(virt_page_addr);
|
||||
globalNumTLBAccesses += req_cnt;
|
||||
stats.globalNumTLBAccesses += req_cnt;
|
||||
}
|
||||
|
||||
tlbOutcome lookup_outcome = TLB_MISS;
|
||||
@@ -795,11 +712,11 @@ namespace X86ISA
|
||||
// the reqCnt has an entry per level, so its size tells us
|
||||
// which level we are in
|
||||
sender_state->hitLevel = sender_state->reqCnt.size();
|
||||
globalNumTLBHits += req_cnt;
|
||||
stats.globalNumTLBHits += req_cnt;
|
||||
}
|
||||
} else {
|
||||
if (update_stats)
|
||||
globalNumTLBMisses += req_cnt;
|
||||
stats.globalNumTLBMisses += req_cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -981,16 +898,16 @@ namespace X86ISA
|
||||
handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
|
||||
|
||||
if (update_stats) {
|
||||
accessCycles += (req_cnt * curTick());
|
||||
localCycles += curTick();
|
||||
stats.accessCycles += (req_cnt * curTick());
|
||||
stats.localCycles += curTick();
|
||||
}
|
||||
|
||||
} else if (outcome == TLB_MISS) {
|
||||
|
||||
DPRINTF(GPUTLB, "This is a TLB miss\n");
|
||||
if (update_stats) {
|
||||
accessCycles += (req_cnt*curTick());
|
||||
localCycles += curTick();
|
||||
stats.accessCycles += (req_cnt*curTick());
|
||||
stats.localCycles += curTick();
|
||||
}
|
||||
|
||||
if (hasMemSidePort) {
|
||||
@@ -998,8 +915,8 @@ namespace X86ISA
|
||||
// the reply back till when we propagate it to the coalescer
|
||||
// above.
|
||||
if (update_stats) {
|
||||
accessCycles += (req_cnt * 1);
|
||||
localCycles += 1;
|
||||
stats.accessCycles += (req_cnt * 1);
|
||||
stats.localCycles += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1022,7 +939,7 @@ namespace X86ISA
|
||||
"addr %#x\n", virtPageAddr);
|
||||
|
||||
if (update_stats)
|
||||
pageTableCycles -= (req_cnt*curTick());
|
||||
stats.pageTableCycles -= (req_cnt*curTick());
|
||||
|
||||
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
|
||||
assert(tlb_event);
|
||||
@@ -1032,7 +949,7 @@ namespace X86ISA
|
||||
}
|
||||
} else if (outcome == PAGE_WALK) {
|
||||
if (update_stats)
|
||||
pageTableCycles += (req_cnt*curTick());
|
||||
stats.pageTableCycles += (req_cnt*curTick());
|
||||
|
||||
// Need to access the page table and update the TLB
|
||||
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
|
||||
@@ -1222,17 +1139,17 @@ namespace X86ISA
|
||||
// functional mode means no coalescing
|
||||
// global metrics are the same as the local metrics
|
||||
if (update_stats) {
|
||||
tlb->globalNumTLBAccesses++;
|
||||
tlb->stats.globalNumTLBAccesses++;
|
||||
|
||||
if (success) {
|
||||
sender_state->hitLevel = sender_state->reqCnt.size();
|
||||
tlb->globalNumTLBHits++;
|
||||
tlb->stats.globalNumTLBHits++;
|
||||
}
|
||||
}
|
||||
|
||||
if (!success) {
|
||||
if (update_stats)
|
||||
tlb->globalNumTLBMisses++;
|
||||
tlb->stats.globalNumTLBMisses++;
|
||||
if (tlb->hasMemSidePort) {
|
||||
// there is a TLB below -> propagate down the TLB hierarchy
|
||||
tlb->memSidePort[0]->sendFunctional(pkt);
|
||||
@@ -1405,7 +1322,7 @@ namespace X86ISA
|
||||
bool first_page_access = ret.second;
|
||||
|
||||
if (first_page_access) {
|
||||
numUniquePages++;
|
||||
stats.numUniquePages++;
|
||||
} else {
|
||||
int accessed_before;
|
||||
accessed_before = curTick() - ret.first->second.lastTimeAccessed;
|
||||
@@ -1417,7 +1334,7 @@ namespace X86ISA
|
||||
|
||||
if (accessDistance) {
|
||||
ret.first->second.localTLBAccesses
|
||||
.push_back(localNumTLBAccesses.value());
|
||||
.push_back(stats.localNumTLBAccesses.value());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1506,11 +1423,36 @@ namespace X86ISA
|
||||
}
|
||||
|
||||
if (!TLBFootprint.empty()) {
|
||||
avgReuseDistance =
|
||||
stats.avgReuseDistance =
|
||||
sum_avg_reuse_distance_per_page / TLBFootprint.size();
|
||||
}
|
||||
|
||||
//clear the TLBFootprint map
|
||||
TLBFootprint.clear();
|
||||
}
|
||||
|
||||
GpuTLB::GpuTLBStats::GpuTLBStats(Stats::Group *parent)
|
||||
: Stats::Group(parent),
|
||||
ADD_STAT(localNumTLBAccesses, "Number of TLB accesses"),
|
||||
ADD_STAT(localNumTLBHits, "Number of TLB hits"),
|
||||
ADD_STAT(localNumTLBMisses, "Number of TLB misses"),
|
||||
ADD_STAT(localTLBMissRate, "TLB miss rate"),
|
||||
ADD_STAT(globalNumTLBAccesses, "Number of TLB accesses"),
|
||||
ADD_STAT(globalNumTLBHits, "Number of TLB hits"),
|
||||
ADD_STAT(globalNumTLBMisses, "Number of TLB misses"),
|
||||
ADD_STAT(globalTLBMissRate, "TLB miss rate"),
|
||||
ADD_STAT(accessCycles, "Cycles spent accessing this TLB level"),
|
||||
ADD_STAT(pageTableCycles, "Cycles spent accessing the page table"),
|
||||
ADD_STAT(numUniquePages, "Number of unique pages touched"),
|
||||
ADD_STAT(localCycles, "Number of cycles spent in queue for all "
|
||||
"incoming reqs"),
|
||||
ADD_STAT(localLatency, "Avg. latency over incoming coalesced reqs"),
|
||||
ADD_STAT(avgReuseDistance, "avg. reuse distance over all pages (in "
|
||||
"ticks)")
|
||||
{
|
||||
localLatency = localCycles / localNumTLBAccesses;
|
||||
|
||||
localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
|
||||
globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
|
||||
}
|
||||
} // namespace X86ISA
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include "base/callback.hh"
|
||||
#include "base/logging.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "mem/port.hh"
|
||||
#include "mem/request.hh"
|
||||
@@ -169,35 +170,6 @@ namespace X86ISA
|
||||
int missLatency1;
|
||||
int missLatency2;
|
||||
|
||||
// local_stats are as seen from the TLB
|
||||
// without taking into account coalescing
|
||||
Stats::Scalar localNumTLBAccesses;
|
||||
Stats::Scalar localNumTLBHits;
|
||||
Stats::Scalar localNumTLBMisses;
|
||||
Stats::Formula localTLBMissRate;
|
||||
|
||||
// global_stats are as seen from the
|
||||
// CU's perspective taking into account
|
||||
// all coalesced requests.
|
||||
Stats::Scalar globalNumTLBAccesses;
|
||||
Stats::Scalar globalNumTLBHits;
|
||||
Stats::Scalar globalNumTLBMisses;
|
||||
Stats::Formula globalTLBMissRate;
|
||||
|
||||
// from the CU perspective (global)
|
||||
Stats::Scalar accessCycles;
|
||||
// from the CU perspective (global)
|
||||
Stats::Scalar pageTableCycles;
|
||||
Stats::Scalar numUniquePages;
|
||||
// from the perspective of this TLB
|
||||
Stats::Scalar localCycles;
|
||||
// from the perspective of this TLB
|
||||
Stats::Formula localLatency;
|
||||
// I take the avg. per page and then
|
||||
// the avg. over all pages.
|
||||
Stats::Scalar avgReuseDistance;
|
||||
|
||||
void regStats() override;
|
||||
void updatePageFootprint(Addr virt_page_addr);
|
||||
void printAccessPattern();
|
||||
|
||||
@@ -426,6 +398,40 @@ namespace X86ISA
|
||||
void exitCallback();
|
||||
|
||||
EventFunctionWrapper exitEvent;
|
||||
|
||||
protected:
|
||||
struct GpuTLBStats : public Stats::Group
|
||||
{
|
||||
GpuTLBStats(Stats::Group *parent);
|
||||
|
||||
// local_stats are as seen from the TLB
|
||||
// without taking into account coalescing
|
||||
Stats::Scalar localNumTLBAccesses;
|
||||
Stats::Scalar localNumTLBHits;
|
||||
Stats::Scalar localNumTLBMisses;
|
||||
Stats::Formula localTLBMissRate;
|
||||
|
||||
// global_stats are as seen from the
|
||||
// CU's perspective taking into account
|
||||
// all coalesced requests.
|
||||
Stats::Scalar globalNumTLBAccesses;
|
||||
Stats::Scalar globalNumTLBHits;
|
||||
Stats::Scalar globalNumTLBMisses;
|
||||
Stats::Formula globalTLBMissRate;
|
||||
|
||||
// from the CU perspective (global)
|
||||
Stats::Scalar accessCycles;
|
||||
// from the CU perspective (global)
|
||||
Stats::Scalar pageTableCycles;
|
||||
Stats::Scalar numUniquePages;
|
||||
// from the perspective of this TLB
|
||||
Stats::Scalar localCycles;
|
||||
// from the perspective of this TLB
|
||||
Stats::Formula localLatency;
|
||||
// I take the avg. per page and then
|
||||
// the avg. over all pages.
|
||||
Stats::Scalar avgReuseDistance;
|
||||
} stats;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -189,10 +189,10 @@ LdsState::processPacket(PacketPtr packet)
|
||||
// the number of conflicts this packet will have when accessing the LDS
|
||||
unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
|
||||
// count the total number of physical LDS banks accessed
|
||||
parent->ldsBankAccesses += bankAccesses;
|
||||
parent->stats.ldsBankAccesses += bankAccesses;
|
||||
// count the LDS bank conflicts. A number set to 1 indicates one
|
||||
// access per bank maximum so there are no bank conflicts
|
||||
parent->ldsBankConflictDist.sample(bankConflicts-1);
|
||||
parent->stats.ldsBankConflictDist.sample(bankConflicts-1);
|
||||
|
||||
GPUDynInstPtr dynInst = getDynInstr(packet);
|
||||
// account for the LDS bank conflict overhead
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
|
||||
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
|
||||
lmQueueSize(p.local_mem_queue_size)
|
||||
lmQueueSize(p.local_mem_queue_size), stats(&cu)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -124,12 +124,11 @@ LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
|
||||
lmIssuedRequests.push(gpuDynInst);
|
||||
}
|
||||
|
||||
void
|
||||
LocalMemPipeline::regStats()
|
||||
|
||||
LocalMemPipeline::
|
||||
LocalMemPipelineStats::LocalMemPipelineStats(Stats::Group *parent)
|
||||
: Stats::Group(parent, "LocalMemPipeline"),
|
||||
ADD_STAT(loadVrfBankConflictCycles, "total number of cycles LDS data "
|
||||
"are delayed before updating the VRF")
|
||||
{
|
||||
loadVrfBankConflictCycles
|
||||
.name(name() + ".load_vrf_bank_conflict_cycles")
|
||||
.desc("total number of cycles LDS data are delayed before updating "
|
||||
"the VRF")
|
||||
;
|
||||
}
|
||||
|
||||
@@ -37,9 +37,10 @@
|
||||
#include <queue>
|
||||
#include <string>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
#include "sim/stats.hh"
|
||||
|
||||
/*
|
||||
* @file local_memory_pipeline.hh
|
||||
@@ -75,19 +76,18 @@ class LocalMemPipeline
|
||||
}
|
||||
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
void
|
||||
incLoadVRFBankConflictCycles(int num_cycles)
|
||||
{
|
||||
loadVrfBankConflictCycles += num_cycles;
|
||||
stats.loadVrfBankConflictCycles += num_cycles;
|
||||
}
|
||||
|
||||
private:
|
||||
ComputeUnit &computeUnit;
|
||||
const std::string _name;
|
||||
int lmQueueSize;
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
|
||||
// Local Memory Request Fifo: all shared memory requests
|
||||
// are issued to this FIFO from the memory pipelines
|
||||
std::queue<GPUDynInstPtr> lmIssuedRequests;
|
||||
@@ -95,6 +95,14 @@ class LocalMemPipeline
|
||||
// Local Memory Response Fifo: all responses of shared memory
|
||||
// requests are sent to this FIFO from LDS
|
||||
std::queue<GPUDynInstPtr> lmReturnedRequests;
|
||||
|
||||
protected:
|
||||
struct LocalMemPipelineStats : public Stats::Group
|
||||
{
|
||||
LocalMemPipelineStats(Stats::Group *parent);
|
||||
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __LOCAL_MEMORY_PIPELINE_HH__
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
#include "params/RegisterFile.hh"
|
||||
|
||||
RegisterFile::RegisterFile(const RegisterFileParams &p)
|
||||
: SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs)
|
||||
: SimObject(p), simdId(p.simd_id), _numRegs(p.num_regs), stats(this)
|
||||
{
|
||||
fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
|
||||
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
|
||||
@@ -192,26 +192,15 @@ RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
RegisterFile::regStats()
|
||||
RegisterFile::RegisterFileStats::RegisterFileStats(Stats::Group *parent)
|
||||
: Stats::Group(parent),
|
||||
ADD_STAT(registerReads,
|
||||
"Total number of DWORDs read from register file"),
|
||||
ADD_STAT(registerWrites,
|
||||
"Total number of DWORDS written to register file"),
|
||||
ADD_STAT(sramReads,
|
||||
"Total number of register file bank SRAM activations for reads"),
|
||||
ADD_STAT(sramWrites,
|
||||
"Total number of register file bank SRAM activations for writes")
|
||||
{
|
||||
registerReads
|
||||
.name(name() + ".register_reads")
|
||||
.desc("Total number of DWORDs read from register file")
|
||||
;
|
||||
|
||||
registerWrites
|
||||
.name(name() + ".register_writes")
|
||||
.desc("Total number of DWORDS written to register file")
|
||||
;
|
||||
|
||||
sramReads
|
||||
.name(name() + ".sram_reads")
|
||||
.desc("Total number of register file bank SRAM activations for reads")
|
||||
;
|
||||
|
||||
sramWrites
|
||||
.name(name() + ".sram_writes")
|
||||
.desc("Total number of register file bank SRAM activations for writes")
|
||||
;
|
||||
}
|
||||
|
||||
@@ -62,7 +62,6 @@ class RegisterFile : public SimObject
|
||||
virtual ~RegisterFile();
|
||||
virtual void setParent(ComputeUnit *_computeUnit);
|
||||
int numRegs() const { return _numRegs; }
|
||||
virtual void regStats() override;
|
||||
|
||||
// State functions
|
||||
|
||||
@@ -154,18 +153,23 @@ class RegisterFile : public SimObject
|
||||
|
||||
// number of registers in this register file
|
||||
int _numRegs;
|
||||
// Stats
|
||||
// Total number of register reads, incremented once per DWORD per thread
|
||||
Stats::Scalar registerReads;
|
||||
// Total number of register writes, incremented once per DWORD per thread
|
||||
Stats::Scalar registerWrites;
|
||||
|
||||
// Number of register file SRAM activations for reads.
|
||||
// The register file may be implemented with multiple SRAMs. This stat
|
||||
// tracks how many times the SRAMs are accessed for reads.
|
||||
Stats::Scalar sramReads;
|
||||
// Number of register file SRAM activations for writes
|
||||
Stats::Scalar sramWrites;
|
||||
struct RegisterFileStats : public Stats::Group
|
||||
{
|
||||
RegisterFileStats(Stats::Group *parent);
|
||||
|
||||
// Total number of register reads per DWORD per thread
|
||||
Stats::Scalar registerReads;
|
||||
// Total number of register writes per DWORD per thread
|
||||
Stats::Scalar registerWrites;
|
||||
|
||||
// Number of register file SRAM activations for reads.
|
||||
// The register file may be implemented with multiple SRAMs. This stat
|
||||
// tracks how many times the SRAMs are accessed for reads.
|
||||
Stats::Scalar sramReads;
|
||||
// Number of register file SRAM activations for writes
|
||||
Stats::Scalar sramWrites;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __REGISTER_FILE_HH__
|
||||
|
||||
@@ -129,9 +129,3 @@ RegisterManager::freeRegisters(Wavefront* w)
|
||||
{
|
||||
policy->freeRegisters(w);
|
||||
}
|
||||
|
||||
void
|
||||
RegisterManager::regStats()
|
||||
{
|
||||
policy->regStats();
|
||||
}
|
||||
|
||||
@@ -63,9 +63,6 @@ class RegisterManager : public SimObject
|
||||
void setParent(ComputeUnit *cu);
|
||||
void exec();
|
||||
|
||||
// Stats related variables and methods
|
||||
void regStats();
|
||||
|
||||
// lookup virtual to physical register translation
|
||||
int mapVgpr(Wavefront* w, int vgprIndex);
|
||||
int mapSgpr(Wavefront* w, int sgprIndex);
|
||||
|
||||
@@ -76,9 +76,6 @@ class RegisterManagerPolicy
|
||||
// free all remaining registers held by specified WF
|
||||
virtual void freeRegisters(Wavefront *w) = 0;
|
||||
|
||||
// stats
|
||||
virtual void regStats() = 0;
|
||||
|
||||
protected:
|
||||
ComputeUnit *cu;
|
||||
};
|
||||
|
||||
@@ -142,8 +142,3 @@ ScalarMemPipeline::exec()
|
||||
computeUnit.cu_id, mp->simdId, mp->wfSlotId);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScalarMemPipeline::regStats()
|
||||
{
|
||||
}
|
||||
|
||||
@@ -85,7 +85,6 @@ class ScalarMemPipeline
|
||||
}
|
||||
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
ComputeUnit &computeUnit;
|
||||
|
||||
@@ -66,11 +66,11 @@ ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
|
||||
|
||||
if (regBusy(pSgpr)) {
|
||||
if (ii->isDstOperand(i)) {
|
||||
w->numTimesBlockedDueWAXDependencies++;
|
||||
w->stats.numTimesBlockedDueWAXDependencies++;
|
||||
} else if (ii->isSrcOperand(i)) {
|
||||
DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
|
||||
w->wfDynId, ii->disassemble(), pSgpr);
|
||||
w->numTimesBlockedDueRAWDependencies++;
|
||||
w->stats.numTimesBlockedDueRAWDependencies++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -109,7 +109,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
|
||||
if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
|
||||
int DWORDs = ii->getOperandSize(i) <= 4 ? 1
|
||||
: ii->getOperandSize(i) / 4;
|
||||
registerReads += DWORDs;
|
||||
stats.registerReads += DWORDs;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
|
||||
enqRegFreeEvent(physReg, tickDelay);
|
||||
}
|
||||
|
||||
registerWrites += nRegs;
|
||||
stats.registerWrites += nRegs;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -152,7 +152,7 @@ ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
|
||||
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
|
||||
}
|
||||
|
||||
registerWrites += nRegs;
|
||||
stats.registerWrites += nRegs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
|
||||
_name(cu.name() + ".ScheduleStage"),
|
||||
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
|
||||
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
|
||||
locMemBusRdy(false), locMemIssueRdy(false)
|
||||
locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
|
||||
{
|
||||
for (int j = 0; j < cu.numExeUnits(); ++j) {
|
||||
scheduler.emplace_back(p);
|
||||
@@ -121,10 +121,10 @@ ScheduleStage::exec()
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
rdyListEmpty[j]++;
|
||||
stats.rdyListEmpty[j]++;
|
||||
continue;
|
||||
}
|
||||
rdyListNotEmpty[j]++;
|
||||
stats.rdyListNotEmpty[j]++;
|
||||
|
||||
// Pick a wave and attempt to add it to schList
|
||||
Wavefront *wf = scheduler[j].chooseWave();
|
||||
@@ -133,8 +133,8 @@ ScheduleStage::exec()
|
||||
if (!addToSchList(j, gpu_dyn_inst)) {
|
||||
// For waves not added to schList, increment count of cycles
|
||||
// this wave spends in SCH stage.
|
||||
wf->schCycles++;
|
||||
addToSchListStalls[j]++;
|
||||
wf->stats.schCycles++;
|
||||
stats.addToSchListStalls[j]++;
|
||||
} else {
|
||||
if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
|
||||
wf->incLGKMInstsIssued();
|
||||
@@ -160,10 +160,10 @@ ScheduleStage::exec()
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
rdyListEmpty[j]++;
|
||||
stats.rdyListEmpty[j]++;
|
||||
continue;
|
||||
}
|
||||
rdyListNotEmpty[j]++;
|
||||
stats.rdyListNotEmpty[j]++;
|
||||
|
||||
// Pick a wave and attempt to add it to schList
|
||||
Wavefront *wf = scheduler[j].chooseWave();
|
||||
@@ -172,8 +172,8 @@ ScheduleStage::exec()
|
||||
if (!addToSchList(j, gpu_dyn_inst)) {
|
||||
// For waves not added to schList, increment count of cycles
|
||||
// this wave spends in SCH stage.
|
||||
wf->schCycles++;
|
||||
addToSchListStalls[j]++;
|
||||
wf->stats.schCycles++;
|
||||
stats.addToSchListStalls[j]++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -241,17 +241,17 @@ ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
|
||||
computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
|
||||
return true;
|
||||
} else {
|
||||
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
|
||||
stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
|
||||
if (!accessSrfWr) {
|
||||
rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
|
||||
stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
|
||||
}
|
||||
if (!accessVrfWr) {
|
||||
rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
|
||||
stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
|
||||
}
|
||||
|
||||
// Increment stall counts for WF
|
||||
wf->schStalls++;
|
||||
wf->schRfAccessStalls++;
|
||||
wf->stats.schStalls++;
|
||||
wf->stats.schRfAccessStalls++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -329,19 +329,19 @@ ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
|
||||
return true;
|
||||
} else {
|
||||
// Number of stall cycles due to RF access denied
|
||||
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
|
||||
stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
|
||||
// Count number of denials due to each reason
|
||||
// Multiple items may contribute to the denied request
|
||||
if (!accessVrf) {
|
||||
rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
|
||||
stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
|
||||
}
|
||||
if (!accessSrf) {
|
||||
rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
|
||||
stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
|
||||
}
|
||||
|
||||
// Increment stall counts for WF
|
||||
wf->schStalls++;
|
||||
wf->schRfAccessStalls++;
|
||||
wf->stats.schStalls++;
|
||||
wf->stats.schRfAccessStalls++;
|
||||
DPRINTF(GPUSched, "schList[%d]: Could not add: "
|
||||
"SIMD[%d] WV[%d]: %d: %s\n",
|
||||
exeType, wf->simdId, wf->wfDynId,
|
||||
@@ -424,26 +424,26 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
// TODO: Scalar NOP does not require SALU in hardware,
|
||||
// and is executed out of IB directly.
|
||||
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
|
||||
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
return false;
|
||||
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
|
||||
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
|
||||
return false;
|
||||
}
|
||||
} else if (gpu_dyn_inst->isEndOfKernel()) {
|
||||
// EndPgm instruction
|
||||
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
|
||||
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
return false;
|
||||
}
|
||||
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
|
||||
|| gpu_dyn_inst->isALU()) {
|
||||
// Barrier, Branch, or ALU instruction
|
||||
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
|
||||
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
return false;
|
||||
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
|
||||
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
|
||||
return false;
|
||||
}
|
||||
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
|
||||
@@ -451,19 +451,19 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
bool rdy = true;
|
||||
if (!glbMemIssueRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
|
||||
}
|
||||
if (!glbMemBusRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
@@ -473,18 +473,18 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
bool rdy = true;
|
||||
if (!scalarMemIssueRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
|
||||
}
|
||||
if (!scalarMemBusRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.scalarMemoryPipe
|
||||
.isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
|
||||
+ wf->scalarWrGmReqsInPipe))
|
||||
{
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
@@ -494,16 +494,16 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
bool rdy = true;
|
||||
if (!locMemIssueRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
|
||||
}
|
||||
if (!locMemBusRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.localMemoryPipe.
|
||||
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
@@ -513,24 +513,24 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
bool rdy = true;
|
||||
if (!glbMemIssueRdy || !locMemIssueRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
|
||||
}
|
||||
if (!glbMemBusRdy || !locMemBusRdy) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.localMemoryPipe.
|
||||
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
|
||||
stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
@@ -540,7 +540,7 @@ ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
gpu_dyn_inst->disassemble());
|
||||
return false;
|
||||
}
|
||||
dispNrdyStalls[SCH_RDY]++;
|
||||
stats.dispNrdyStalls[SCH_RDY]++;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -584,10 +584,10 @@ ScheduleStage::fillDispatchList()
|
||||
} else {
|
||||
// Either another wave has been dispatched, or this wave
|
||||
// was not ready, so it is stalled this cycle
|
||||
schIter->first->wavefront()->schStalls++;
|
||||
schIter->first->wavefront()->stats.schStalls++;
|
||||
if (!dispRdy) {
|
||||
// not ready for dispatch, increment stall stat
|
||||
schIter->first->wavefront()->schResourceStalls++;
|
||||
schIter->first->wavefront()->stats.schResourceStalls++;
|
||||
}
|
||||
// Examine next wave for this resource
|
||||
schIter++;
|
||||
@@ -601,9 +601,9 @@ ScheduleStage::fillDispatchList()
|
||||
// Increment stall count if no wave sent to dispatchList for
|
||||
// current execution resource
|
||||
if (!dispatched) {
|
||||
schListToDispListStalls[j]++;
|
||||
stats.schListToDispListStalls[j]++;
|
||||
} else {
|
||||
schListToDispList[j]++;
|
||||
stats.schListToDispList[j]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -635,9 +635,9 @@ ScheduleStage::arbitrateVrfToLdsBus()
|
||||
reinsertToSchList(wf->localMem, toExecute
|
||||
.readyInst(wf->localMem));
|
||||
// Increment stall stats for LDS-VRF arbitration
|
||||
ldsBusArbStalls++;
|
||||
stats.ldsBusArbStalls++;
|
||||
toExecute.readyInst(wf->localMem)
|
||||
->wavefront()->schLdsArbStalls++;
|
||||
->wavefront()->stats.schLdsArbStalls++;
|
||||
}
|
||||
// With arbitration of LM pipe complete, transition the
|
||||
// LM pipe to SKIP state in the dispatchList to inform EX stage
|
||||
@@ -663,7 +663,7 @@ ScheduleStage::checkRfOperandReadComplete()
|
||||
|
||||
// Increment the number of cycles the wave spends in the
|
||||
// SCH stage, since this loop visits every wave in SCH.
|
||||
wf->schCycles++;
|
||||
wf->stats.schCycles++;
|
||||
|
||||
bool vrfRdy = true;
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
@@ -690,15 +690,15 @@ ScheduleStage::checkRfOperandReadComplete()
|
||||
p.second = RFBUSY;
|
||||
|
||||
// Increment stall stats
|
||||
wf->schStalls++;
|
||||
wf->schOpdNrdyStalls++;
|
||||
wf->stats.schStalls++;
|
||||
wf->stats.schOpdNrdyStalls++;
|
||||
|
||||
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
|
||||
stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
|
||||
if (!vrfRdy) {
|
||||
opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
|
||||
stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
|
||||
}
|
||||
if (!srfRdy) {
|
||||
opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
|
||||
stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -777,60 +777,40 @@ ScheduleStage::deleteFromSch(Wavefront *w)
|
||||
wavesInSch.erase(w->wfDynId);
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::regStats()
|
||||
ScheduleStage::ScheduleStageStats::ScheduleStageStats(Stats::Group *parent,
|
||||
int num_exec_units)
|
||||
: Stats::Group(parent, "ScheduleStage"),
|
||||
ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
|
||||
"execution resource"),
|
||||
ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
|
||||
"list per execution resource"),
|
||||
ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
|
||||
"schList per execution resource when ready list is not empty"),
|
||||
ADD_STAT(schListToDispList, "number of cycles a wave is added to "
|
||||
"dispatchList per execution resource"),
|
||||
ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
|
||||
" dispatchList per execution resource"),
|
||||
ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
|
||||
ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
|
||||
"conflicts"),
|
||||
ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
|
||||
"ready"),
|
||||
ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
|
||||
"ready")
|
||||
{
|
||||
rdyListNotEmpty
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".rdy_list_not_empty")
|
||||
.desc("number of cycles one or more wave on ready list per "
|
||||
"execution resource")
|
||||
;
|
||||
rdyListNotEmpty.init(num_exec_units);
|
||||
rdyListEmpty.init(num_exec_units);
|
||||
addToSchListStalls.init(num_exec_units);
|
||||
schListToDispList.init(num_exec_units);
|
||||
schListToDispListStalls.init(num_exec_units);
|
||||
opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
|
||||
dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
|
||||
rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
|
||||
|
||||
rdyListEmpty
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".rdy_list_empty")
|
||||
.desc("number of cycles no wave on ready list per "
|
||||
"execution resource")
|
||||
;
|
||||
|
||||
addToSchListStalls
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".sch_list_add_stalls")
|
||||
.desc("number of cycles a wave is not added to schList per "
|
||||
"execution resource when ready list is not empty")
|
||||
;
|
||||
|
||||
schListToDispList
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".sch_list_to_disp_list")
|
||||
.desc("number of cycles a wave is added to dispatchList per "
|
||||
"execution resource")
|
||||
;
|
||||
|
||||
schListToDispListStalls
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".sch_list_to_disp_list_stalls")
|
||||
.desc("number of cycles no wave is added to dispatchList per "
|
||||
"execution resource")
|
||||
;
|
||||
|
||||
// Operand Readiness Stall Cycles
|
||||
opdNrdyStalls
|
||||
.init(SCH_RF_OPD_NRDY_CONDITIONS)
|
||||
.name(name() + ".opd_nrdy_stalls")
|
||||
.desc("number of stalls in SCH due to operands not ready")
|
||||
;
|
||||
opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
|
||||
opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
|
||||
opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
|
||||
|
||||
// dispatchReady Stall Cycles
|
||||
dispNrdyStalls
|
||||
.init(SCH_NRDY_CONDITIONS)
|
||||
.name(name() + ".disp_nrdy_stalls")
|
||||
.desc("number of stalls in SCH due to resource not ready")
|
||||
;
|
||||
dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
|
||||
dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
|
||||
dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
|
||||
@@ -862,21 +842,9 @@ ScheduleStage::regStats()
|
||||
csprintf("FlatMemFIFO"));
|
||||
dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
|
||||
|
||||
// RF Access Stall Cycles
|
||||
rfAccessStalls
|
||||
.init(SCH_RF_ACCESS_NRDY_CONDITIONS)
|
||||
.name(name() + ".rf_access_stalls")
|
||||
.desc("number of stalls due to RF access denied")
|
||||
;
|
||||
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
|
||||
rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
|
||||
rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
|
||||
rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
|
||||
rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
|
||||
|
||||
// Stall cycles due to wave losing LDS bus arbitration
|
||||
ldsBusArbStalls
|
||||
.name(name() + ".lds_bus_arb_stalls")
|
||||
.desc("number of stalls due to VRF->LDS bus conflicts")
|
||||
;
|
||||
}
|
||||
|
||||
@@ -40,6 +40,8 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "gpu-compute/exec_stage.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "gpu-compute/scheduler.hh"
|
||||
@@ -105,8 +107,6 @@ class ScheduleStage
|
||||
SCH_RF_ACCESS_NRDY_CONDITIONS
|
||||
};
|
||||
|
||||
void regStats();
|
||||
|
||||
// Called by ExecStage to inform SCH of instruction execution
|
||||
void deleteFromSch(Wavefront *w);
|
||||
|
||||
@@ -126,48 +126,6 @@ class ScheduleStage
|
||||
// scheduler and a dispatch list
|
||||
std::vector<Scheduler> scheduler;
|
||||
|
||||
// Stats
|
||||
|
||||
// Number of cycles with empty (or not empty) readyList, per execution
|
||||
// resource, when the CU is active (not sleeping)
|
||||
Stats::Vector rdyListEmpty;
|
||||
Stats::Vector rdyListNotEmpty;
|
||||
|
||||
// Number of cycles, per execution resource, when at least one wave
|
||||
// was on the readyList and picked by scheduler, but was unable to be
|
||||
// added to the schList, when the CU is active (not sleeping)
|
||||
Stats::Vector addToSchListStalls;
|
||||
|
||||
// Number of cycles, per execution resource, when a wave is selected
|
||||
// as candidate for dispatchList from schList
|
||||
// Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
|
||||
Stats::Vector schListToDispList;
|
||||
|
||||
// Per execution resource stat, incremented once per cycle if no wave
|
||||
// was selected as candidate for dispatch and moved to dispatchList
|
||||
Stats::Vector schListToDispListStalls;
|
||||
|
||||
// Number of times a wave is selected by the scheduler but cannot
|
||||
// be added to the schList due to register files not being able to
|
||||
// support reads or writes of operands. RF_ACCESS_NRDY condition is always
|
||||
// incremented if at least one read/write not supported, other
|
||||
// conditions are incremented independently from each other.
|
||||
Stats::Vector rfAccessStalls;
|
||||
|
||||
// Number of times a wave is executing FLAT instruction and
|
||||
// forces another wave occupying its required local memory resource
|
||||
// to be deselected for execution, and placed back on schList
|
||||
Stats::Scalar ldsBusArbStalls;
|
||||
|
||||
// Count of times VRF and/or SRF blocks waves on schList from
|
||||
// performing RFBUSY->RFREADY transition
|
||||
Stats::Vector opdNrdyStalls;
|
||||
|
||||
// Count of times resource required for dispatch is not ready and
|
||||
// blocks wave in RFREADY state on schList from potentially moving
|
||||
// to dispatchList
|
||||
Stats::Vector dispNrdyStalls;
|
||||
|
||||
const std::string _name;
|
||||
|
||||
// called by exec() to add a wave to schList if the RFs can support it
|
||||
@@ -221,6 +179,52 @@ class ScheduleStage
|
||||
// the VRF/SRF availability or limits imposed by parameters (to be added)
|
||||
// of the SCH stage or CU.
|
||||
std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
|
||||
|
||||
protected:
|
||||
struct ScheduleStageStats : public Stats::Group
|
||||
{
|
||||
ScheduleStageStats(Stats::Group *parent, int num_exec_units);
|
||||
|
||||
// Number of cycles with empty (or not empty) readyList, per execution
|
||||
// resource, when the CU is active (not sleeping)
|
||||
Stats::Vector rdyListEmpty;
|
||||
Stats::Vector rdyListNotEmpty;
|
||||
|
||||
// Number of cycles, per execution resource, when at least one wave
|
||||
// was on the readyList and picked by scheduler, but was unable to be
|
||||
// added to the schList, when the CU is active (not sleeping)
|
||||
Stats::Vector addToSchListStalls;
|
||||
|
||||
// Number of cycles, per execution resource, when a wave is selected
|
||||
// as candidate for dispatchList from schList
|
||||
// Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
|
||||
Stats::Vector schListToDispList;
|
||||
|
||||
// Per execution resource stat, incremented once per cycle if no wave
|
||||
// was selected as candidate for dispatch and moved to dispatchList
|
||||
Stats::Vector schListToDispListStalls;
|
||||
|
||||
// Number of times a wave is selected by the scheduler but cannot
|
||||
// be added to the schList due to register files not being able to
|
||||
// support reads or writes of operands. RF_ACCESS_NRDY condition is
|
||||
// always incremented if at least one read/write not supported, other
|
||||
// conditions are incremented independently from each other.
|
||||
Stats::Vector rfAccessStalls;
|
||||
|
||||
// Number of times a wave is executing FLAT instruction and
|
||||
// forces another wave occupying its required local memory resource
|
||||
// to be deselected for execution, and placed back on schList
|
||||
Stats::Scalar ldsBusArbStalls;
|
||||
|
||||
// Count of times VRF and/or SRF blocks waves on schList from
|
||||
// performing RFBUSY->RFREADY transition
|
||||
Stats::Vector opdNrdyStalls;
|
||||
|
||||
// Count of times resource required for dispatch is not ready and
|
||||
// blocks wave in RFREADY state on schList from potentially moving
|
||||
// to dispatchList
|
||||
Stats::Vector dispNrdyStalls;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __SCHEDULE_STAGE_HH__
|
||||
|
||||
@@ -49,7 +49,7 @@ ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams &p,
|
||||
ScoreboardCheckToSchedule
|
||||
&to_schedule)
|
||||
: computeUnit(cu), toSchedule(to_schedule),
|
||||
_name(cu.name() + ".ScoreboardCheckStage")
|
||||
_name(cu.name() + ".ScoreboardCheckStage"), stats(&cu)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -62,7 +62,7 @@ ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
|
||||
{
|
||||
panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
|
||||
"Instruction ready status %d is illegal!!!", rdyStatus);
|
||||
stallCycles[rdyStatus]++;
|
||||
stats.stallCycles[rdyStatus]++;
|
||||
}
|
||||
|
||||
// Return true if this wavefront is ready
|
||||
@@ -266,14 +266,13 @@ ScoreboardCheckStage::exec()
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::regStats()
|
||||
ScoreboardCheckStage::
|
||||
ScoreboardCheckStageStats::ScoreboardCheckStageStats(Stats::Group *parent)
|
||||
: Stats::Group(parent, "ScoreboardCheckStage"),
|
||||
ADD_STAT(stallCycles, "number of cycles wave stalled in SCB")
|
||||
{
|
||||
stallCycles
|
||||
.init(NRDY_CONDITIONS)
|
||||
.name(name() + ".stall_cycles")
|
||||
.desc("number of cycles wave stalled in SCB")
|
||||
;
|
||||
stallCycles.init(NRDY_CONDITIONS);
|
||||
|
||||
stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
|
||||
stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
|
||||
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
|
||||
|
||||
@@ -40,7 +40,8 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "sim/stats.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
class ScoreboardCheckToSchedule;
|
||||
@@ -78,7 +79,6 @@ class ScoreboardCheckStage
|
||||
|
||||
// Stats related variables and methods
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
void collectStatistics(nonrdytype_e rdyStatus);
|
||||
@@ -94,10 +94,15 @@ class ScoreboardCheckStage
|
||||
*/
|
||||
ScoreboardCheckToSchedule &toSchedule;
|
||||
|
||||
// Stats
|
||||
Stats::Vector stallCycles;
|
||||
|
||||
const std::string _name;
|
||||
|
||||
protected:
|
||||
struct ScoreboardCheckStageStats : public Stats::Group
|
||||
{
|
||||
ScoreboardCheckStageStats(Stats::Group *parent);
|
||||
|
||||
Stats::Vector stallCycles;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __SCOREBOARD_CHECK_STAGE_HH__
|
||||
|
||||
@@ -65,7 +65,8 @@ Shader::Shader(const Params &p) : ClockedObject(p),
|
||||
globalMemSize(p.globalmem),
|
||||
nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
|
||||
_dispatcher(*p.dispatcher),
|
||||
max_valu_insts(p.max_valu_insts), total_valu_insts(0)
|
||||
max_valu_insts(p.max_valu_insts), total_valu_insts(0),
|
||||
stats(this, p.CUs[0]->wfSize())
|
||||
{
|
||||
gpuCmdProc.setShader(this);
|
||||
_dispatcher.setShader(this);
|
||||
@@ -278,86 +279,6 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
|
||||
return scheduledSomething;
|
||||
}
|
||||
|
||||
void
|
||||
Shader::regStats()
|
||||
{
|
||||
ClockedObject::regStats();
|
||||
|
||||
shaderActiveTicks
|
||||
.name(name() + ".shader_active_ticks")
|
||||
.desc("Total ticks that any CU attached to this shader is active")
|
||||
;
|
||||
allLatencyDist
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".allLatencyDist")
|
||||
.desc("delay distribution for all")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
loadLatencyDist
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".loadLatencyDist")
|
||||
.desc("delay distribution for loads")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
storeLatencyDist
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".storeLatencyDist")
|
||||
.desc("delay distribution for stores")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
vectorInstSrcOperand
|
||||
.init(4)
|
||||
.name(name() + ".vec_inst_src_operand")
|
||||
.desc("vector instruction source operand distribution");
|
||||
|
||||
vectorInstDstOperand
|
||||
.init(4)
|
||||
.name(name() + ".vec_inst_dst_operand")
|
||||
.desc("vector instruction destination operand distribution");
|
||||
|
||||
initToCoalesceLatency
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".initToCoalesceLatency")
|
||||
.desc("Ticks from vmem inst initiateAcc to coalescer issue")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
rubyNetworkLatency
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".rubyNetworkLatency")
|
||||
.desc("Ticks from coalescer issue to coalescer hit callback")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
gmEnqueueLatency
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".gmEnqueueLatency")
|
||||
.desc("Ticks from coalescer hit callback to GM pipe enqueue")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
gmToCompleteLatency
|
||||
.init(0, 1600000, 10000)
|
||||
.name(name() + ".gmToCompleteLatency")
|
||||
.desc("Ticks queued in GM pipes ordered response buffer")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
coalsrLineAddresses
|
||||
.init(0, 20, 1)
|
||||
.name(name() + ".coalsrLineAddresses")
|
||||
.desc("Number of cache lines for coalesced request")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
|
||||
int wfSize = cuList[0]->wfSize();
|
||||
cacheBlockRoundTrip = new Stats::Distribution[wfSize];
|
||||
for (int idx = 0; idx < wfSize; ++idx) {
|
||||
std::stringstream namestr;
|
||||
ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
|
||||
cacheBlockRoundTrip[idx]
|
||||
.init(0, 1600000, 10000)
|
||||
.name(namestr.str())
|
||||
.desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
|
||||
.flags(Stats::pdf | Stats::oneline);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
|
||||
bool suppress_func_errors, int cu_id)
|
||||
@@ -528,8 +449,8 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
|
||||
void
|
||||
Shader::sampleStore(const Tick accessTime)
|
||||
{
|
||||
storeLatencyDist.sample(accessTime);
|
||||
allLatencyDist.sample(accessTime);
|
||||
stats.storeLatencyDist.sample(accessTime);
|
||||
stats.allLatencyDist.sample(accessTime);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -538,8 +459,8 @@ Shader::sampleStore(const Tick accessTime)
|
||||
void
|
||||
Shader::sampleLoad(const Tick accessTime)
|
||||
{
|
||||
loadLatencyDist.sample(accessTime);
|
||||
allLatencyDist.sample(accessTime);
|
||||
stats.loadLatencyDist.sample(accessTime);
|
||||
stats.allLatencyDist.sample(accessTime);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -556,16 +477,16 @@ Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
|
||||
Tick t4 = roundTripTime[3];
|
||||
Tick t5 = roundTripTime[4];
|
||||
|
||||
initToCoalesceLatency.sample(t2-t1);
|
||||
rubyNetworkLatency.sample(t3-t2);
|
||||
gmEnqueueLatency.sample(t4-t3);
|
||||
gmToCompleteLatency.sample(t5-t4);
|
||||
stats.initToCoalesceLatency.sample(t2-t1);
|
||||
stats.rubyNetworkLatency.sample(t3-t2);
|
||||
stats.gmEnqueueLatency.sample(t4-t3);
|
||||
stats.gmToCompleteLatency.sample(t5-t4);
|
||||
}
|
||||
|
||||
void
|
||||
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
|
||||
{
|
||||
coalsrLineAddresses.sample(lineMap.size());
|
||||
stats.coalsrLineAddresses.sample(lineMap.size());
|
||||
std::vector<Tick> netTimes;
|
||||
|
||||
// For each cache block address generated by a vmem inst, calculate
|
||||
@@ -586,7 +507,7 @@ Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
|
||||
// Nth distribution.
|
||||
int idx = 0;
|
||||
for (auto& time : netTimes) {
|
||||
cacheBlockRoundTrip[idx].sample(time);
|
||||
stats.cacheBlockRoundTrip[idx].sample(time);
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
@@ -598,5 +519,75 @@ Shader::notifyCuSleep() {
|
||||
"Invalid activeCu size\n");
|
||||
_activeCus--;
|
||||
if (!_activeCus)
|
||||
shaderActiveTicks += curTick() - _lastInactiveTick;
|
||||
stats.shaderActiveTicks += curTick() - _lastInactiveTick;
|
||||
}
|
||||
|
||||
/**
 * Build the shader statistics group and configure every stat in it.
 *
 * @param parent  group the new stats are attached to; it is also cast to
 *                Shader* below to obtain the name prefix for the
 *                per-cache-block distributions.
 * @param wf_size number of cacheBlockRoundTrip distributions to create.
 */
Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
    : Stats::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
      ADD_STAT(initToCoalesceLatency,
               "Ticks from vmem inst initiateAcc to coalescer issue"),
      ADD_STAT(rubyNetworkLatency,
               "Ticks from coalescer issue to coalescer hit callback"),
      ADD_STAT(gmEnqueueLatency,
               "Ticks from coalescer hit callback to GM pipe enqueue"),
      ADD_STAT(gmToCompleteLatency,
               "Ticks queued in GM pipes ordered response buffer"),
      ADD_STAT(coalsrLineAddresses,
               "Number of cache lines for coalesced request"),
      ADD_STAT(shaderActiveTicks,
               "Total ticks that any CU attached to this shader is active"),
      ADD_STAT(vectorInstSrcOperand,
               "vector instruction source operand distribution"),
      ADD_STAT(vectorInstDstOperand,
               "vector instruction destination operand distribution")
{
    // Every tick-based latency distribution uses the same range, bucket
    // size and output flags, so configure them from a single table.
    Stats::Distribution *const tick_dists[] = {
        &allLatencyDist,
        &loadLatencyDist,
        &storeLatencyDist,
        &initToCoalesceLatency,
        &rubyNetworkLatency,
        &gmEnqueueLatency,
        &gmToCompleteLatency,
    };
    for (Stats::Distribution *dist : tick_dists) {
        dist->init(0, 1600000, 10000);
        dist->flags(Stats::pdf | Stats::oneline);
    }

    coalsrLineAddresses.init(0, 20, 1);
    coalsrLineAddresses.flags(Stats::pdf | Stats::oneline);

    // Four entries each; the inc* helpers index these by operand count.
    vectorInstSrcOperand.init(4);
    vectorInstDstOperand.init(4);

    // One round-trip distribution per index in [0, wf_size). These are
    // named explicitly instead of going through ADD_STAT.
    // NOTE(review): nothing visible here releases this array -- confirm
    // the intended lifetime.
    cacheBlockRoundTrip = new Stats::Distribution[wf_size];
    Shader *const shader = static_cast<Shader*>(parent);
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d", shader->name(), idx);

        Stats::Distribution &dist = cacheBlockRoundTrip[idx];
        dist.init(0, 1600000, 10000);
        dist.name(namestr.str());
        dist.desc("Coalsr-to-coalsr time for the Nth cache block in an inst");
        dist.flags(Stats::pdf | Stats::oneline);
    }
}
|
||||
|
||||
@@ -40,6 +40,8 @@
|
||||
#include <string>
|
||||
|
||||
#include "arch/isa.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "base/types.hh"
|
||||
#include "cpu/simple/atomic.hh"
|
||||
#include "cpu/simple/timing.hh"
|
||||
@@ -98,26 +100,6 @@ class Shader : public ClockedObject
|
||||
// Last tick that all CUs attached to this shader were inactive
|
||||
Tick _lastInactiveTick;
|
||||
|
||||
// some stats for measuring latency
|
||||
Stats::Distribution allLatencyDist;
|
||||
Stats::Distribution loadLatencyDist;
|
||||
Stats::Distribution storeLatencyDist;
|
||||
|
||||
// average ticks from vmem inst initiateAcc to coalescer issue,
|
||||
// average ticks from coalescer issue to coalescer hit callback,
|
||||
// average ticks from coalescer hit callback to GM pipe enqueue,
|
||||
// and average ticks spent in GM pipe's ordered resp buffer.
|
||||
Stats::Distribution initToCoalesceLatency;
|
||||
Stats::Distribution rubyNetworkLatency;
|
||||
Stats::Distribution gmEnqueueLatency;
|
||||
Stats::Distribution gmToCompleteLatency;
|
||||
|
||||
// average number of cache blocks requested by vmem inst, and
|
||||
// average ticks for cache blocks to main memory for the Nth
|
||||
// cache block generated by a vmem inst.
|
||||
Stats::Distribution coalsrLineAddresses;
|
||||
Stats::Distribution *cacheBlockRoundTrip;
|
||||
|
||||
public:
|
||||
typedef ShaderParams Params;
|
||||
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
|
||||
@@ -249,14 +231,6 @@ class Shader : public ClockedObject
|
||||
GPUCommandProcessor &gpuCmdProc;
|
||||
GPUDispatcher &_dispatcher;
|
||||
|
||||
/**
|
||||
* Statistics
|
||||
*/
|
||||
Stats::Scalar shaderActiveTicks;
|
||||
Stats::Vector vectorInstSrcOperand;
|
||||
Stats::Vector vectorInstDstOperand;
|
||||
void regStats();
|
||||
|
||||
int64_t max_valu_insts;
|
||||
int64_t total_valu_insts;
|
||||
|
||||
@@ -301,6 +275,52 @@ class Shader : public ClockedObject
|
||||
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
|
||||
void updateContext(int cid);
|
||||
void notifyCuSleep();
|
||||
|
||||
void
|
||||
incVectorInstSrcOperand(int num_operands)
|
||||
{
|
||||
stats.vectorInstSrcOperand[num_operands]++;
|
||||
}
|
||||
|
||||
void
|
||||
incVectorInstDstOperand(int num_operands)
|
||||
{
|
||||
stats.vectorInstDstOperand[num_operands]++;
|
||||
}
|
||||
|
||||
protected:
|
||||
struct ShaderStats : public Stats::Group
|
||||
{
|
||||
ShaderStats(Stats::Group *parent, int wf_size);
|
||||
|
||||
// some stats for measuring latency
|
||||
Stats::Distribution allLatencyDist;
|
||||
Stats::Distribution loadLatencyDist;
|
||||
Stats::Distribution storeLatencyDist;
|
||||
|
||||
// average ticks from vmem inst initiateAcc to coalescer issue,
|
||||
Stats::Distribution initToCoalesceLatency;
|
||||
|
||||
// average ticks from coalescer issue to coalescer hit callback,
|
||||
Stats::Distribution rubyNetworkLatency;
|
||||
|
||||
// average ticks from coalescer hit callback to GM pipe enqueue,
|
||||
Stats::Distribution gmEnqueueLatency;
|
||||
|
||||
// average ticks spent in GM pipe's ordered resp buffer.
|
||||
Stats::Distribution gmToCompleteLatency;
|
||||
|
||||
// average number of cache blocks requested by vmem inst
|
||||
Stats::Distribution coalsrLineAddresses;
|
||||
|
||||
// average ticks for cache blocks to main memory for the Nth
|
||||
// cache block generated by a vmem inst.
|
||||
Stats::Distribution *cacheBlockRoundTrip;
|
||||
|
||||
Stats::Scalar shaderActiveTicks;
|
||||
Stats::Vector vectorInstSrcOperand;
|
||||
Stats::Vector vectorInstDstOperand;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __SHADER_HH__
|
||||
|
||||
@@ -180,8 +180,3 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
|
||||
w->reservedScalarRegs = 0;
|
||||
w->startSgprIndex = 0;
|
||||
}
|
||||
|
||||
void
|
||||
StaticRegisterManagerPolicy::regStats()
|
||||
{
|
||||
}
|
||||
|
||||
@@ -58,8 +58,6 @@ class StaticRegisterManagerPolicy : public RegisterManagerPolicy
|
||||
int scalarDemand) override;
|
||||
|
||||
void freeRegisters(Wavefront *w) override;
|
||||
|
||||
void regStats() override;
|
||||
};
|
||||
|
||||
#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
|
||||
|
||||
@@ -50,7 +50,8 @@ TLBCoalescer::TLBCoalescer(const Params &p)
|
||||
false, Event::CPU_Tick_Pri),
|
||||
cleanupEvent([this]{ processCleanupEvent(); },
|
||||
"Cleanup issuedTranslationsTable hashmap",
|
||||
false, Event::Maximum_Pri)
|
||||
false, Event::Maximum_Pri),
|
||||
stats(this)
|
||||
{
|
||||
// create the response ports based on the number of connected ports
|
||||
for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
|
||||
@@ -256,11 +257,11 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
|
||||
sender_state->reqCnt.push_back(req_cnt);
|
||||
|
||||
// update statistics
|
||||
coalescer->uncoalescedAccesses++;
|
||||
coalescer->stats.uncoalescedAccesses++;
|
||||
req_cnt = sender_state->reqCnt.back();
|
||||
DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
|
||||
coalescer->queuingCycles -= (curTick() * req_cnt);
|
||||
coalescer->localqueuingCycles -= curTick();
|
||||
coalescer->stats.queuingCycles -= (curTick() * req_cnt);
|
||||
coalescer->stats.localqueuingCycles -= curTick();
|
||||
}
|
||||
|
||||
// FIXME if you want to coalesce not based on the issueTime
|
||||
@@ -302,7 +303,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
|
||||
// and make necessary allocations.
|
||||
if (!coalescedReq_cnt || !didCoalesce) {
|
||||
if (update_stats)
|
||||
coalescer->coalescedAccesses++;
|
||||
coalescer->stats.coalescedAccesses++;
|
||||
|
||||
std::vector<PacketPtr> new_array;
|
||||
new_array.push_back(pkt);
|
||||
@@ -339,7 +340,7 @@ TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
|
||||
bool update_stats = !sender_state->prefetch;
|
||||
|
||||
if (update_stats)
|
||||
coalescer->uncoalescedAccesses++;
|
||||
coalescer->stats.uncoalescedAccesses++;
|
||||
|
||||
// If there is a pending timing request for this virtual address
|
||||
// print a warning message. This is a temporary caveat of
|
||||
@@ -467,7 +468,7 @@ TLBCoalescer::processProbeTLBEvent()
|
||||
// by the one we just sent counting all the way from
|
||||
// the top of TLB hierarchy (i.e., from the CU)
|
||||
int req_cnt = tmp_sender_state->reqCnt.back();
|
||||
queuingCycles += (curTick() * req_cnt);
|
||||
stats.queuingCycles += (curTick() * req_cnt);
|
||||
|
||||
DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
|
||||
name(), req_cnt);
|
||||
@@ -475,7 +476,7 @@ TLBCoalescer::processProbeTLBEvent()
|
||||
// pkt_cnt is number of packets we coalesced into the one
|
||||
// we just sent but only at this coalescer level
|
||||
int pkt_cnt = iter->second[vector_index].size();
|
||||
localqueuingCycles += (curTick() * pkt_cnt);
|
||||
stats.localqueuingCycles += (curTick() * pkt_cnt);
|
||||
}
|
||||
|
||||
DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
|
||||
@@ -520,35 +521,14 @@ TLBCoalescer::processCleanupEvent()
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
TLBCoalescer::regStats()
|
||||
TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(Stats::Group *parent)
|
||||
: Stats::Group(parent),
|
||||
ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
|
||||
ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
|
||||
ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
|
||||
ADD_STAT(localqueuingCycles,
|
||||
"Number of cycles spent in queue for all incoming reqs"),
|
||||
ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
|
||||
{
|
||||
ClockedObject::regStats();
|
||||
|
||||
uncoalescedAccesses
|
||||
.name(name() + ".uncoalesced_accesses")
|
||||
.desc("Number of uncoalesced TLB accesses")
|
||||
;
|
||||
|
||||
coalescedAccesses
|
||||
.name(name() + ".coalesced_accesses")
|
||||
.desc("Number of coalesced TLB accesses")
|
||||
;
|
||||
|
||||
queuingCycles
|
||||
.name(name() + ".queuing_cycles")
|
||||
.desc("Number of cycles spent in queue")
|
||||
;
|
||||
|
||||
localqueuingCycles
|
||||
.name(name() + ".local_queuing_cycles")
|
||||
.desc("Number of cycles spent in queue for all incoming reqs")
|
||||
;
|
||||
|
||||
localLatency
|
||||
.name(name() + ".local_latency")
|
||||
.desc("Avg. latency over all incoming pkts")
|
||||
;
|
||||
|
||||
localLatency = localqueuingCycles / uncoalescedAccesses;
|
||||
}
|
||||
|
||||
@@ -115,26 +115,8 @@ class TLBCoalescer : public ClockedObject
|
||||
|
||||
CoalescingTable issuedTranslationsTable;
|
||||
|
||||
// number of packets the coalescer receives
|
||||
Stats::Scalar uncoalescedAccesses;
|
||||
// number of packets the coalescer sends to the TLB
|
||||
Stats::Scalar coalescedAccesses;
|
||||
|
||||
// Number of cycles the coalesced requests spend waiting in
|
||||
// coalescerFIFO. For each packet the coalescer receives we take into
|
||||
// account the number of all uncoalesced requests this pkt "represents"
|
||||
Stats::Scalar queuingCycles;
|
||||
|
||||
// On average how much time a request from the
|
||||
// uncoalescedAccesses that reaches the TLB
|
||||
// spends waiting?
|
||||
Stats::Scalar localqueuingCycles;
|
||||
// localqueuingCycles/uncoalescedAccesses
|
||||
Stats::Formula localLatency;
|
||||
|
||||
bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
|
||||
void updatePhysAddresses(PacketPtr pkt);
|
||||
void regStats() override;
|
||||
|
||||
class CpuSidePort : public ResponsePort
|
||||
{
|
||||
@@ -211,6 +193,29 @@ class TLBCoalescer : public ClockedObject
|
||||
// this FIFO queue keeps track of the virt. page
|
||||
// addresses that are pending cleanup
|
||||
std::queue<Addr> cleanupQueue;
|
||||
|
||||
protected:
|
||||
struct TLBCoalescerStats : public Stats::Group
|
||||
{
|
||||
TLBCoalescerStats(Stats::Group *parent);
|
||||
|
||||
// number of packets the coalescer receives
|
||||
Stats::Scalar uncoalescedAccesses;
|
||||
// number packets the coalescer send to the TLB
|
||||
Stats::Scalar coalescedAccesses;
|
||||
|
||||
// Number of cycles the coalesced requests spend waiting in
|
||||
// coalescerFIFO. For each packet the coalescer receives we take into
|
||||
// account the number of all uncoalesced requests this pkt "represents"
|
||||
Stats::Scalar queuingCycles;
|
||||
|
||||
// On average how much time a request from the
|
||||
// uncoalescedAccesses that reaches the TLB
|
||||
// spends waiting?
|
||||
Stats::Scalar localqueuingCycles;
|
||||
// localqueuingCycles/uncoalescedAccesses
|
||||
Stats::Formula localLatency;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __TLB_COALESCER_HH__
|
||||
|
||||
@@ -69,11 +69,11 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
|
||||
->mapVgpr(w, vgprIdx + j);
|
||||
if (regBusy(pVgpr)) {
|
||||
if (ii->isDstOperand(i)) {
|
||||
w->numTimesBlockedDueWAXDependencies++;
|
||||
w->stats.numTimesBlockedDueWAXDependencies++;
|
||||
} else if (ii->isSrcOperand(i)) {
|
||||
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
|
||||
w->wfDynId, ii->disassemble(), pVgpr);
|
||||
w->numTimesBlockedDueRAWDependencies++;
|
||||
w->stats.numTimesBlockedDueRAWDependencies++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -125,13 +125,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
|
||||
{
|
||||
// increment count of number of DWORDs read from VRF
|
||||
int DWORDs = ii->numSrcVecDWORDs();
|
||||
registerReads += (DWORDs * w->execMask().count());
|
||||
stats.registerReads += (DWORDs * w->execMask().count());
|
||||
|
||||
uint64_t mask = w->execMask().to_ullong();
|
||||
int srams = w->execMask().size() / 4;
|
||||
for (int i = 0; i < srams; i++) {
|
||||
if (mask & 0xF) {
|
||||
sramReads += DWORDs;
|
||||
stats.sramReads += DWORDs;
|
||||
}
|
||||
mask = mask >> 4;
|
||||
}
|
||||
@@ -163,13 +163,13 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
|
||||
|
||||
// increment count of number of DWORDs written to VRF
|
||||
DWORDs = ii->numDstVecDWORDs();
|
||||
registerWrites += (DWORDs * w->execMask().count());
|
||||
stats.registerWrites += (DWORDs * w->execMask().count());
|
||||
|
||||
mask = w->execMask().to_ullong();
|
||||
srams = w->execMask().size() / 4;
|
||||
for (int i = 0; i < srams; i++) {
|
||||
if (mask & 0xF) {
|
||||
sramWrites += DWORDs;
|
||||
stats.sramWrites += DWORDs;
|
||||
}
|
||||
mask = mask >> 4;
|
||||
}
|
||||
@@ -196,13 +196,13 @@ VectorRegisterFile::scheduleWriteOperandsFromLoad(
|
||||
}
|
||||
// increment count of number of DWORDs written to VRF
|
||||
int DWORDs = ii->numDstVecDWORDs();
|
||||
registerWrites += (DWORDs * ii->exec_mask.count());
|
||||
stats.registerWrites += (DWORDs * ii->exec_mask.count());
|
||||
|
||||
uint64_t mask = ii->exec_mask.to_ullong();
|
||||
int srams = ii->exec_mask.size() / 4;
|
||||
for (int i = 0; i < srams; i++) {
|
||||
if (mask & 0xF) {
|
||||
sramWrites += DWORDs;
|
||||
stats.sramWrites += DWORDs;
|
||||
}
|
||||
mask = mask >> 4;
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ Wavefront::Wavefront(const Params &p)
|
||||
maxIbSize(p.max_ib_size), _gpuISA(*this),
|
||||
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
|
||||
vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
|
||||
barId(WFBarrier::InvalidID)
|
||||
barId(WFBarrier::InvalidID), stats(this)
|
||||
{
|
||||
lastTrace = 0;
|
||||
execUnitId = -1;
|
||||
@@ -97,75 +97,6 @@ Wavefront::Wavefront(const Params &p)
|
||||
vecReads.clear();
|
||||
}
|
||||
|
||||
void
|
||||
Wavefront::regStats()
|
||||
{
|
||||
SimObject::regStats();
|
||||
|
||||
// FIXME: the name of the WF needs to be unique
|
||||
numTimesBlockedDueWAXDependencies
|
||||
.name(name() + ".timesBlockedDueWAXDependencies")
|
||||
.desc("number of times the wf's instructions are blocked due to WAW "
|
||||
"or WAR dependencies")
|
||||
;
|
||||
|
||||
// FIXME: the name of the WF needs to be unique
|
||||
numTimesBlockedDueRAWDependencies
|
||||
.name(name() + ".timesBlockedDueRAWDependencies")
|
||||
.desc("number of times the wf's instructions are blocked due to RAW "
|
||||
"dependencies")
|
||||
;
|
||||
|
||||
numInstrExecuted
|
||||
.name(name() + ".num_instr_executed")
|
||||
.desc("number of instructions executed by this WF slot")
|
||||
;
|
||||
|
||||
schCycles
|
||||
.name(name() + ".sch_cycles")
|
||||
.desc("number of cycles spent in schedule stage")
|
||||
;
|
||||
|
||||
schStalls
|
||||
.name(name() + ".sch_stalls")
|
||||
.desc("number of cycles WF is stalled in SCH stage")
|
||||
;
|
||||
|
||||
schRfAccessStalls
|
||||
.name(name() + ".sch_rf_access_stalls")
|
||||
.desc("number of cycles wave selected in SCH but RF denied adding "
|
||||
"instruction")
|
||||
;
|
||||
|
||||
schResourceStalls
|
||||
.name(name() + ".sch_resource_stalls")
|
||||
.desc("number of cycles stalled in sch by resource not available")
|
||||
;
|
||||
|
||||
schOpdNrdyStalls
|
||||
.name(name() + ".sch_opd_nrdy_stalls")
|
||||
.desc("number of cycles stalled in sch waiting for RF reads to "
|
||||
"complete")
|
||||
;
|
||||
|
||||
schLdsArbStalls
|
||||
.name(name() + ".sch_lds_arb_stalls")
|
||||
.desc("number of cycles wave stalled due to LDS-VRF arbitration")
|
||||
;
|
||||
|
||||
vecRawDistance
|
||||
.init(0,20,1)
|
||||
.name(name() + ".vec_raw_distance")
|
||||
.desc("Count of RAW distance in dynamic instructions for this WF")
|
||||
;
|
||||
|
||||
readsPerWrite
|
||||
.init(0,4,1)
|
||||
.name(name() + ".vec_reads_per_write")
|
||||
.desc("Count of Vector reads per write for this WF")
|
||||
;
|
||||
}
|
||||
|
||||
void
|
||||
Wavefront::init()
|
||||
{
|
||||
@@ -959,17 +890,19 @@ Wavefront::exec()
|
||||
}
|
||||
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
|
||||
|
||||
computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
|
||||
computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
|
||||
computeUnit->numInstrExecuted++;
|
||||
numInstrExecuted++;
|
||||
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecOperands());
|
||||
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecOperands());
|
||||
computeUnit->stats.numInstrExecuted++;
|
||||
stats.numInstrExecuted++;
|
||||
computeUnit->instExecPerSimd[simdId]++;
|
||||
computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
|
||||
computeUnit->lastExecCycle[simdId]);
|
||||
computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
|
||||
computeUnit->stats.execRateDist.sample(
|
||||
computeUnit->stats.totalCycles.value() -
|
||||
computeUnit->lastExecCycle[simdId]);
|
||||
computeUnit->lastExecCycle[simdId] =
|
||||
computeUnit->stats.totalCycles.value();
|
||||
|
||||
if (lastInstExec) {
|
||||
computeUnit->instInterleave[simdId].
|
||||
computeUnit->stats.instInterleave[simdId].
|
||||
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
|
||||
}
|
||||
lastInstExec = computeUnit->instExecPerSimd[simdId];
|
||||
@@ -987,8 +920,8 @@ Wavefront::exec()
|
||||
if (ii->isSrcOperand(i)) {
|
||||
// This check should never fail, but to be safe we check
|
||||
if (rawDist.find(vgpr+n) != rawDist.end()) {
|
||||
vecRawDistance.
|
||||
sample(numInstrExecuted.value() - rawDist[vgpr+n]);
|
||||
stats.vecRawDistance.sample(
|
||||
stats.numInstrExecuted.value() - rawDist[vgpr+n]);
|
||||
}
|
||||
// increment number of reads to this register
|
||||
vecReads[vgpr+n]++;
|
||||
@@ -997,12 +930,12 @@ Wavefront::exec()
|
||||
// for the first write to each physical register
|
||||
if (rawDist.find(vgpr+n) != rawDist.end()) {
|
||||
// sample the number of reads that were performed
|
||||
readsPerWrite.sample(vecReads[vgpr+n]);
|
||||
stats.readsPerWrite.sample(vecReads[vgpr+n]);
|
||||
}
|
||||
// on a write, reset count of reads to 0
|
||||
vecReads[vgpr+n] = 0;
|
||||
|
||||
rawDist[vgpr+n] = numInstrExecuted.value();
|
||||
rawDist[vgpr+n] = stats.numInstrExecuted.value();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1023,26 +956,29 @@ Wavefront::exec()
|
||||
|
||||
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
|
||||
const int num_active_lanes = execMask().count();
|
||||
computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
|
||||
computeUnit->numVecOpsExecuted += num_active_lanes;
|
||||
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
|
||||
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
|
||||
|
||||
if (ii->isF16() && ii->isALU()) {
|
||||
if (ii->isF32() || ii->isF64()) {
|
||||
fatal("Instruction is tagged as both (1) F16, and (2)"
|
||||
"either F32 or F64.");
|
||||
}
|
||||
computeUnit->numVecOpsExecutedF16 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
|
||||
if (ii->isFMA()) {
|
||||
computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMAC()) {
|
||||
computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMAD()) {
|
||||
computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (ii->isF32() && ii->isALU()) {
|
||||
@@ -1050,18 +986,21 @@ Wavefront::exec()
|
||||
fatal("Instruction is tagged as both (1) F32, and (2)"
|
||||
"either F16 or F64.");
|
||||
}
|
||||
computeUnit->numVecOpsExecutedF32 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
|
||||
if (ii->isFMA()) {
|
||||
computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMAC()) {
|
||||
computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMAD()) {
|
||||
computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (ii->isF64() && ii->isALU()) {
|
||||
@@ -1069,24 +1008,29 @@ Wavefront::exec()
|
||||
fatal("Instruction is tagged as both (1) F64, and (2)"
|
||||
"either F16 or F32.");
|
||||
}
|
||||
computeUnit->numVecOpsExecutedF64 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
|
||||
if (ii->isFMA()) {
|
||||
computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMAC()) {
|
||||
computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMAD()) {
|
||||
computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
|
||||
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (isGmInstruction(ii)) {
|
||||
computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
|
||||
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
|
||||
num_active_lanes);
|
||||
} else if (isLmInstruction(ii)) {
|
||||
computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
|
||||
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
|
||||
num_active_lanes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1133,14 +1077,14 @@ Wavefront::exec()
|
||||
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
|
||||
computeUnit->vectorGlobalMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesVMemPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
||||
computeUnit->vrf_gm_bus_latency;
|
||||
} else {
|
||||
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
||||
cyclesToTicks(computeUnit->srf_scm_bus_latency));
|
||||
computeUnit->scalarMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesScMemPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
||||
computeUnit->srf_scm_bus_latency;
|
||||
}
|
||||
// GM or Flat as GM Store
|
||||
@@ -1150,14 +1094,14 @@ Wavefront::exec()
|
||||
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
||||
computeUnit->vectorGlobalMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesVMemPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
||||
(2 * computeUnit->vrf_gm_bus_latency);
|
||||
} else {
|
||||
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
||||
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
||||
computeUnit->scalarMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesScMemPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
||||
(2 * computeUnit->srf_scm_bus_latency);
|
||||
}
|
||||
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
||||
@@ -1167,14 +1111,14 @@ Wavefront::exec()
|
||||
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
||||
computeUnit->vectorGlobalMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesVMemPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
||||
(2 * computeUnit->vrf_gm_bus_latency);
|
||||
} else {
|
||||
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
||||
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
||||
computeUnit->scalarMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesScMemPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
||||
(2 * computeUnit->srf_scm_bus_latency);
|
||||
}
|
||||
// LM or Flat as LM Load
|
||||
@@ -1183,7 +1127,7 @@ Wavefront::exec()
|
||||
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
|
||||
computeUnit->vectorSharedMemUnit.
|
||||
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesLdsPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
||||
computeUnit->vrf_lm_bus_latency;
|
||||
// LM or Flat as LM Store
|
||||
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
|
||||
@@ -1191,7 +1135,7 @@ Wavefront::exec()
|
||||
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
||||
computeUnit->vectorSharedMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesLdsPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
||||
(2 * computeUnit->vrf_lm_bus_latency);
|
||||
// LM or Flat as LM, Atomic or MemFence
|
||||
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
||||
@@ -1200,7 +1144,7 @@ Wavefront::exec()
|
||||
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
||||
computeUnit->vectorSharedMemUnit.
|
||||
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
||||
computeUnit->instCyclesLdsPerSimd[simdId] +=
|
||||
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
||||
(2 * computeUnit->vrf_lm_bus_latency);
|
||||
} else {
|
||||
panic("Bad instruction type!\n");
|
||||
@@ -1453,3 +1397,31 @@ Wavefront::releaseBarrier()
|
||||
{
|
||||
barId = WFBarrier::InvalidID;
|
||||
}
|
||||
|
||||
/**
 * Build the per-wavefront-slot statistics group.
 *
 * Each stat gets its name and description from ADD_STAT; the two
 * distributions additionally get their bucket layout in the body.
 *
 * @param parent group the new stats are attached to.
 */
Wavefront::WavefrontStats::WavefrontStats(Stats::Group *parent)
    : Stats::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls,
               "number of cycles wave selected in SCH but RF denied "
               "adding instruction"),
      ADD_STAT(schResourceStalls,
               "number of cycles stalled in sch by resource not available"),
      ADD_STAT(schOpdNrdyStalls,
               "number of cycles stalled in sch waiting for RF reads "
               "to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies,
               "number of times the wf's instructions are blocked due to "
               "WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies,
               "number of times the wf's instructions are blocked due to "
               "RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // Bucket layout is passed as init(0, 20, 1) and init(0, 4, 1).
    vecRawDistance.init(0, 20, 1);
    readsPerWrite.init(0, 4, 1);
}
|
||||
|
||||
@@ -43,6 +43,8 @@
|
||||
|
||||
#include "arch/gpu_isa.hh"
|
||||
#include "base/logging.hh"
|
||||
#include "base/statistics.hh"
|
||||
#include "base/stats/group.hh"
|
||||
#include "base/types.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
@@ -217,52 +219,13 @@ class Wavefront : public SimObject
|
||||
// unique WF id over all WFs executed across all CUs
|
||||
uint64_t wfDynId;
|
||||
|
||||
// Wavefront slot stats
|
||||
|
||||
// Number of instructions executed by this wavefront slot across all
|
||||
// dynamic wavefronts
|
||||
Stats::Scalar numInstrExecuted;
|
||||
|
||||
// Number of cycles this WF spends in SCH stage
|
||||
Stats::Scalar schCycles;
|
||||
|
||||
// Number of stall cycles encountered by this WF in SCH stage
|
||||
Stats::Scalar schStalls;
|
||||
|
||||
// The following stats sum to the value of schStalls, and record, per
|
||||
// WF slot, what the cause of each stall was at a coarse granularity.
|
||||
|
||||
// Cycles WF is selected by scheduler, but RFs cannot support instruction
|
||||
Stats::Scalar schRfAccessStalls;
|
||||
// Cycles spent waiting for execution resources
|
||||
Stats::Scalar schResourceStalls;
|
||||
// cycles spent waiting for RF reads to complete in SCH stage
|
||||
Stats::Scalar schOpdNrdyStalls;
|
||||
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
|
||||
// but another wave is executing FLAT, which requires LM and GM and forces
|
||||
// this WF to stall.
|
||||
Stats::Scalar schLdsArbStalls;
|
||||
|
||||
// number of times an instruction of a WF is blocked from being issued
|
||||
// due to WAR and WAW dependencies
|
||||
Stats::Scalar numTimesBlockedDueWAXDependencies;
|
||||
// number of times an instruction of a WF is blocked from being issued
|
||||
// due to RAW dependencies
|
||||
Stats::Scalar numTimesBlockedDueRAWDependencies;
|
||||
|
||||
// dyn inst id (per SIMD) of last instruction exec from this wave
|
||||
uint64_t lastInstExec;
|
||||
|
||||
// Distribution to track the distance between producer and consumer
|
||||
// for vector register values
|
||||
Stats::Distribution vecRawDistance;
|
||||
// Map to track the dyn instruction id of each vector register value
|
||||
// produced, indexed by physical vector register ID
|
||||
std::unordered_map<int,uint64_t> rawDist;
|
||||
|
||||
// Distribution to track the number of times every vector register
|
||||
// value produced is consumed.
|
||||
Stats::Distribution readsPerWrite;
|
||||
// Counts the number of reads performed to each physical register
|
||||
// - counts are reset to 0 for each dynamic wavefront launched
|
||||
std::vector<int> vecReads;
|
||||
@@ -289,7 +252,6 @@ class Wavefront : public SimObject
|
||||
// called by SCH stage to reserve
|
||||
std::vector<int> reserveResources();
|
||||
bool stopFetch();
|
||||
void regStats();
|
||||
|
||||
Addr pc() const;
|
||||
void pc(Addr new_pc);
|
||||
@@ -357,6 +319,52 @@ class Wavefront : public SimObject
|
||||
Addr _pc;
|
||||
VectorMask _execMask;
|
||||
int barId;
|
||||
|
||||
public:
|
||||
// Wavefront slot stats, collected into a single Stats::Group. The one
// instance of this group is the 'stats' member declared at the closing
// brace below.
struct WavefrontStats : public Stats::Group
|
||||
{
|
||||
// parent: the Stats::Group this group of stats is created under.
WavefrontStats(Stats::Group *parent);
|
||||
|
||||
// Number of instructions executed by this wavefront slot across all
|
||||
// dynamic wavefronts
|
||||
Stats::Scalar numInstrExecuted;
|
||||
|
||||
// Number of cycles this WF spends in SCH stage
|
||||
Stats::Scalar schCycles;
|
||||
|
||||
// Number of stall cycles encountered by this WF in SCH stage
|
||||
Stats::Scalar schStalls;
|
||||
|
||||
// The following stats sum to the value of schStalls, and record, per
|
||||
// WF slot, what the cause of each stall was at a coarse granularity.
|
||||
|
||||
// Cycles WF is selected by scheduler, but RFs cannot support
|
||||
// instruction
|
||||
Stats::Scalar schRfAccessStalls;
|
||||
// Cycles spent waiting for execution resources
|
||||
Stats::Scalar schResourceStalls;
|
||||
// Cycles spent waiting for RF reads to complete in SCH stage
|
||||
Stats::Scalar schOpdNrdyStalls;
|
||||
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
|
||||
// but another wave is executing FLAT, which requires LM and GM and
|
||||
// forces this WF to stall.
|
||||
Stats::Scalar schLdsArbStalls;
|
||||
|
||||
// Number of times an instruction of a WF is blocked from being issued
|
||||
// due to WAR or WAW dependencies
|
||||
Stats::Scalar numTimesBlockedDueWAXDependencies;
|
||||
// Number of times an instruction of a WF is blocked from being issued
|
||||
// due to RAW dependencies
|
||||
Stats::Scalar numTimesBlockedDueRAWDependencies;
|
||||
|
||||
// Distribution to track the distance between producer and consumer
|
||||
// for vector register values
|
||||
Stats::Distribution vecRawDistance;
|
||||
|
||||
// Distribution to track the number of times every vector register
|
||||
// value produced is consumed.
|
||||
Stats::Distribution readsPerWrite;
|
||||
} stats;
|
||||
};
|
||||
|
||||
#endif // __GPU_COMPUTE_WAVEFRONT_HH__
|
||||
|
||||
Reference in New Issue
Block a user