The only things left in isa_traits.hh are two constants, one for the number of bytes in a page, and one for how far to shift an address to get the page number. To make it clear that this is the only thing isa_traits.hh should be used for from this point forward (until it is entirely eliminated), this change renames it to the much less generic page_size.hh. Also, because isa_traits.hh used to have *much* more stuff in it, it was included in a lot of places where it wasn't needed. This change also clears out all these legacy includes while updating the actually needed ones to the new name. Change-Id: I939b01b117c53d620b6b0a98982f6f21dc2ada72 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/40179 Reviewed-by: Gabe Black <gabe.black@gmail.com> Maintainer: Gabe Black <gabe.black@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
594 lines
19 KiB
C++
594 lines
19 KiB
C++
/*
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpu-compute/shader.hh"
|
|
|
|
#include <limits>
|
|
|
|
#include "arch/x86/linux/linux.hh"
|
|
#include "arch/x86/page_size.hh"
|
|
#include "base/chunk_generator.hh"
|
|
#include "debug/GPUAgentDisp.hh"
|
|
#include "debug/GPUDisp.hh"
|
|
#include "debug/GPUMem.hh"
|
|
#include "debug/GPUShader.hh"
|
|
#include "debug/GPUWgLatency.hh"
|
|
#include "gpu-compute/dispatcher.hh"
|
|
#include "gpu-compute/gpu_command_processor.hh"
|
|
#include "gpu-compute/gpu_static_inst.hh"
|
|
#include "gpu-compute/hsa_queue_entry.hh"
|
|
#include "gpu-compute/wavefront.hh"
|
|
#include "mem/packet.hh"
|
|
#include "mem/ruby/system/RubySystem.hh"
|
|
#include "sim/sim_exit.hh"
|
|
|
|
/**
 * Build the shader from its parameters.
 *
 * Registers this shader with its command processor and dispatcher, sets
 * up the GPUVM, LDS and scratch aperture ranges, and records a pointer
 * to every compute unit listed in the parameters.
 */
Shader::Shader(const Params &p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
          false, Event::CPU_Tick_Pri),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    // Let the command processor and the dispatcher reach back to us.
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);

    // All three apertures are placed relative to bit 61 of the address
    // space; each limit is the masked base with the low bits filled in.
    const Addr ape_origin = (Addr)1 << 61;

    _gpuVmApe.base = ape_origin + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ape_origin + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ape_origin + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;

    cuList.resize(n_cu);

    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    // Adopt each compute unit and point it back at this shader. The CU
    // ids are expected to match their position in the parameter list.
    for (int cu_idx = 0; cu_idx < n_cu; ++cu_idx) {
        auto &cu = cuList[cu_idx];
        cu = p.CUs[cu_idx];
        assert(cu_idx == cu->cu_id);
        cu->shader = this;
        cu->idleCUTimeout = p.idlecu_timeout;
    }
}
|
|
|
|
/**
 * Accessor for the dispatcher this shader was constructed with.
 *
 * @return Reference to the shader's GPUDispatcher.
 */
GPUDispatcher&
Shader::dispatcher()
{
    GPUDispatcher &disp = _dispatcher;
    return disp;
}
|
|
|
|
/**
 * Carve a new region out of the mmap area of the process behind the GPU
 * thread context and allocate memory for it.
 *
 * The mmap end pointer of the process is moved down or up depending on
 * Process::mmapGrowsDown().
 *
 * @param length Requested size in bytes; rounded up to a multiple of
 *               X86ISA::PageBytes.
 * @return Start address of the newly allocated region.
 */
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, X86ISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        // Trace messages need an explicit newline, otherwise this record
        // runs into the next one in the trace output.
        DPRINTF(GPUShader, "GROWS DOWN\n");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP\n");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}
|
|
|
|
void
|
|
Shader::init()
|
|
{
|
|
// grab the threadContext of the thread running on the CPU
|
|
assert(cpuPointer);
|
|
gpuTc = cpuPointer->getContext(0);
|
|
assert(gpuTc);
|
|
}
|
|
|
|
/**
 * Destroy the shader, deleting the first n_cu compute units held in
 * cuList.
 */
Shader::~Shader()
{
    for (int cu_idx = 0; cu_idx < n_cu; ++cu_idx) {
        delete cuList[cu_idx];
    }
}
|
|
|
|
void
|
|
Shader::updateContext(int cid) {
|
|
// context of the thread which dispatched work
|
|
assert(cpuPointer);
|
|
gpuTc = cpuPointer->getContext(cid);
|
|
assert(gpuTc);
|
|
}
|
|
|
|
void
|
|
Shader::execScheduledAdds()
|
|
{
|
|
assert(!sa_when.empty());
|
|
|
|
// apply any scheduled adds
|
|
for (int i = 0; i < sa_n; ++i) {
|
|
if (sa_when[i] <= curTick()) {
|
|
*sa_val[i] += sa_x[i];
|
|
panic_if(*sa_val[i] < 0, "Negative counter value\n");
|
|
sa_val.erase(sa_val.begin() + i);
|
|
sa_x.erase(sa_x.begin() + i);
|
|
sa_when.erase(sa_when.begin() + i);
|
|
--sa_n;
|
|
--i;
|
|
}
|
|
}
|
|
if (!sa_when.empty()) {
|
|
Tick shader_wakeup = *std::max_element(sa_when.begin(),
|
|
sa_when.end());
|
|
DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
|
|
schedule(tickEvent, shader_wakeup);
|
|
} else {
|
|
DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
|
|
}
|
|
}
|
|
|
|
/*
|
|
* dispatcher/shader arranges invalidate requests to the CUs
|
|
*/
|
|
void
|
|
Shader::prepareInvalidate(HSAQueueEntry *task) {
|
|
// if invalidate has already started/finished, then do nothing
|
|
if (task->isInvStarted()) return;
|
|
|
|
// invalidate has never started; it can only perform once at kernel launch
|
|
assert(task->outstandingInvs() == -1);
|
|
int kernId = task->dispatchId();
|
|
// counter value is 0 now, indicating the inv is about to start
|
|
_dispatcher.updateInvCounter(kernId, +1);
|
|
|
|
// iterate all cus managed by the shader, to perform invalidate.
|
|
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
|
|
// create a request to hold INV info; the request's fields will
|
|
// be updated in cu before use
|
|
auto req = std::make_shared<Request>(0, 0, 0,
|
|
cuList[i_cu]->requestorId(),
|
|
0, -1);
|
|
|
|
_dispatcher.updateInvCounter(kernId, +1);
|
|
// all necessary INV flags are all set now, call cu to execute
|
|
cuList[i_cu]->doInvalidate(req, task->dispatchId());
|
|
|
|
// I don't like this. This is intrusive coding.
|
|
cuList[i_cu]->resetRegisterPool();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* dispatcher/shader arranges flush requests to the CUs
|
|
*/
|
|
void
|
|
Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
|
|
int kernId = gpuDynInst->kern_id;
|
|
// flush has never been started, performed only once at kernel end
|
|
assert(_dispatcher.getOutstandingWbs(kernId) == 0);
|
|
|
|
// the first cu, managed by the shader, performs flush operation,
|
|
// assuming that L2 cache is shared by all cus in the shader
|
|
int i_cu = 0;
|
|
_dispatcher.updateWbCounter(kernId, +1);
|
|
cuList[i_cu]->doFlush(gpuDynInst);
|
|
}
|
|
|
|
bool
|
|
Shader::dispatchWorkgroups(HSAQueueEntry *task)
|
|
{
|
|
bool scheduledSomething = false;
|
|
int cuCount = 0;
|
|
int curCu = nextSchedCu;
|
|
int disp_count(0);
|
|
|
|
while (cuCount < n_cu) {
|
|
//Every time we try a CU, update nextSchedCu
|
|
nextSchedCu = (nextSchedCu + 1) % n_cu;
|
|
|
|
// dispatch workgroup iff the following two conditions are met:
|
|
// (a) wg_rem is true - there are unassigned workgroups in the grid
|
|
// (b) there are enough free slots in cu cuList[i] for this wg
|
|
int num_wfs_in_wg = 0;
|
|
bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
|
|
if (!task->dispComplete() && can_disp) {
|
|
scheduledSomething = true;
|
|
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
|
|
curCu, task->globalWgId());
|
|
DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
|
|
curCu, task->globalWgId());
|
|
DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
|
|
curTick(), task->globalWgId(), curCu);
|
|
|
|
if (!cuList[curCu]->tickEvent.scheduled()) {
|
|
if (!_activeCus)
|
|
_lastInactiveTick = curTick();
|
|
_activeCus++;
|
|
}
|
|
|
|
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
|
|
"Invalid activeCu size\n");
|
|
cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
|
|
|
|
task->markWgDispatch();
|
|
++disp_count;
|
|
}
|
|
|
|
++cuCount;
|
|
curCu = nextSchedCu;
|
|
}
|
|
|
|
DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
|
|
|
|
return scheduledSomething;
|
|
}
|
|
|
|
void
|
|
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
|
|
bool suppress_func_errors, int cu_id)
|
|
{
|
|
int block_size = cuList.at(cu_id)->cacheLineSize();
|
|
unsigned size = req->getSize();
|
|
|
|
Addr tmp_addr;
|
|
BaseTLB::Mode trans_mode;
|
|
|
|
if (cmd == MemCmd::ReadReq) {
|
|
trans_mode = BaseTLB::Read;
|
|
} else if (cmd == MemCmd::WriteReq) {
|
|
trans_mode = BaseTLB::Write;
|
|
} else {
|
|
fatal("unexcepted MemCmd\n");
|
|
}
|
|
|
|
tmp_addr = req->getVaddr();
|
|
Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
|
|
|
|
assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
|
|
|
|
// Misaligned access
|
|
if (split_addr > tmp_addr) {
|
|
RequestPtr req1, req2;
|
|
req->splitOnVaddr(split_addr, req1, req2);
|
|
|
|
PacketPtr pkt1 = new Packet(req2, cmd);
|
|
PacketPtr pkt2 = new Packet(req1, cmd);
|
|
|
|
functionalTLBAccess(pkt1, cu_id, trans_mode);
|
|
functionalTLBAccess(pkt2, cu_id, trans_mode);
|
|
|
|
PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
|
|
PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
|
|
|
|
new_pkt1->dataStatic(data);
|
|
new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
|
|
|
|
if (suppress_func_errors) {
|
|
new_pkt1->setSuppressFuncError();
|
|
new_pkt2->setSuppressFuncError();
|
|
}
|
|
|
|
// fixme: this should be cuList[cu_id] if cu_id != n_cu
|
|
// The latter requires a memPort in the dispatcher
|
|
cuList[0]->memPort[0].sendFunctional(new_pkt1);
|
|
cuList[0]->memPort[0].sendFunctional(new_pkt2);
|
|
|
|
delete new_pkt1;
|
|
delete new_pkt2;
|
|
delete pkt1;
|
|
delete pkt2;
|
|
} else {
|
|
PacketPtr pkt = new Packet(req, cmd);
|
|
functionalTLBAccess(pkt, cu_id, trans_mode);
|
|
PacketPtr new_pkt = new Packet(pkt->req, cmd);
|
|
new_pkt->dataStatic(data);
|
|
|
|
if (suppress_func_errors) {
|
|
new_pkt->setSuppressFuncError();
|
|
};
|
|
|
|
// fixme: this should be cuList[cu_id] if cu_id != n_cu
|
|
// The latter requires a memPort in the dispatcher
|
|
cuList[0]->memPort[0].sendFunctional(new_pkt);
|
|
|
|
delete new_pkt;
|
|
delete pkt;
|
|
}
|
|
}
|
|
|
|
void
|
|
Shader::ScheduleAdd(int *val,Tick when,int x)
|
|
{
|
|
sa_val.push_back(val);
|
|
when += curTick();
|
|
sa_when.push_back(when);
|
|
sa_x.push_back(x);
|
|
++sa_n;
|
|
if (!tickEvent.scheduled() || (when < tickEvent.when())) {
|
|
DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
|
|
"%lu\n", when);
|
|
reschedule(tickEvent, when, true);
|
|
} else {
|
|
assert(tickEvent.scheduled());
|
|
DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
|
|
"%lu\n", when);
|
|
}
|
|
}
|
|
|
|
void
|
|
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
|
MemCmd cmd, bool suppress_func_errors)
|
|
{
|
|
uint8_t *data_buf = (uint8_t*)ptr;
|
|
|
|
for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
|
|
!gen.done(); gen.next()) {
|
|
|
|
RequestPtr req = std::make_shared<Request>(
|
|
gen.addr(), gen.size(), 0,
|
|
cuList[0]->requestorId(), 0, 0, nullptr);
|
|
|
|
doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
|
|
data_buf += gen.size();
|
|
}
|
|
}
|
|
|
|
void
|
|
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
|
|
{
|
|
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
|
|
}
|
|
|
|
void
|
|
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
|
bool suppress_func_errors)
|
|
{
|
|
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
|
|
suppress_func_errors);
|
|
}
|
|
|
|
void
|
|
Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
|
|
{
|
|
AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
|
|
}
|
|
|
|
void
|
|
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
|
bool suppress_func_errors)
|
|
{
|
|
AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
|
|
suppress_func_errors);
|
|
}
|
|
|
|
/*
|
|
* Send a packet through the appropriate TLB functional port.
|
|
* If cu_id=n_cu, then this is the dispatcher's TLB.
|
|
* Otherwise it's the TLB of the cu_id compute unit.
|
|
*/
|
|
void
|
|
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
|
|
{
|
|
// update senderState. Need to know the gpuTc and the TLB mode
|
|
pkt->senderState =
|
|
new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
|
|
|
|
// even when the perLaneTLB flag is turned on
|
|
// it's ok tp send all accesses through lane 0
|
|
// since the lane # is not known here,
|
|
// This isn't important since these are functional accesses.
|
|
cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
|
|
|
|
/* safe_cast the senderState */
|
|
TheISA::GpuTLB::TranslationState *sender_state =
|
|
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
|
|
|
delete sender_state->tlbEntry;
|
|
delete pkt->senderState;
|
|
}
|
|
|
|
/*
|
|
* allow the shader to sample stats from constituent devices
|
|
*/
|
|
void
|
|
Shader::sampleStore(const Tick accessTime)
|
|
{
|
|
stats.storeLatencyDist.sample(accessTime);
|
|
stats.allLatencyDist.sample(accessTime);
|
|
}
|
|
|
|
/*
|
|
* allow the shader to sample stats from constituent devices
|
|
*/
|
|
void
|
|
Shader::sampleLoad(const Tick accessTime)
|
|
{
|
|
stats.loadLatencyDist.sample(accessTime);
|
|
stats.allLatencyDist.sample(accessTime);
|
|
}
|
|
|
|
void
|
|
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
|
|
{
|
|
// Only sample instructions that go all the way to main memory
|
|
if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
|
|
return;
|
|
}
|
|
|
|
Tick t1 = roundTripTime[0];
|
|
Tick t2 = roundTripTime[1];
|
|
Tick t3 = roundTripTime[2];
|
|
Tick t4 = roundTripTime[3];
|
|
Tick t5 = roundTripTime[4];
|
|
|
|
stats.initToCoalesceLatency.sample(t2-t1);
|
|
stats.rubyNetworkLatency.sample(t3-t2);
|
|
stats.gmEnqueueLatency.sample(t4-t3);
|
|
stats.gmToCompleteLatency.sample(t5-t4);
|
|
}
|
|
|
|
void
|
|
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
|
|
{
|
|
stats.coalsrLineAddresses.sample(lineMap.size());
|
|
std::vector<Tick> netTimes;
|
|
|
|
// For each cache block address generated by a vmem inst, calculate
|
|
// the round-trip time for that cache block.
|
|
for (auto& it : lineMap) {
|
|
const std::vector<Tick>& timeVec = it.second;
|
|
if (timeVec.size() == 2) {
|
|
netTimes.push_back(timeVec[1] - timeVec[0]);
|
|
}
|
|
}
|
|
|
|
// Sort the cache block round trip times so that the first
|
|
// distrubtion is always measuring the fastests and the last
|
|
// distrubtion is always measuring the slowest cache block.
|
|
std::sort(netTimes.begin(), netTimes.end());
|
|
|
|
// Sample the round trip time for each N cache blocks into the
|
|
// Nth distribution.
|
|
int idx = 0;
|
|
for (auto& time : netTimes) {
|
|
stats.cacheBlockRoundTrip[idx].sample(time);
|
|
++idx;
|
|
}
|
|
}
|
|
|
|
void
|
|
Shader::notifyCuSleep() {
|
|
// If all CUs attached to his shader are asleep, update shaderActiveTicks
|
|
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
|
|
"Invalid activeCu size\n");
|
|
_activeCus--;
|
|
if (!_activeCus)
|
|
stats.shaderActiveTicks += curTick() - _lastInactiveTick;
|
|
}
|
|
|
|
/**
 * Register and configure the shader's statistics.
 *
 * @param parent  Stats group that owns these stats; it is cast to a
 *                Shader to obtain the name prefix for the per-block
 *                round-trip distributions.
 * @param wf_size Number of cacheBlockRoundTrip distributions to create.
 */
Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
    : Stats::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
      ADD_STAT(initToCoalesceLatency,
               "Ticks from vmem inst initiateAcc to coalescer issue"),
      ADD_STAT(rubyNetworkLatency,
               "Ticks from coalescer issue to coalescer hit callback"),
      ADD_STAT(gmEnqueueLatency,
               "Ticks from coalescer hit callback to GM pipe enqueue"),
      ADD_STAT(gmToCompleteLatency,
               "Ticks queued in GM pipes ordered response buffer"),
      ADD_STAT(coalsrLineAddresses,
               "Number of cache lines for coalesced request"),
      ADD_STAT(shaderActiveTicks,
               "Total ticks that any CU attached to this shader is active"),
      ADD_STAT(vectorInstSrcOperand,
               "vector instruction source operand distribution"),
      ADD_STAT(vectorInstDstOperand,
               "vector instruction destination operand distribution")
{
    // Every latency distribution shares the same range, bucket size and
    // output flags.
    auto setupLatencyDist = [](auto &dist) {
        dist
            .init(0, 1600000, 10000)
            .flags(Stats::pdf | Stats::oneline);
    };

    setupLatencyDist(allLatencyDist);
    setupLatencyDist(loadLatencyDist);
    setupLatencyDist(storeLatencyDist);
    setupLatencyDist(initToCoalesceLatency);
    setupLatencyDist(rubyNetworkLatency);
    setupLatencyDist(gmEnqueueLatency);
    setupLatencyDist(gmToCompleteLatency);

    coalsrLineAddresses
        .init(0, 20, 1)
        .flags(Stats::pdf | Stats::oneline);

    vectorInstSrcOperand.init(4);
    vectorInstDstOperand.init(4);

    // One round-trip distribution per slot, named after its index.
    cacheBlockRoundTrip = new Stats::Distribution[wf_size];
    for (int slot = 0; slot < wf_size; ++slot) {
        std::stringstream stat_name;
        ccprintf(stat_name, "%s.cacheBlockRoundTrip%d",
                 static_cast<Shader*>(parent)->name(), slot);

        auto &dist = cacheBlockRoundTrip[slot];
        dist
            .init(0, 1600000, 10000)
            .name(stat_name.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(Stats::pdf | Stats::oneline);
    }
}
|