gpu-compute: Use refs to CU in pipe stages/mem pipes
The pipe stages and memory pipes are changed to store a reference to their parent CU as opposed to a pointer. These objects will never change which CU they belong to, and they are constructed by their parent CU.

Change-Id: Ie5476e1e2e124a024c2efebceb28cb3a9baa78c1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29969
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Anthony Gutierrez
parent
f509fa735c
commit
5f0378b8d0
@@ -67,13 +67,13 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
|
||||
vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
|
||||
coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
|
||||
registerManager(p->register_manager),
|
||||
fetchStage(p, this),
|
||||
scoreboardCheckStage(p, this),
|
||||
scheduleStage(p, this),
|
||||
execStage(p, this),
|
||||
globalMemoryPipe(p, this),
|
||||
localMemoryPipe(p, this),
|
||||
scalarMemoryPipe(p, this),
|
||||
fetchStage(p, *this),
|
||||
scoreboardCheckStage(p, *this),
|
||||
scheduleStage(p, *this),
|
||||
execStage(p, *this),
|
||||
globalMemoryPipe(p, *this),
|
||||
localMemoryPipe(p, *this),
|
||||
scalarMemoryPipe(p, *this),
|
||||
tickEvent([this]{ exec(); }, "Compute unit tick event",
|
||||
false, Event::CPU_Tick_Pri),
|
||||
cu_id(p->cu_id),
|
||||
|
||||
@@ -41,10 +41,10 @@
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
|
||||
ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
|
||||
: computeUnit(cu), lastTimeInstExecuted(false),
|
||||
thisTimeInstExecuted(false), instrExecuted (false),
|
||||
executionResourcesUsed(0), _name(cu->name() + ".ExecStage")
|
||||
executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
|
||||
|
||||
{
|
||||
numTransActiveIdle = 0;
|
||||
@@ -54,7 +54,7 @@ ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
|
||||
void
|
||||
ExecStage::init()
|
||||
{
|
||||
dispatchList = &computeUnit->dispatchList;
|
||||
dispatchList = &computeUnit.dispatchList;
|
||||
idle_dur = 0;
|
||||
}
|
||||
|
||||
@@ -127,7 +127,7 @@ ExecStage::dumpDispList()
|
||||
{
|
||||
std::stringstream ss;
|
||||
bool empty = true;
|
||||
for (int i = 0; i < computeUnit->numExeUnits(); i++) {
|
||||
for (int i = 0; i < computeUnit.numExeUnits(); i++) {
|
||||
DISPATCH_STATUS s = dispatchList->at(i).second;
|
||||
ss << i << ": " << dispStatusToStr(s);
|
||||
if (s != EMPTY) {
|
||||
@@ -151,7 +151,7 @@ ExecStage::exec()
|
||||
if (Debug::GPUSched) {
|
||||
dumpDispList();
|
||||
}
|
||||
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
|
||||
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
|
||||
DISPATCH_STATUS s = dispatchList->at(unitId).second;
|
||||
switch (s) {
|
||||
case EMPTY:
|
||||
@@ -168,7 +168,7 @@ ExecStage::exec()
|
||||
(w->instructionBuffer.front())->disassemble());
|
||||
DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
|
||||
dispatchList->at(unitId).first->exec();
|
||||
(computeUnit->scheduleStage).deleteFromSch(w);
|
||||
(computeUnit.scheduleStage).deleteFromSch(w);
|
||||
dispatchList->at(unitId).second = EMPTY;
|
||||
dispatchList->at(unitId).first->freeResources();
|
||||
dispatchList->at(unitId).first = nullptr;
|
||||
@@ -208,7 +208,7 @@ ExecStage::regStats()
|
||||
;
|
||||
|
||||
spc
|
||||
.init(0, computeUnit->numExeUnits(), 1)
|
||||
.init(0, computeUnit.numExeUnits(), 1)
|
||||
.name(name() + ".spc")
|
||||
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
|
||||
;
|
||||
@@ -220,26 +220,26 @@ ExecStage::regStats()
|
||||
;
|
||||
|
||||
numCyclesWithInstrTypeIssued
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".num_cycles_issue_exec_rsrc")
|
||||
.desc("Number of cycles at least one instruction issued to "
|
||||
"execution resource type")
|
||||
;
|
||||
|
||||
numCyclesWithNoInstrTypeIssued
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".num_cycles_no_issue_exec_rsrc")
|
||||
.desc("Number of clks no instructions issued to execution "
|
||||
"resource type")
|
||||
;
|
||||
|
||||
int c = 0;
|
||||
for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) {
|
||||
for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
|
||||
std::string s = "VectorALU" + std::to_string(i);
|
||||
numCyclesWithNoInstrTypeIssued.subname(c, s);
|
||||
numCyclesWithInstrTypeIssued.subname(c, s);
|
||||
}
|
||||
for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) {
|
||||
for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
|
||||
std::string s = "ScalarALU" + std::to_string(i);
|
||||
numCyclesWithNoInstrTypeIssued.subname(c, s);
|
||||
numCyclesWithInstrTypeIssued.subname(c, s);
|
||||
|
||||
@@ -69,7 +69,7 @@ enum DISPATCH_STATUS
|
||||
class ExecStage
|
||||
{
|
||||
public:
|
||||
ExecStage(const ComputeUnitParams* p, ComputeUnit *cu);
|
||||
ExecStage(const ComputeUnitParams* p, ComputeUnit &cu);
|
||||
~ExecStage() { }
|
||||
void init();
|
||||
void exec();
|
||||
@@ -77,7 +77,7 @@ class ExecStage
|
||||
std::string dispStatusToStr(int j);
|
||||
void dumpDispList();
|
||||
|
||||
std::string name() { return _name; }
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
// number of idle cycles
|
||||
Stats::Scalar numCyclesWithNoIssue;
|
||||
@@ -96,7 +96,7 @@ class ExecStage
|
||||
private:
|
||||
void collectStatistics(enum STAT_STATUS stage, int unitId);
|
||||
void initStatistics();
|
||||
ComputeUnit *computeUnit;
|
||||
ComputeUnit &computeUnit;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource. A FILLED implies
|
||||
@@ -115,7 +115,7 @@ class ExecStage
|
||||
Stats::Distribution idleDur;
|
||||
int executionResourcesUsed;
|
||||
uint64_t idle_dur;
|
||||
std::string _name;
|
||||
const std::string _name;
|
||||
};
|
||||
|
||||
#endif // __EXEC_STAGE_HH__
|
||||
|
||||
@@ -36,9 +36,9 @@
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit *cu)
|
||||
FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit &cu)
|
||||
: numVectorALUs(p->num_SIMDs), computeUnit(cu),
|
||||
_name(cu->name() + ".FetchStage")
|
||||
_name(cu.name() + ".FetchStage")
|
||||
{
|
||||
for (int j = 0; j < numVectorALUs; ++j) {
|
||||
FetchUnit newFetchUnit(p, cu);
|
||||
@@ -55,7 +55,7 @@ void
|
||||
FetchStage::init()
|
||||
{
|
||||
for (int j = 0; j < numVectorALUs; ++j) {
|
||||
_fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
|
||||
_fetchUnit[j].bindWaveList(&computeUnit.wfList[j]);
|
||||
_fetchUnit[j].init();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ class Wavefront;
|
||||
class FetchStage
|
||||
{
|
||||
public:
|
||||
FetchStage(const ComputeUnitParams* p, ComputeUnit *cu);
|
||||
FetchStage(const ComputeUnitParams* p, ComputeUnit &cu);
|
||||
~FetchStage();
|
||||
void init();
|
||||
void exec();
|
||||
@@ -59,19 +59,19 @@ class FetchStage
|
||||
void fetch(PacketPtr pkt, Wavefront *wave);
|
||||
|
||||
// Stats related variables and methods
|
||||
std::string name() { return _name; }
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
Stats::Distribution instFetchInstReturned;
|
||||
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
|
||||
|
||||
private:
|
||||
int numVectorALUs;
|
||||
ComputeUnit *computeUnit;
|
||||
ComputeUnit &computeUnit;
|
||||
|
||||
// List of fetch units. A fetch unit is
|
||||
// instantiated per VALU/SIMD
|
||||
std::vector<FetchUnit> _fetchUnit;
|
||||
std::string _name;
|
||||
const std::string _name;
|
||||
};
|
||||
|
||||
#endif // __FETCH_STAGE_HH__
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
|
||||
uint32_t FetchUnit::globalFetchUnitID;
|
||||
|
||||
FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit *cu)
|
||||
FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit &cu)
|
||||
: timingSim(true), computeUnit(cu), fetchScheduler(p),
|
||||
waveList(nullptr), fetchDepth(p->fetch_depth)
|
||||
{
|
||||
@@ -60,16 +60,16 @@ FetchUnit::~FetchUnit()
|
||||
void
|
||||
FetchUnit::init()
|
||||
{
|
||||
timingSim = computeUnit->shader->timingSim;
|
||||
timingSim = computeUnit.shader->timingSim;
|
||||
fetchQueue.clear();
|
||||
fetchStatusQueue.resize(computeUnit->shader->n_wf);
|
||||
fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
|
||||
fetchStatusQueue.resize(computeUnit.shader->n_wf);
|
||||
fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc());
|
||||
|
||||
for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
|
||||
for (int i = 0; i < computeUnit.shader->n_wf; ++i) {
|
||||
Wavefront *wf = waveList->at(i);
|
||||
assert(wf->wfSlotId == i);
|
||||
fetchStatusQueue[i] = std::make_pair(wf, false);
|
||||
fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
|
||||
fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf);
|
||||
fetchBuf[i].decoder(&decoder);
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ FetchUnit::exec()
|
||||
}
|
||||
|
||||
// re-evaluate waves which are marked as not ready for fetch
|
||||
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
|
||||
for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
|
||||
// Following code assumes 64-bit operation and all insts are
|
||||
// represented by 64-bit pointers to inst objects.
|
||||
Wavefront *curWave = fetchStatusQueue[j].first;
|
||||
@@ -143,7 +143,7 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
|
||||
|
||||
// this should already be aligned to a cache line
|
||||
assert(vaddr == makeLineAddress(vaddr,
|
||||
computeUnit->getCacheLineBits()));
|
||||
computeUnit.getCacheLineBits()));
|
||||
|
||||
// shouldn't be fetching a line that is already buffered
|
||||
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
|
||||
@@ -151,16 +151,16 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
|
||||
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
|
||||
|
||||
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
|
||||
"from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
|
||||
"from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId,
|
||||
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
|
||||
|
||||
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
|
||||
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
|
||||
|
||||
// set up virtual request
|
||||
RequestPtr req = std::make_shared<Request>(
|
||||
vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
|
||||
computeUnit->masterId(), 0, 0, nullptr);
|
||||
vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH,
|
||||
computeUnit.masterId(), 0, 0, nullptr);
|
||||
|
||||
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
|
||||
|
||||
@@ -171,36 +171,36 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
|
||||
// Sender State needed by TLB hierarchy
|
||||
pkt->senderState =
|
||||
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
|
||||
computeUnit->shader->gpuTc,
|
||||
computeUnit.shader->gpuTc,
|
||||
false, pkt->senderState);
|
||||
|
||||
if (computeUnit->sqcTLBPort->isStalled()) {
|
||||
assert(computeUnit->sqcTLBPort->retries.size() > 0);
|
||||
if (computeUnit.sqcTLBPort->isStalled()) {
|
||||
assert(computeUnit.sqcTLBPort->retries.size() > 0);
|
||||
|
||||
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
|
||||
vaddr);
|
||||
|
||||
computeUnit->sqcTLBPort->retries.push_back(pkt);
|
||||
} else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
|
||||
computeUnit.sqcTLBPort->retries.push_back(pkt);
|
||||
} else if (!computeUnit.sqcTLBPort->sendTimingReq(pkt)) {
|
||||
// Stall the data port;
|
||||
// No more packet is issued till
|
||||
// ruby indicates resources are freed by
|
||||
// a recvReqRetry() call back on this port.
|
||||
computeUnit->sqcTLBPort->stallPort();
|
||||
computeUnit.sqcTLBPort->stallPort();
|
||||
|
||||
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
|
||||
vaddr);
|
||||
|
||||
computeUnit->sqcTLBPort->retries.push_back(pkt);
|
||||
computeUnit.sqcTLBPort->retries.push_back(pkt);
|
||||
} else {
|
||||
DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
|
||||
}
|
||||
} else {
|
||||
pkt->senderState =
|
||||
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
|
||||
computeUnit->shader->gpuTc);
|
||||
computeUnit.shader->gpuTc);
|
||||
|
||||
computeUnit->sqcTLBPort->sendFunctional(pkt);
|
||||
computeUnit.sqcTLBPort->sendFunctional(pkt);
|
||||
|
||||
TheISA::GpuTLB::TranslationState *sender_state =
|
||||
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
||||
@@ -220,7 +220,7 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
|
||||
assert(pkt->req->hasSize());
|
||||
|
||||
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
pkt->req->getPaddr());
|
||||
|
||||
/**
|
||||
@@ -257,20 +257,20 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
|
||||
if (timingSim) {
|
||||
// translation is done. Send the appropriate timing memory request.
|
||||
|
||||
if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
|
||||
computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
|
||||
if (!computeUnit.sqcPort->sendTimingReq(pkt)) {
|
||||
computeUnit.sqcPort->retries.push_back(std::make_pair(pkt,
|
||||
wavefront));
|
||||
|
||||
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
pkt->req->getPaddr());
|
||||
} else {
|
||||
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
|
||||
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
|
||||
pkt->req->getPaddr());
|
||||
}
|
||||
} else {
|
||||
computeUnit->sqcPort->sendFunctional(pkt);
|
||||
computeUnit.sqcPort->sendFunctional(pkt);
|
||||
processFetchReturn(pkt);
|
||||
}
|
||||
}
|
||||
@@ -284,7 +284,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
|
||||
Wavefront *wavefront = sender_state->wavefront;
|
||||
|
||||
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
|
||||
"%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
|
||||
"%d bytes!\n", computeUnit.cu_id, wavefront->simdId,
|
||||
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
|
||||
|
||||
if (wavefront->dropFetch) {
|
||||
@@ -553,7 +553,7 @@ FetchUnit::FetchBufDesc::decodeInsts()
|
||||
= std::make_shared<GPUDynInst>(wavefront->computeUnit,
|
||||
wavefront, gpu_static_inst,
|
||||
wavefront->computeUnit->
|
||||
getAndIncSeqNum());
|
||||
getAndIncSeqNum());
|
||||
wavefront->instructionBuffer.push_back(gpu_dyn_inst);
|
||||
|
||||
DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
|
||||
|
||||
@@ -49,7 +49,7 @@ class Wavefront;
|
||||
class FetchUnit
|
||||
{
|
||||
public:
|
||||
FetchUnit(const ComputeUnitParams* p, ComputeUnit *cu);
|
||||
FetchUnit(const ComputeUnitParams* p, ComputeUnit &cu);
|
||||
~FetchUnit();
|
||||
void init();
|
||||
void exec();
|
||||
@@ -234,7 +234,7 @@ class FetchUnit
|
||||
};
|
||||
|
||||
bool timingSim;
|
||||
ComputeUnit *computeUnit;
|
||||
ComputeUnit &computeUnit;
|
||||
TheGpuISA::Decoder decoder;
|
||||
|
||||
// Fetch scheduler; Selects one wave from
|
||||
|
||||
@@ -44,8 +44,8 @@
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
|
||||
ComputeUnit *cu)
|
||||
: computeUnit(cu), _name(cu->name() + ".GlobalMemPipeline"),
|
||||
ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
|
||||
gmQueueSize(p->global_mem_queue_size),
|
||||
maxWaveRequests(p->max_wave_requests), inflightStores(0),
|
||||
inflightLoads(0)
|
||||
@@ -55,7 +55,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
|
||||
void
|
||||
GlobalMemPipeline::init()
|
||||
{
|
||||
globalMemSize = computeUnit->shader->globalMemSize;
|
||||
globalMemSize = computeUnit.shader->globalMemSize;
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -121,9 +121,9 @@ GlobalMemPipeline::exec()
|
||||
|
||||
}
|
||||
|
||||
if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
|
||||
accessVrf && (computeUnit->shader->coissue_return ||
|
||||
computeUnit->vectorGlobalMemUnit.rdy())) {
|
||||
if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
|
||||
accessVrf && (computeUnit.shader->coissue_return ||
|
||||
computeUnit.vectorGlobalMemUnit.rdy())) {
|
||||
|
||||
w = m->wavefront();
|
||||
|
||||
@@ -141,16 +141,16 @@ GlobalMemPipeline::exec()
|
||||
Tick accessTime = curTick() - m->getAccessTime();
|
||||
|
||||
// Decrement outstanding requests count
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
|
||||
if (m->isStore() || m->isAtomic() || m->isMemSync()) {
|
||||
computeUnit->shader->sampleStore(accessTime);
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
|
||||
computeUnit.shader->sampleStore(accessTime);
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
|
||||
computeUnit->shader->sampleLoad(accessTime);
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
|
||||
computeUnit.shader->sampleLoad(accessTime);
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
@@ -160,12 +160,12 @@ GlobalMemPipeline::exec()
|
||||
// going all the way to memory and stats for individual cache
|
||||
// blocks generated by the instruction.
|
||||
m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
|
||||
computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
|
||||
computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
|
||||
computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
|
||||
computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
|
||||
|
||||
// Mark write bus busy for appropriate amount of time
|
||||
computeUnit->glbMemToVrfBus.set(m->time);
|
||||
if (!computeUnit->shader->coissue_return)
|
||||
computeUnit.glbMemToVrfBus.set(m->time);
|
||||
if (!computeUnit.shader->coissue_return)
|
||||
w->computeUnit->vectorGlobalMemUnit.set(m->time);
|
||||
}
|
||||
|
||||
@@ -217,13 +217,13 @@ GlobalMemPipeline::exec()
|
||||
* correctly.
|
||||
*/
|
||||
handleResponse(mp);
|
||||
computeUnit->getTokenManager()->recvTokens(1);
|
||||
computeUnit.getTokenManager()->recvTokens(1);
|
||||
}
|
||||
|
||||
gmIssuedRequests.pop();
|
||||
|
||||
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
|
||||
computeUnit->cu_id, mp->simdId, mp->wfSlotId);
|
||||
computeUnit.cu_id, mp->simdId, mp->wfSlotId);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ class ComputeUnit;
|
||||
class GlobalMemPipeline
|
||||
{
|
||||
public:
|
||||
GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
|
||||
GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
|
||||
void init();
|
||||
void exec();
|
||||
|
||||
@@ -108,8 +108,8 @@ class GlobalMemPipeline
|
||||
void acqCoalescerToken(GPUDynInstPtr mp);
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
std::string _name;
|
||||
ComputeUnit &computeUnit;
|
||||
const std::string _name;
|
||||
int gmQueueSize;
|
||||
int maxWaveRequests;
|
||||
|
||||
|
||||
@@ -41,8 +41,8 @@
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit *cu)
|
||||
: computeUnit(cu), _name(cu->name() + ".LocalMemPipeline"),
|
||||
LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
|
||||
lmQueueSize(p->local_mem_queue_size)
|
||||
{
|
||||
}
|
||||
@@ -66,9 +66,9 @@ LocalMemPipeline::exec()
|
||||
}
|
||||
|
||||
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
|
||||
computeUnit->locMemToVrfBus.rdy()
|
||||
&& (computeUnit->shader->coissue_return
|
||||
|| computeUnit->vectorSharedMemUnit.rdy())) {
|
||||
computeUnit.locMemToVrfBus.rdy()
|
||||
&& (computeUnit.shader->coissue_return
|
||||
|| computeUnit.vectorSharedMemUnit.rdy())) {
|
||||
|
||||
lmReturnedRequests.pop();
|
||||
w = m->wavefront();
|
||||
@@ -83,21 +83,21 @@ LocalMemPipeline::exec()
|
||||
}
|
||||
|
||||
// Decrement outstanding request count
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
|
||||
|
||||
if (m->isStore() || m->isAtomic()) {
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrLm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
if (m->isLoad() || m->isAtomic()) {
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdLm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
// Mark write bus busy for appropriate amount of time
|
||||
computeUnit->locMemToVrfBus.set(m->time);
|
||||
if (computeUnit->shader->coissue_return == 0)
|
||||
computeUnit.locMemToVrfBus.set(m->time);
|
||||
if (computeUnit.shader->coissue_return == 0)
|
||||
w->computeUnit->vectorSharedMemUnit.set(m->time);
|
||||
}
|
||||
|
||||
@@ -108,7 +108,7 @@ LocalMemPipeline::exec()
|
||||
|
||||
GPUDynInstPtr m = lmIssuedRequests.front();
|
||||
|
||||
bool returnVal = computeUnit->sendToLds(m);
|
||||
bool returnVal = computeUnit.sendToLds(m);
|
||||
if (!returnVal) {
|
||||
DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
|
||||
}
|
||||
|
||||
@@ -55,7 +55,7 @@ class Wavefront;
|
||||
class LocalMemPipeline
|
||||
{
|
||||
public:
|
||||
LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
|
||||
LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
|
||||
void exec();
|
||||
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
|
||||
|
||||
@@ -84,8 +84,8 @@ class LocalMemPipeline
|
||||
}
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
std::string _name;
|
||||
ComputeUnit &computeUnit;
|
||||
const std::string _name;
|
||||
int lmQueueSize;
|
||||
Stats::Scalar loadVrfBankConflictCycles;
|
||||
// Local Memory Request Fifo: all shared memory requests
|
||||
|
||||
@@ -44,8 +44,8 @@
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p,
|
||||
ComputeUnit *cu)
|
||||
: computeUnit(cu), _name(cu->name() + ".ScalarMemPipeline"),
|
||||
ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".ScalarMemPipeline"),
|
||||
queueSize(p->scalar_mem_queue_size),
|
||||
inflightStores(0), inflightLoads(0)
|
||||
{
|
||||
@@ -72,10 +72,10 @@ ScalarMemPipeline::exec()
|
||||
}
|
||||
|
||||
if ((!returnedStores.empty() || !returnedLoads.empty()) &&
|
||||
m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
|
||||
m->latency.rdy() && computeUnit.scalarMemToSrfBus.rdy() &&
|
||||
accessSrf &&
|
||||
(computeUnit->shader->coissue_return ||
|
||||
computeUnit->scalarMemUnit.rdy())) {
|
||||
(computeUnit.shader->coissue_return ||
|
||||
computeUnit.scalarMemUnit.rdy())) {
|
||||
|
||||
w = m->wavefront();
|
||||
|
||||
@@ -97,21 +97,21 @@ ScalarMemPipeline::exec()
|
||||
}
|
||||
|
||||
// Decrement outstanding request count
|
||||
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
|
||||
computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
|
||||
|
||||
if (m->isStore() || m->isAtomic()) {
|
||||
computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
|
||||
computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
if (m->isLoad() || m->isAtomic()) {
|
||||
computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
|
||||
computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
|
||||
m->time, -1);
|
||||
}
|
||||
|
||||
// Mark write bus busy for appropriate amount of time
|
||||
computeUnit->scalarMemToSrfBus.set(m->time);
|
||||
if (!computeUnit->shader->coissue_return)
|
||||
computeUnit.scalarMemToSrfBus.set(m->time);
|
||||
if (!computeUnit.shader->coissue_return)
|
||||
w->computeUnit->scalarMemUnit.set(m->time);
|
||||
}
|
||||
|
||||
@@ -138,7 +138,7 @@ ScalarMemPipeline::exec()
|
||||
issuedRequests.pop();
|
||||
|
||||
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
|
||||
computeUnit->cu_id, mp->simdId, mp->wfSlotId);
|
||||
computeUnit.cu_id, mp->simdId, mp->wfSlotId);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ class ComputeUnit;
|
||||
class ScalarMemPipeline
|
||||
{
|
||||
public:
|
||||
ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
|
||||
ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
|
||||
void exec();
|
||||
|
||||
std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
|
||||
@@ -84,12 +84,12 @@ class ScalarMemPipeline
|
||||
return (issuedRequests.size() + pendReqs) < queueSize;
|
||||
}
|
||||
|
||||
const std::string &name() const { return _name; }
|
||||
const std::string& name() const { return _name; }
|
||||
void regStats();
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
std::string _name;
|
||||
ComputeUnit &computeUnit;
|
||||
const std::string _name;
|
||||
int queueSize;
|
||||
|
||||
// Counters to track and limit the inflight scalar loads and stores
|
||||
|
||||
@@ -43,17 +43,17 @@
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
|
||||
: computeUnit(cu), _name(cu->name() + ".ScheduleStage"),
|
||||
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
|
||||
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
|
||||
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
|
||||
locMemBusRdy(false), locMemIssueRdy(false)
|
||||
{
|
||||
for (int j = 0; j < cu->numExeUnits(); ++j) {
|
||||
for (int j = 0; j < cu.numExeUnits(); ++j) {
|
||||
scheduler.emplace_back(p);
|
||||
}
|
||||
wavesInSch.clear();
|
||||
schList.resize(cu->numExeUnits());
|
||||
schList.resize(cu.numExeUnits());
|
||||
for (auto &dq : schList) {
|
||||
dq.clear();
|
||||
}
|
||||
@@ -70,36 +70,36 @@ void
|
||||
ScheduleStage::init()
|
||||
{
|
||||
|
||||
fatal_if(scheduler.size() != computeUnit->readyList.size(),
|
||||
fatal_if(scheduler.size() != computeUnit.readyList.size(),
|
||||
"Scheduler should have same number of entries as CU's readyList");
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
scheduler[j].bindList(&computeUnit->readyList[j]);
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
scheduler[j].bindList(&computeUnit.readyList[j]);
|
||||
}
|
||||
|
||||
dispatchList = &computeUnit->dispatchList;
|
||||
dispatchList = &computeUnit.dispatchList;
|
||||
|
||||
assert(computeUnit->numVectorGlobalMemUnits == 1);
|
||||
assert(computeUnit->numVectorSharedMemUnits == 1);
|
||||
assert(computeUnit.numVectorGlobalMemUnits == 1);
|
||||
assert(computeUnit.numVectorSharedMemUnits == 1);
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::exec()
|
||||
{
|
||||
// Update readyList
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
// delete all ready wavefronts whose instruction buffers are now
|
||||
// empty because the last instruction was executed
|
||||
computeUnit->updateReadyList(j);
|
||||
computeUnit.updateReadyList(j);
|
||||
/**
|
||||
* Remove any wave that already has an instruction present in SCH
|
||||
* waiting for RF reads to complete. This prevents out of order
|
||||
* execution within a wave.
|
||||
*/
|
||||
for (auto wIt = computeUnit->readyList.at(j).begin();
|
||||
wIt != computeUnit->readyList.at(j).end();) {
|
||||
for (auto wIt = computeUnit.readyList.at(j).begin();
|
||||
wIt != computeUnit.readyList.at(j).end();) {
|
||||
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
|
||||
*wIt = nullptr;
|
||||
wIt = computeUnit->readyList.at(j).erase(wIt);
|
||||
wIt = computeUnit.readyList.at(j).erase(wIt);
|
||||
} else {
|
||||
wIt++;
|
||||
}
|
||||
@@ -112,10 +112,10 @@ ScheduleStage::exec()
|
||||
// Scalar Memory are iterated after VMEM
|
||||
|
||||
// Iterate VMEM and SMEM
|
||||
int firstMemUnit = computeUnit->firstMemUnit();
|
||||
int lastMemUnit = computeUnit->lastMemUnit();
|
||||
int firstMemUnit = computeUnit.firstMemUnit();
|
||||
int lastMemUnit = computeUnit.lastMemUnit();
|
||||
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
|
||||
int readyListSize = computeUnit->readyList[j].size();
|
||||
int readyListSize = computeUnit.readyList[j].size();
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
@@ -135,12 +135,12 @@ ScheduleStage::exec()
|
||||
}
|
||||
|
||||
// Iterate everything else
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
// skip the VMEM resources
|
||||
if (j >= firstMemUnit && j <= lastMemUnit) {
|
||||
continue;
|
||||
}
|
||||
int readyListSize = computeUnit->readyList[j].size();
|
||||
int readyListSize = computeUnit.readyList[j].size();
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
@@ -205,16 +205,16 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
|
||||
bool accessVrfWr = true;
|
||||
if (!ii->isScalar()) {
|
||||
accessVrfWr =
|
||||
computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
|
||||
computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
|
||||
}
|
||||
bool accessSrfWr =
|
||||
computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);
|
||||
computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
|
||||
bool accessRf = accessVrfWr && accessSrfWr;
|
||||
if (accessRf) {
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
|
||||
computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
|
||||
}
|
||||
computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
|
||||
computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
|
||||
return true;
|
||||
} else {
|
||||
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
|
||||
@@ -235,7 +235,7 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
|
||||
void
|
||||
ScheduleStage::scheduleRfDestOperands()
|
||||
{
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
if (!dispatchList->at(j).first) {
|
||||
continue;
|
||||
}
|
||||
@@ -269,10 +269,10 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w)
|
||||
bool accessVrf = true;
|
||||
if (!ii->isScalar()) {
|
||||
accessVrf =
|
||||
computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
|
||||
computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
|
||||
}
|
||||
bool accessSrf =
|
||||
computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
|
||||
computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
|
||||
// If RFs can support instruction, add to schList in RFBUSY state,
|
||||
// place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
|
||||
// to the VRF
|
||||
@@ -282,16 +282,16 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w)
|
||||
exeType, w->simdId, w->wfDynId,
|
||||
ii->seqNum(), ii->disassemble());
|
||||
|
||||
computeUnit->insertInPipeMap(w);
|
||||
computeUnit.insertInPipeMap(w);
|
||||
wavesInSch.emplace(w->wfDynId);
|
||||
schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
|
||||
if (w->isOldestInstWaitcnt()) {
|
||||
w->setStatus(Wavefront::S_WAITCNT);
|
||||
}
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
|
||||
computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
|
||||
}
|
||||
computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);
|
||||
computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);
|
||||
|
||||
DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
|
||||
exeType, w->simdId, w->wfDynId,
|
||||
@@ -341,33 +341,33 @@ ScheduleStage::checkMemResources()
|
||||
scalarMemBusRdy = false;
|
||||
scalarMemIssueRdy = false;
|
||||
// check if there is a SRF->Global Memory bus available and
|
||||
if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) {
|
||||
if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
|
||||
scalarMemBusRdy = true;
|
||||
}
|
||||
// check if we can issue a scalar memory instruction
|
||||
if (computeUnit->scalarMemUnit.rdy(Cycles(1))) {
|
||||
if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
|
||||
scalarMemIssueRdy = true;
|
||||
}
|
||||
|
||||
glbMemBusRdy = false;
|
||||
glbMemIssueRdy = false;
|
||||
// check if there is a VRF->Global Memory bus available
|
||||
if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
|
||||
if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
|
||||
glbMemBusRdy = true;
|
||||
}
|
||||
// check if we can issue a Global memory instruction
|
||||
if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) {
|
||||
if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
|
||||
glbMemIssueRdy = true;
|
||||
}
|
||||
|
||||
locMemBusRdy = false;
|
||||
locMemIssueRdy = false;
|
||||
// check if there is a VRF->LDS bus available
|
||||
if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) {
|
||||
if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
|
||||
locMemBusRdy = true;
|
||||
}
|
||||
// check if we can issue a LDS instruction
|
||||
if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) {
|
||||
if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
|
||||
locMemIssueRdy = true;
|
||||
}
|
||||
}
|
||||
@@ -378,10 +378,10 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
vectorAluRdy = false;
|
||||
scalarAluRdy = false;
|
||||
// check for available vector/scalar ALUs in the next cycle
|
||||
if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
|
||||
if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
|
||||
vectorAluRdy = true;
|
||||
}
|
||||
if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
|
||||
if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
|
||||
scalarAluRdy = true;
|
||||
}
|
||||
GPUDynInstPtr ii = w->instructionBuffer.front();
|
||||
@@ -423,11 +423,11 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
|
||||
}
|
||||
@@ -445,7 +445,7 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->scalarMemoryPipe.
|
||||
if (!computeUnit.scalarMemoryPipe.
|
||||
isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
|
||||
w->scalarWrGmReqsInPipe)) {
|
||||
rdy = false;
|
||||
@@ -465,7 +465,7 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->localMemoryPipe.
|
||||
if (!computeUnit.localMemoryPipe.
|
||||
isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
|
||||
@@ -484,15 +484,15 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
|
||||
}
|
||||
if (!computeUnit->localMemoryPipe.
|
||||
if (!computeUnit.localMemoryPipe.
|
||||
isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
|
||||
@@ -514,7 +514,7 @@ ScheduleStage::fillDispatchList()
|
||||
// update execution resource status
|
||||
checkMemResources();
|
||||
// iterate execution resources
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); j++) {
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); j++) {
|
||||
assert(dispatchList->at(j).second == EMPTY);
|
||||
|
||||
// iterate waves in schList to pick one for dispatch
|
||||
@@ -537,7 +537,7 @@ ScheduleStage::fillDispatchList()
|
||||
instructionBuffer.front();
|
||||
if (!mp->isMemSync() && !mp->isScalar() &&
|
||||
(mp->isGlobalMem() || mp->isFlat())) {
|
||||
computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
|
||||
computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
|
||||
}
|
||||
|
||||
doDispatchListTransition(j, EXREADY, schIter->first);
|
||||
@@ -581,9 +581,9 @@ ScheduleStage::arbitrateVrfToLdsBus()
|
||||
// and a VRF->LDS bus. In GFx9, this is not the case.
|
||||
|
||||
// iterate the GM pipelines
|
||||
for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
|
||||
for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
|
||||
// get the GM pipe index in the dispatchList
|
||||
int gm_exe_unit = computeUnit->firstMemUnit() + i;
|
||||
int gm_exe_unit = computeUnit.firstMemUnit() + i;
|
||||
// get the wave in the dispatchList
|
||||
Wavefront *w = dispatchList->at(gm_exe_unit).first;
|
||||
// If the WF is valid, ready to execute, and the instruction
|
||||
@@ -617,7 +617,7 @@ ScheduleStage::checkRfOperandReadComplete()
|
||||
// Iterate the schList queues and check if operand reads
|
||||
// have completed in the RFs. If so, mark the wave as ready for
|
||||
// selection for dispatchList
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
for (auto &p : schList.at(j)) {
|
||||
Wavefront *w = p.first;
|
||||
assert(w);
|
||||
@@ -630,10 +630,10 @@ ScheduleStage::checkRfOperandReadComplete()
|
||||
bool vrfRdy = true;
|
||||
if (!ii->isScalar()) {
|
||||
vrfRdy =
|
||||
computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
|
||||
computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
|
||||
}
|
||||
bool srfRdy =
|
||||
computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
|
||||
computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
|
||||
bool operandsReady = vrfRdy && srfRdy;
|
||||
if (operandsReady) {
|
||||
DPRINTF(GPUSched,
|
||||
@@ -671,9 +671,9 @@ void
|
||||
ScheduleStage::reserveResources()
|
||||
{
|
||||
std::vector<bool> exeUnitReservations;
|
||||
exeUnitReservations.resize(computeUnit->numExeUnits(), false);
|
||||
exeUnitReservations.resize(computeUnit.numExeUnits(), false);
|
||||
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
Wavefront *dispatchedWave = dispatchList->at(j).first;
|
||||
if (dispatchedWave) {
|
||||
DISPATCH_STATUS s = dispatchList->at(j).second;
|
||||
@@ -686,10 +686,10 @@ ScheduleStage::reserveResources()
|
||||
GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
|
||||
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit->vrf[dispatchedWave->simdId]->
|
||||
computeUnit.vrf[dispatchedWave->simdId]->
|
||||
dispatchInstruction(ii);
|
||||
}
|
||||
computeUnit->srf[dispatchedWave->simdId]->
|
||||
computeUnit.srf[dispatchedWave->simdId]->
|
||||
dispatchInstruction(ii);
|
||||
|
||||
std::stringstream ss;
|
||||
@@ -743,35 +743,35 @@ void
|
||||
ScheduleStage::regStats()
|
||||
{
|
||||
rdyListNotEmpty
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".rdy_list_not_empty")
|
||||
.desc("number of cycles one or more wave on ready list per "
|
||||
"execution resource")
|
||||
;
|
||||
|
||||
rdyListEmpty
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".rdy_list_empty")
|
||||
.desc("number of cycles no wave on ready list per "
|
||||
"execution resource")
|
||||
;
|
||||
|
||||
addToSchListStalls
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".sch_list_add_stalls")
|
||||
.desc("number of cycles a wave is not added to schList per "
|
||||
"execution resource when ready list is not empty")
|
||||
;
|
||||
|
||||
schListToDispList
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".sch_list_to_disp_list")
|
||||
.desc("number of cycles a wave is added to dispatchList per "
|
||||
"execution resource")
|
||||
;
|
||||
|
||||
schListToDispListStalls
|
||||
.init(computeUnit->numExeUnits())
|
||||
.init(computeUnit.numExeUnits())
|
||||
.name(name() + ".sch_list_to_disp_list_stalls")
|
||||
.desc("number of cycles no wave is added to dispatchList per "
|
||||
"execution resource")
|
||||
|
||||
@@ -57,13 +57,13 @@ struct ComputeUnitParams;
|
||||
class ScheduleStage
|
||||
{
|
||||
public:
|
||||
ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu);
|
||||
ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu);
|
||||
~ScheduleStage();
|
||||
void init();
|
||||
void exec();
|
||||
|
||||
// Stats related variables and methods
|
||||
std::string name() { return _name; }
|
||||
const std::string& name() const { return _name; }
|
||||
enum SchNonRdyType {
|
||||
SCH_SCALAR_ALU_NRDY,
|
||||
SCH_VECTOR_ALU_NRDY,
|
||||
@@ -114,7 +114,7 @@ class ScheduleStage
|
||||
};
|
||||
|
||||
private:
|
||||
ComputeUnit *computeUnit;
|
||||
ComputeUnit &computeUnit;
|
||||
// Each execution resource will have its own
|
||||
// scheduler and a dispatch list
|
||||
std::vector<Scheduler> scheduler;
|
||||
@@ -168,7 +168,7 @@ class ScheduleStage
|
||||
// to dispatchList
|
||||
Stats::Vector dispNrdyStalls;
|
||||
|
||||
std::string _name;
|
||||
const std::string _name;
|
||||
|
||||
// called by exec() to add a wave to schList if the RFs can support it
|
||||
bool addToSchList(int exeType, Wavefront *w);
|
||||
|
||||
@@ -45,8 +45,8 @@
|
||||
#include "params/ComputeUnit.hh"
|
||||
|
||||
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p,
|
||||
ComputeUnit *cu)
|
||||
: computeUnit(cu), _name(cu->name() + ".ScoreboardCheckStage")
|
||||
ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage")
|
||||
{
|
||||
}
|
||||
|
||||
@@ -58,8 +58,8 @@ ScoreboardCheckStage::~ScoreboardCheckStage()
|
||||
void
|
||||
ScoreboardCheckStage::init()
|
||||
{
|
||||
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
|
||||
readyList.push_back(&computeUnit->readyList[unitId]);
|
||||
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
|
||||
readyList.push_back(&computeUnit.readyList[unitId]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,7 +104,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
if (w->getStatus() == Wavefront::S_BARRIER) {
|
||||
assert(w->hasBarrier());
|
||||
int bar_id = w->barrierId();
|
||||
if (!computeUnit->allAtBarrier(bar_id)) {
|
||||
if (!computeUnit.allAtBarrier(bar_id)) {
|
||||
DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at "
|
||||
"barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id,
|
||||
w->simdId, w->wfSlotId, w->wfDynId, bar_id,
|
||||
@@ -116,8 +116,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier "
|
||||
"Id%d. Resetting barrier resources.\n", w->computeUnit->cu_id,
|
||||
w->simdId, w->wfSlotId, w->wfDynId, bar_id);
|
||||
computeUnit->resetBarrier(bar_id);
|
||||
computeUnit->releaseWFsFromBarrier(bar_id);
|
||||
computeUnit.resetBarrier(bar_id);
|
||||
computeUnit.releaseWFsFromBarrier(bar_id);
|
||||
}
|
||||
|
||||
// Check WF status: it has to be running
|
||||
@@ -154,17 +154,17 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
}
|
||||
|
||||
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
|
||||
computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());
|
||||
computeUnit.cu_id, w->simdId, w->wfSlotId, ii->disassemble());
|
||||
|
||||
// Non-scalar (i.e., vector) instructions may use VGPRs
|
||||
if (!ii->isScalar()) {
|
||||
if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
|
||||
if (!computeUnit.vrf[w->simdId]->operandsReady(w, ii)) {
|
||||
*rdyStatus = NRDY_VGPR_NRDY;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Scalar and non-scalar instructions may use SGPR
|
||||
if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
|
||||
if (!computeUnit.srf[w->simdId]->operandsReady(w, ii)) {
|
||||
*rdyStatus = NRDY_SGPR_NRDY;
|
||||
return false;
|
||||
}
|
||||
@@ -190,7 +190,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
return false;
|
||||
}
|
||||
}
|
||||
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
|
||||
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit.cu_id,
|
||||
w->simdId, w->wfSlotId, ii->disassemble());
|
||||
*exeResType = mapWaveToExeUnit(w);
|
||||
*rdyStatus = INST_RDY;
|
||||
@@ -236,7 +236,7 @@ ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
|
||||
}
|
||||
}
|
||||
panic("%s: unmapped to an execution resource", ii->disassemble());
|
||||
return computeUnit->numExeUnits();
|
||||
return computeUnit.numExeUnits();
|
||||
}
|
||||
|
||||
void
|
||||
@@ -244,7 +244,7 @@ ScoreboardCheckStage::exec()
|
||||
{
|
||||
// reset the ready list for all execution units; it will be
|
||||
// constructed every cycle since resource availability may change
|
||||
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
|
||||
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
|
||||
// Reset wavefront pointers to nullptr so clear() on the vector
|
||||
// does not accidentally destruct the wavefront object
|
||||
for (int i = 0; i < readyList[unitId]->size(); i++) {
|
||||
@@ -253,10 +253,10 @@ ScoreboardCheckStage::exec()
|
||||
readyList[unitId]->clear();
|
||||
}
|
||||
// iterate over all WF slots across all vector ALUs
|
||||
for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
|
||||
for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
|
||||
for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) {
|
||||
for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) {
|
||||
// reset the ready status of each wavefront
|
||||
Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
|
||||
Wavefront *curWave = computeUnit.wfList[simdId][wfSlot];
|
||||
nonrdytype_e rdyStatus = NRDY_ILLEGAL;
|
||||
int exeResType = -1;
|
||||
// check WF readiness: If the WF's oldest
|
||||
|
||||
@@ -70,7 +70,7 @@ class ScoreboardCheckStage
|
||||
NRDY_CONDITIONS
|
||||
};
|
||||
|
||||
ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit *cu);
|
||||
ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu);
|
||||
~ScoreboardCheckStage();
|
||||
void init();
|
||||
void exec();
|
||||
@@ -84,7 +84,7 @@ class ScoreboardCheckStage
|
||||
int mapWaveToExeUnit(Wavefront *w);
|
||||
bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
|
||||
int *exeResType, int wfSlot);
|
||||
ComputeUnit *computeUnit;
|
||||
ComputeUnit &computeUnit;
|
||||
|
||||
// List of waves which are ready to be scheduled.
|
||||
// Each execution resource has a ready list
|
||||
@@ -93,7 +93,7 @@ class ScoreboardCheckStage
|
||||
// Stats
|
||||
Stats::Vector stallCycles;
|
||||
|
||||
std::string _name;
|
||||
const std::string _name;
|
||||
};
|
||||
|
||||
#endif // __SCOREBOARD_CHECK_STAGE_HH__
|
||||
|
||||
Reference in New Issue
Block a user