gpu-compute: Add pipeline stage interface classes
This change separates the pipeline stage interfaces for the GPU's compute unit into their own classes with a well-defined interface. This helps to create a cleaner interface for users to extend the CU pipeline's capabilities and also helps consolidate all the pipeline communication code in one place in the source. Change-Id: I569d52bce84dc1b9fbf8f0f96d53a81a2b6773c6 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29972 Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com> Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Anthony Gutierrez
parent
6655161037
commit
63c76448eb
@@ -41,6 +41,7 @@ SimObject('GPUStaticInstFlags.py')
|
||||
SimObject('LdsState.py')
|
||||
SimObject('X86GPUTLB.py')
|
||||
|
||||
Source('comm.cc')
|
||||
Source('compute_unit.cc')
|
||||
Source('dispatcher.cc')
|
||||
Source('exec_stage.cc')
|
||||
|
||||
154
src/gpu-compute/comm.cc
Normal file
154
src/gpu-compute/comm.cc
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Authors: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/comm.hh"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "params/ComputeUnit.hh"
|
||||
|
||||
/**
|
||||
* Scoreboard/Schedule stage interface.
|
||||
*/
|
||||
ScoreboardCheckToSchedule::ScoreboardCheckToSchedule(const ComputeUnitParams
|
||||
*p)
|
||||
{
|
||||
int num_func_units = p->num_SIMDs + p->num_scalar_cores
|
||||
+ p->num_global_mem_pipes + p->num_shared_mem_pipes
|
||||
+ p->num_scalar_mem_pipes;
|
||||
_readyWFs.resize(num_func_units);
|
||||
|
||||
for (auto &func_unit_wf_list : _readyWFs) {
|
||||
func_unit_wf_list.reserve(p->n_wf);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckToSchedule::reset()
|
||||
{
|
||||
for (auto &func_unit_wf_list : _readyWFs) {
|
||||
func_unit_wf_list.resize(0);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckToSchedule::markWFReady(Wavefront *wf, int func_unit_id)
|
||||
{
|
||||
_readyWFs[func_unit_id].push_back(wf);
|
||||
}
|
||||
|
||||
int
|
||||
ScoreboardCheckToSchedule::numReadyLists() const
|
||||
{
|
||||
return _readyWFs.size();
|
||||
}
|
||||
|
||||
std::vector<Wavefront*>&
|
||||
ScoreboardCheckToSchedule::readyWFs(int func_unit_id)
|
||||
{
|
||||
return _readyWFs[func_unit_id];
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all wavefronts that have been marked as ready at scoreboard stage
|
||||
* but are found to have empty instruction buffers at schedule stage.
|
||||
*/
|
||||
void
|
||||
ScoreboardCheckToSchedule::updateReadyList(int func_unit_id)
|
||||
{
|
||||
std::vector<Wavefront*> &func_unit_wf_list = _readyWFs[func_unit_id];
|
||||
|
||||
for (auto it = func_unit_wf_list.begin(); it != func_unit_wf_list.end();) {
|
||||
if ((*it)->instructionBuffer.empty()) {
|
||||
it = func_unit_wf_list.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Schedule/Execute stage interface.
|
||||
*/
|
||||
ScheduleToExecute::ScheduleToExecute(const ComputeUnitParams *p)
|
||||
{
|
||||
int num_func_units = p->num_SIMDs + p->num_scalar_cores
|
||||
+ p->num_global_mem_pipes + p->num_shared_mem_pipes
|
||||
+ p->num_scalar_mem_pipes;
|
||||
_readyInsts.resize(num_func_units, nullptr);
|
||||
_dispatchStatus.resize(num_func_units, EMPTY);
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleToExecute::reset()
|
||||
{
|
||||
for (auto &func_unit_ready_inst : _readyInsts) {
|
||||
func_unit_ready_inst = nullptr;
|
||||
}
|
||||
|
||||
for (auto &func_unit_status : _dispatchStatus) {
|
||||
func_unit_status = EMPTY;
|
||||
}
|
||||
}
|
||||
|
||||
GPUDynInstPtr&
|
||||
ScheduleToExecute::readyInst(int func_unit_id)
|
||||
{
|
||||
return _readyInsts[func_unit_id];
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleToExecute::dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst,
|
||||
int func_unit_id,
|
||||
DISPATCH_STATUS disp_status)
|
||||
{
|
||||
_readyInsts[func_unit_id] = gpu_dyn_inst;
|
||||
_dispatchStatus[func_unit_id] = disp_status;
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleToExecute::dispatchTransition(int func_unit_id,
|
||||
DISPATCH_STATUS disp_status)
|
||||
{
|
||||
_readyInsts[func_unit_id] = nullptr;
|
||||
_dispatchStatus[func_unit_id] = disp_status;
|
||||
}
|
||||
|
||||
DISPATCH_STATUS
|
||||
ScheduleToExecute::dispatchStatus(int func_unit_id) const
|
||||
{
|
||||
return _dispatchStatus[func_unit_id];
|
||||
}
|
||||
123
src/gpu-compute/comm.hh
Normal file
123
src/gpu-compute/comm.hh
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Authors: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#ifndef __GPU_COMPUTE_COMM_HH__
|
||||
#define __GPU_COMPUTE_COMM_HH__
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
#include "gpu-compute/exec_stage.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
|
||||
struct ComputeUnitParams;
|
||||
class Wavefront;
|
||||
|
||||
class PipeStageIFace
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Reset the pipe stage interface. This is called to remove
|
||||
* any stale state from the pipe stage that is leftover from
|
||||
* the prior cycle. This is needed when stages do not actually
|
||||
* consume the information passed via the stage interfaces.
|
||||
*/
|
||||
virtual void reset() = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* Communication interface between ScoreboardCheck and Schedule stages.
|
||||
*/
|
||||
class ScoreboardCheckToSchedule : public PipeStageIFace
|
||||
{
|
||||
public:
|
||||
ScoreboardCheckToSchedule() = delete;
|
||||
ScoreboardCheckToSchedule(const ComputeUnitParams *p);
|
||||
void reset() override;
|
||||
/**
|
||||
* Mark the WF as ready for execution on a particular functional
|
||||
* unit.
|
||||
*/
|
||||
void markWFReady(Wavefront *wf, int func_unit_id);
|
||||
/**
|
||||
* Returns the number of ready lists (i.e., the number of functional
|
||||
* units). Each functional unit has its own list of ready WFs to
|
||||
* consider for arbitration.
|
||||
*/
|
||||
int numReadyLists() const;
|
||||
/**
|
||||
* TODO: These methods expose this class' implementation too much by
|
||||
* returning references to its internal data structures directly.
|
||||
* These are to support legacy functionality in the CU pipeline.
|
||||
* They should be removed eventually for an API that hides such
|
||||
* implementation details.
|
||||
*/
|
||||
std::vector<Wavefront*>& readyWFs(int func_unit_id);
|
||||
|
||||
// TODO: Leftover from old CU code, needs to go away.
|
||||
void updateReadyList(int func_unit_id);
|
||||
|
||||
private:
|
||||
std::vector<std::vector<Wavefront*>> _readyWFs;
|
||||
};
|
||||
|
||||
/**
|
||||
* Communication interface between Schedule and Execute stages.
|
||||
*/
|
||||
class ScheduleToExecute : public PipeStageIFace
|
||||
{
|
||||
public:
|
||||
ScheduleToExecute() = delete;
|
||||
ScheduleToExecute(const ComputeUnitParams *p);
|
||||
void reset() override;
|
||||
GPUDynInstPtr& readyInst(int func_unit_id);
|
||||
/**
|
||||
* Once the scheduler has chosen a winning WF for execution, and
|
||||
* after the WF's oldest instruction's operands have been read,
|
||||
* this method is used to mark the instruction as ready to execute.
|
||||
* This puts it on the dispatch list to be consumed by the execute
|
||||
* stage.
|
||||
*/
|
||||
void dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst,
|
||||
int func_unit_id, DISPATCH_STATUS disp_status);
|
||||
void dispatchTransition(int func_unit_id, DISPATCH_STATUS disp_status);
|
||||
DISPATCH_STATUS dispatchStatus(int func_unit_id) const;
|
||||
|
||||
private:
|
||||
std::vector<GPUDynInstPtr> _readyInsts;
|
||||
std::vector<DISPATCH_STATUS> _dispatchStatus;
|
||||
};
|
||||
|
||||
#endif // __GPU_COMPUTE_COMM_HH__
|
||||
@@ -68,9 +68,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
|
||||
coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
|
||||
registerManager(p->register_manager),
|
||||
fetchStage(p, *this),
|
||||
scoreboardCheckStage(p, *this),
|
||||
scheduleStage(p, *this),
|
||||
execStage(p, *this),
|
||||
scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
|
||||
scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
|
||||
execStage(p, *this, scheduleToExecute),
|
||||
globalMemoryPipe(p, *this),
|
||||
localMemoryPipe(p, *this),
|
||||
scalarMemoryPipe(p, *this),
|
||||
@@ -98,7 +98,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
|
||||
lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
|
||||
_cacheLineSize(p->system->cacheLineSize()),
|
||||
_numBarrierSlots(p->num_barrier_slots),
|
||||
globalSeqNum(0), wavefrontSize(p->wf_size)
|
||||
globalSeqNum(0), wavefrontSize(p->wf_size),
|
||||
scoreboardCheckToSchedule(p),
|
||||
scheduleToExecute(p)
|
||||
{
|
||||
/**
|
||||
* This check is necessary because std::bitset only provides conversion
|
||||
@@ -213,8 +215,6 @@ ComputeUnit::~ComputeUnit()
|
||||
lastVaddrSimd[j].clear();
|
||||
}
|
||||
lastVaddrCU.clear();
|
||||
readyList.clear();
|
||||
dispatchList.clear();
|
||||
delete cuExitCallback;
|
||||
delete ldsPort;
|
||||
}
|
||||
@@ -297,24 +297,6 @@ ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
|
||||
w->computeActualWgSz(task);
|
||||
}
|
||||
|
||||
// delete all wavefronts that have been marked as ready at SCB stage
|
||||
// but are found to have empty instruction buffers at SCH stage
|
||||
void
|
||||
ComputeUnit::updateReadyList(int unitId)
|
||||
{
|
||||
if (!readyList[unitId].empty()) {
|
||||
for (std::vector<Wavefront *>::iterator it = readyList[unitId].begin();
|
||||
it != readyList[unitId].end();) {
|
||||
if ((*it)->instructionBuffer.empty()) {
|
||||
it = readyList[unitId].erase(it);
|
||||
}
|
||||
else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
|
||||
HSAQueueEntry *task, int bar_id, bool fetchContext)
|
||||
@@ -786,15 +768,7 @@ ComputeUnit::init()
|
||||
vectorRegsReserved.resize(numVectorALUs, 0);
|
||||
scalarRegsReserved.resize(numVectorALUs, 0);
|
||||
|
||||
// Initializing pipeline resources
|
||||
readyList.resize(numExeUnits());
|
||||
|
||||
for (int j = 0; j < numExeUnits(); ++j) {
|
||||
dispatchList.push_back(std::make_pair(nullptr, EMPTY));
|
||||
}
|
||||
|
||||
fetchStage.init();
|
||||
scoreboardCheckStage.init();
|
||||
scheduleStage.init();
|
||||
execStage.init();
|
||||
globalMemoryPipe.init();
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#include "base/types.hh"
|
||||
#include "config/the_gpu_isa.hh"
|
||||
#include "enums/PrefetchType.hh"
|
||||
#include "gpu-compute/comm.hh"
|
||||
#include "gpu-compute/exec_stage.hh"
|
||||
#include "gpu-compute/fetch_stage.hh"
|
||||
#include "gpu-compute/global_memory_pipeline.hh"
|
||||
@@ -266,40 +267,6 @@ class ComputeUnit : public ClockedObject
|
||||
int numCyclesPerStoreTransfer; // number of cycles per vector store
|
||||
int numCyclesPerLoadTransfer; // number of cycles per vector load
|
||||
|
||||
// Buffers used to communicate between various pipeline stages
|
||||
|
||||
// At a high level, the following intra-/inter-stage communication occurs:
|
||||
// SCB to SCH: readyList provides per exec resource list of waves that
|
||||
// passed dependency and readiness checks. If selected by
|
||||
// scheduler, attempt to add wave to schList conditional on
|
||||
// RF support.
|
||||
// SCH: schList holds waves that are gathering operands or waiting
|
||||
// for execution resource availability. Once ready, waves are
|
||||
// placed on the dispatchList as candidates for execution. A wave
|
||||
// may spend multiple cycles in SCH stage, on the schList due to
|
||||
// RF access conflicts or execution resource contention.
|
||||
// SCH to EX: dispatchList holds waves that are ready to be executed.
|
||||
// LM/FLAT arbitration may remove an LM wave and place it
|
||||
// back on the schList. RF model may also force a wave back
|
||||
// to the schList if using the detailed model.
|
||||
|
||||
// List of waves which are ready to be scheduled.
|
||||
// Each execution resource has a ready list. readyList is
|
||||
// used to communicate between scoreboardCheck stage and
|
||||
// schedule stage
|
||||
std::vector<std::vector<Wavefront*>> readyList;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource. An EXREADY implies
|
||||
// dispatch list is non-empty and
|
||||
// execution unit has something to execute
|
||||
// this cycle. Currently, the dispatch list of
|
||||
// an execution resource can hold only one wave because
|
||||
// an execution resource can execute only one wave in a cycle.
|
||||
// dispatchList is used to communicate between schedule
|
||||
// and exec stage
|
||||
// TODO: convert std::pair to a class to increase readability
|
||||
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
|
||||
// track presence of dynamic instructions in the Schedule pipeline
|
||||
// stage. This is used to check the readiness of the oldest,
|
||||
// non-dispatched instruction of every WF in the Scoreboard stage.
|
||||
@@ -413,8 +380,6 @@ class ComputeUnit : public ClockedObject
|
||||
// number of available scalar registers per SIMD unit
|
||||
int numScalarRegsPerSimd;
|
||||
|
||||
void updateReadyList(int unitId);
|
||||
|
||||
// this hash map will keep track of page divergence
|
||||
// per memory instruction per wavefront. The hash map
|
||||
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
|
||||
@@ -1117,6 +1082,41 @@ class ComputeUnit : public ClockedObject
|
||||
InstSeqNum globalSeqNum;
|
||||
int wavefrontSize;
|
||||
|
||||
/**
|
||||
* TODO: Update these comments once the pipe stage interface has
|
||||
* been fully refactored.
|
||||
*
|
||||
* Pipeline stage interfaces.
|
||||
*
|
||||
* Buffers used to communicate between various pipeline stages
|
||||
* List of waves which will be dispatched to
|
||||
* each execution resource. An EXREADY implies
|
||||
* dispatch list is non-empty and
|
||||
* execution unit has something to execute
|
||||
* this cycle. Currently, the dispatch list of
|
||||
* an execution resource can hold only one wave because
|
||||
* an execution resource can execute only one wave in a cycle.
|
||||
* dispatchList is used to communicate between schedule
|
||||
* and exec stage
|
||||
*
|
||||
* At a high level, the following intra-/inter-stage communication occurs:
|
||||
* SCB to SCH: readyList provides per exec resource list of waves that
|
||||
* passed dependency and readiness checks. If selected by
|
||||
* scheduler, attempt to add wave to schList conditional on
|
||||
* RF support.
|
||||
* SCH: schList holds waves that are gathering operands or waiting
|
||||
* for execution resource availability. Once ready, waves are
|
||||
* placed on the dispatchList as candidates for execution. A wave
|
||||
* may spend multiple cycles in SCH stage, on the schList due to
|
||||
* RF access conflicts or execution resource contention.
|
||||
* SCH to EX: dispatchList holds waves that are ready to be executed.
|
||||
* LM/FLAT arbitration may remove an LM wave and place it
|
||||
* back on the schList. RF model may also force a wave back
|
||||
* to the schList if using the detailed model.
|
||||
*/
|
||||
ScoreboardCheckToSchedule scoreboardCheckToSchedule;
|
||||
ScheduleToExecute scheduleToExecute;
|
||||
|
||||
/**
|
||||
* The barrier slots for this CU.
|
||||
*/
|
||||
|
||||
@@ -41,8 +41,10 @@
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
|
||||
: computeUnit(cu), lastTimeInstExecuted(false),
|
||||
ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu,
|
||||
ScheduleToExecute &from_schedule)
|
||||
: computeUnit(cu), fromSchedule(from_schedule),
|
||||
lastTimeInstExecuted(false),
|
||||
thisTimeInstExecuted(false), instrExecuted (false),
|
||||
executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
|
||||
|
||||
@@ -54,7 +56,6 @@ ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
|
||||
void
|
||||
ExecStage::init()
|
||||
{
|
||||
dispatchList = &computeUnit.dispatchList;
|
||||
idle_dur = 0;
|
||||
}
|
||||
|
||||
@@ -128,14 +129,15 @@ ExecStage::dumpDispList()
|
||||
std::stringstream ss;
|
||||
bool empty = true;
|
||||
for (int i = 0; i < computeUnit.numExeUnits(); i++) {
|
||||
DISPATCH_STATUS s = dispatchList->at(i).second;
|
||||
DISPATCH_STATUS s = fromSchedule.dispatchStatus(i);
|
||||
ss << i << ": " << dispStatusToStr(s);
|
||||
if (s != EMPTY) {
|
||||
empty = false;
|
||||
Wavefront *w = dispatchList->at(i).first;
|
||||
ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
|
||||
ss << (w->instructionBuffer.front())->seqNum() << ": ";
|
||||
ss << (w->instructionBuffer.front())->disassemble();
|
||||
GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(i);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
ss << " SIMD[" << wf->simdId << "] WV[" << wf->wfDynId << "]: ";
|
||||
ss << (wf->instructionBuffer.front())->seqNum() << ": ";
|
||||
ss << (wf->instructionBuffer.front())->disassemble();
|
||||
}
|
||||
ss << "\n";
|
||||
}
|
||||
@@ -152,36 +154,41 @@ ExecStage::exec()
|
||||
dumpDispList();
|
||||
}
|
||||
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
|
||||
DISPATCH_STATUS s = dispatchList->at(unitId).second;
|
||||
DISPATCH_STATUS s = fromSchedule.dispatchStatus(unitId);
|
||||
switch (s) {
|
||||
case EMPTY:
|
||||
case EMPTY:
|
||||
// Do not execute if empty, waiting for VRF reads,
|
||||
// or LM tied to GM waiting for VRF reads
|
||||
collectStatistics(IdleExec, unitId);
|
||||
break;
|
||||
case EXREADY:
|
||||
{
|
||||
collectStatistics(BusyExec, unitId);
|
||||
Wavefront *w = dispatchList->at(unitId).first;
|
||||
DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
|
||||
unitId, w->simdId, w->wfDynId,
|
||||
(w->instructionBuffer.front())->disassemble());
|
||||
DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
|
||||
dispatchList->at(unitId).first->exec();
|
||||
(computeUnit.scheduleStage).deleteFromSch(w);
|
||||
dispatchList->at(unitId).second = EMPTY;
|
||||
dispatchList->at(unitId).first->freeResources();
|
||||
dispatchList->at(unitId).first = nullptr;
|
||||
break;
|
||||
}
|
||||
case SKIP:
|
||||
collectStatistics(BusyExec, unitId);
|
||||
DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
|
||||
dispatchList->at(unitId).second = EMPTY;
|
||||
dispatchList->at(unitId).first->freeResources();
|
||||
dispatchList->at(unitId).first = nullptr;
|
||||
break;
|
||||
default:
|
||||
case EXREADY:
|
||||
{
|
||||
collectStatistics(BusyExec, unitId);
|
||||
GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId);
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
|
||||
unitId, wf->simdId, wf->wfDynId,
|
||||
gpu_dyn_inst->disassemble());
|
||||
DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
|
||||
wf->exec();
|
||||
(computeUnit.scheduleStage).deleteFromSch(wf);
|
||||
fromSchedule.dispatchTransition(unitId, EMPTY);
|
||||
wf->freeResources();
|
||||
break;
|
||||
}
|
||||
case SKIP:
|
||||
{
|
||||
collectStatistics(BusyExec, unitId);
|
||||
GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId);
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
|
||||
fromSchedule.dispatchTransition(unitId, EMPTY);
|
||||
wf->freeResources();
|
||||
break;
|
||||
}
|
||||
default:
|
||||
panic("Unknown dispatch status in exec()\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,7 +42,9 @@
|
||||
#include "sim/stats.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
class ScheduleToExecute;
|
||||
class Wavefront;
|
||||
|
||||
struct ComputeUnitParams;
|
||||
|
||||
enum STAT_STATUS
|
||||
@@ -69,7 +71,8 @@ enum DISPATCH_STATUS
|
||||
class ExecStage
|
||||
{
|
||||
public:
|
||||
ExecStage(const ComputeUnitParams* p, ComputeUnit &cu);
|
||||
ExecStage(const ComputeUnitParams* p, ComputeUnit &cu,
|
||||
ScheduleToExecute &from_schedule);
|
||||
~ExecStage() { }
|
||||
void init();
|
||||
void exec();
|
||||
@@ -97,17 +100,8 @@ class ExecStage
|
||||
void collectStatistics(enum STAT_STATUS stage, int unitId);
|
||||
void initStatistics();
|
||||
ComputeUnit &computeUnit;
|
||||
ScheduleToExecute &fromSchedule;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource. A FILLED implies
|
||||
// dispatch list is non-empty and
|
||||
// execution unit has something to execute
|
||||
// this cycle. Currently, the dispatch list of
|
||||
// an execution resource can hold only one wave because
|
||||
// an execution resource can execute only one wave in a cycle.
|
||||
// dispatchList is used to communicate between schedule
|
||||
// and exec stage
|
||||
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
|
||||
bool lastTimeInstExecuted;
|
||||
bool thisTimeInstExecuted;
|
||||
bool instrExecuted;
|
||||
|
||||
@@ -43,8 +43,12 @@
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
|
||||
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
|
||||
ScoreboardCheckToSchedule &from_scoreboard_check,
|
||||
ScheduleToExecute &to_execute)
|
||||
: computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
|
||||
toExecute(to_execute),
|
||||
_name(cu.name() + ".ScheduleStage"),
|
||||
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
|
||||
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
|
||||
locMemBusRdy(false), locMemIssueRdy(false)
|
||||
@@ -70,14 +74,12 @@ void
|
||||
ScheduleStage::init()
|
||||
{
|
||||
|
||||
fatal_if(scheduler.size() != computeUnit.readyList.size(),
|
||||
fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
|
||||
"Scheduler should have same number of entries as CU's readyList");
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
scheduler[j].bindList(&computeUnit.readyList[j]);
|
||||
scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
|
||||
}
|
||||
|
||||
dispatchList = &computeUnit.dispatchList;
|
||||
|
||||
assert(computeUnit.numVectorGlobalMemUnits == 1);
|
||||
assert(computeUnit.numVectorSharedMemUnits == 1);
|
||||
}
|
||||
@@ -85,21 +87,21 @@ ScheduleStage::init()
|
||||
void
|
||||
ScheduleStage::exec()
|
||||
{
|
||||
toExecute.reset();
|
||||
|
||||
// Update readyList
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
// delete all ready wavefronts whose instruction buffers are now
|
||||
// empty because the last instruction was executed
|
||||
computeUnit.updateReadyList(j);
|
||||
/**
|
||||
* Remove any wave that already has an instruction present in SCH
|
||||
* waiting for RF reads to complete. This prevents out of order
|
||||
* execution within a wave.
|
||||
*/
|
||||
for (auto wIt = computeUnit.readyList.at(j).begin();
|
||||
wIt != computeUnit.readyList.at(j).end();) {
|
||||
fromScoreboardCheck.updateReadyList(j);
|
||||
for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
|
||||
wIt != fromScoreboardCheck.readyWFs(j).end();) {
|
||||
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
|
||||
*wIt = nullptr;
|
||||
wIt = computeUnit.readyList.at(j).erase(wIt);
|
||||
wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
|
||||
} else {
|
||||
wIt++;
|
||||
}
|
||||
@@ -115,7 +117,7 @@ ScheduleStage::exec()
|
||||
int firstMemUnit = computeUnit.firstMemUnit();
|
||||
int lastMemUnit = computeUnit.lastMemUnit();
|
||||
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
|
||||
int readyListSize = computeUnit.readyList[j].size();
|
||||
int readyListSize = fromScoreboardCheck.readyWFs(j).size();
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
@@ -125,11 +127,13 @@ ScheduleStage::exec()
|
||||
rdyListNotEmpty[j]++;
|
||||
|
||||
// Pick a wave and attempt to add it to schList
|
||||
Wavefront *w = scheduler[j].chooseWave();
|
||||
if (!addToSchList(j, w)) {
|
||||
Wavefront *wf = scheduler[j].chooseWave();
|
||||
GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
|
||||
assert(gpu_dyn_inst);
|
||||
if (!addToSchList(j, gpu_dyn_inst)) {
|
||||
// For waves not added to schList, increment count of cycles
|
||||
// this wave spends in SCH stage.
|
||||
w->schCycles++;
|
||||
wf->schCycles++;
|
||||
addToSchListStalls[j]++;
|
||||
}
|
||||
}
|
||||
@@ -140,7 +144,7 @@ ScheduleStage::exec()
|
||||
if (j >= firstMemUnit && j <= lastMemUnit) {
|
||||
continue;
|
||||
}
|
||||
int readyListSize = computeUnit.readyList[j].size();
|
||||
int readyListSize = fromScoreboardCheck.readyWFs(j).size();
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
@@ -150,11 +154,13 @@ ScheduleStage::exec()
|
||||
rdyListNotEmpty[j]++;
|
||||
|
||||
// Pick a wave and attempt to add it to schList
|
||||
Wavefront *w = scheduler[j].chooseWave();
|
||||
if (!addToSchList(j, w)) {
|
||||
Wavefront *wf = scheduler[j].chooseWave();
|
||||
GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
|
||||
assert(gpu_dyn_inst);
|
||||
if (!addToSchList(j, gpu_dyn_inst)) {
|
||||
// For waves not added to schList, increment count of cycles
|
||||
// this wave spends in SCH stage.
|
||||
w->schCycles++;
|
||||
wf->schCycles++;
|
||||
addToSchListStalls[j]++;
|
||||
}
|
||||
}
|
||||
@@ -191,30 +197,36 @@ ScheduleStage::exec()
|
||||
|
||||
void
|
||||
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
|
||||
Wavefront *w)
|
||||
const GPUDynInstPtr &gpu_dyn_inst)
|
||||
{
|
||||
dispatchList->at(unitId).first = w;
|
||||
dispatchList->at(unitId).second = s;
|
||||
toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
|
||||
{
|
||||
toExecute.dispatchTransition(unitId, s);
|
||||
}
|
||||
|
||||
bool
|
||||
ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
|
||||
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
|
||||
{
|
||||
GPUDynInstPtr ii = w->instructionBuffer.front();
|
||||
assert(ii);
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
bool accessVrfWr = true;
|
||||
if (!ii->isScalar()) {
|
||||
accessVrfWr =
|
||||
computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
accessVrfWr = computeUnit.vrf[wf->simdId]
|
||||
->canScheduleWriteOperands(wf, gpu_dyn_inst);
|
||||
}
|
||||
bool accessSrfWr =
|
||||
computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
|
||||
bool accessSrfWr = computeUnit.srf[wf->simdId]
|
||||
->canScheduleWriteOperands(wf, gpu_dyn_inst);
|
||||
bool accessRf = accessVrfWr && accessSrfWr;
|
||||
if (accessRf) {
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
|
||||
gpu_dyn_inst);
|
||||
}
|
||||
computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
|
||||
computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
|
||||
return true;
|
||||
} else {
|
||||
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
|
||||
@@ -226,8 +238,8 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
|
||||
}
|
||||
|
||||
// Increment stall counts for WF
|
||||
w->schStalls++;
|
||||
w->schRfAccessStalls++;
|
||||
wf->schStalls++;
|
||||
wf->schRfAccessStalls++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -236,18 +248,18 @@ void
|
||||
ScheduleStage::scheduleRfDestOperands()
|
||||
{
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
if (dispatchList->at(j).second == EMPTY ||
|
||||
dispatchList->at(j).second == SKIP) {
|
||||
if (toExecute.dispatchStatus(j) == EMPTY ||
|
||||
toExecute.dispatchStatus(j) == SKIP) {
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(dispatchList->at(j).first);
|
||||
|
||||
// get the wave on dispatch list and attempt to allocate write
|
||||
// resources in the RFs
|
||||
Wavefront *w = dispatchList->at(j).first;
|
||||
if (!schedRfWrites(j, w)) {
|
||||
reinsertToSchList(j, w);
|
||||
const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
if (!schedRfWrites(j, gpu_dyn_inst)) {
|
||||
reinsertToSchList(j, gpu_dyn_inst);
|
||||
doDispatchListTransition(j, EMPTY);
|
||||
// if this is a flat inst, also transition the LM pipe to empty
|
||||
// Note: since FLAT/LM arbitration occurs before scheduling
|
||||
@@ -255,51 +267,53 @@ ScheduleStage::scheduleRfDestOperands()
|
||||
// instruction lost arbitration, but would have been able to
|
||||
// pass the RF destination operand check here, and execute
|
||||
// instead of the FLAT.
|
||||
if (w->instructionBuffer.front()->isFlat()) {
|
||||
assert(dispatchList->at(w->localMem).second == SKIP);
|
||||
doDispatchListTransition(w->localMem, EMPTY);
|
||||
if (wf->instructionBuffer.front()->isFlat()) {
|
||||
assert(toExecute.dispatchStatus(wf->localMem)
|
||||
== SKIP);
|
||||
doDispatchListTransition(wf->localMem, EMPTY);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
ScheduleStage::addToSchList(int exeType, Wavefront *w)
|
||||
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
|
||||
{
|
||||
// Attempt to add the wave to the schList if the VRF can support the
|
||||
// wave's next instruction
|
||||
GPUDynInstPtr ii = w->instructionBuffer.front();
|
||||
assert(ii);
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
bool accessVrf = true;
|
||||
if (!ii->isScalar()) {
|
||||
accessVrf =
|
||||
computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
accessVrf = computeUnit.vrf[wf->simdId]
|
||||
->canScheduleReadOperands(wf, gpu_dyn_inst);
|
||||
}
|
||||
bool accessSrf =
|
||||
computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
|
||||
bool accessSrf = computeUnit.srf[wf->simdId]
|
||||
->canScheduleReadOperands(wf, gpu_dyn_inst);
|
||||
// If RFs can support instruction, add to schList in RFBUSY state,
|
||||
// place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
|
||||
// to the VRF
|
||||
bool accessRf = accessVrf && accessSrf;
|
||||
if (accessRf) {
|
||||
DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
|
||||
exeType, w->simdId, w->wfDynId,
|
||||
ii->seqNum(), ii->disassemble());
|
||||
exeType, wf->simdId, wf->wfDynId,
|
||||
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
|
||||
|
||||
computeUnit.insertInPipeMap(w);
|
||||
wavesInSch.emplace(w->wfDynId);
|
||||
schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
|
||||
if (w->isOldestInstWaitcnt()) {
|
||||
w->setStatus(Wavefront::S_WAITCNT);
|
||||
computeUnit.insertInPipeMap(wf);
|
||||
wavesInSch.emplace(wf->wfDynId);
|
||||
schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
|
||||
if (wf->isOldestInstWaitcnt()) {
|
||||
wf->setStatus(Wavefront::S_WAITCNT);
|
||||
}
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
computeUnit.vrf[wf->simdId]
|
||||
->scheduleReadOperands(wf, gpu_dyn_inst);
|
||||
}
|
||||
computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);
|
||||
computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
|
||||
|
||||
DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
|
||||
exeType, w->simdId, w->wfDynId,
|
||||
ii->seqNum(), ii->disassemble());
|
||||
exeType, wf->simdId, wf->wfDynId,
|
||||
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
|
||||
return true;
|
||||
} else {
|
||||
// Number of stall cycles due to RF access denied
|
||||
@@ -314,28 +328,30 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w)
|
||||
}
|
||||
|
||||
// Increment stall counts for WF
|
||||
w->schStalls++;
|
||||
w->schRfAccessStalls++;
|
||||
wf->schStalls++;
|
||||
wf->schRfAccessStalls++;
|
||||
DPRINTF(GPUSched, "schList[%d]: Could not add: "
|
||||
"SIMD[%d] WV[%d]: %d: %s\n",
|
||||
exeType, w->simdId, w->wfDynId,
|
||||
ii->seqNum(), ii->disassemble());
|
||||
exeType, wf->simdId, wf->wfDynId,
|
||||
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
|
||||
ScheduleStage::reinsertToSchList(int exeType,
|
||||
const GPUDynInstPtr &gpu_dyn_inst)
|
||||
{
|
||||
// Insert wave w into schList for specified exeType.
|
||||
// Wave is inserted in age order, with oldest wave being at the
|
||||
// front of the schList
|
||||
assert(gpu_dyn_inst);
|
||||
auto schIter = schList.at(exeType).begin();
|
||||
while (schIter != schList.at(exeType).end()
|
||||
&& schIter->first->wfDynId < w->wfDynId) {
|
||||
&& schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
|
||||
schIter++;
|
||||
}
|
||||
schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
|
||||
schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -377,46 +393,48 @@ ScheduleStage::checkMemResources()
|
||||
}
|
||||
|
||||
bool
|
||||
ScheduleStage::dispatchReady(Wavefront *w)
|
||||
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
|
||||
{
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
vectorAluRdy = false;
|
||||
scalarAluRdy = false;
|
||||
// check for available vector/scalar ALUs in the next cycle
|
||||
if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
|
||||
if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
|
||||
vectorAluRdy = true;
|
||||
}
|
||||
if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
|
||||
if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
|
||||
scalarAluRdy = true;
|
||||
}
|
||||
GPUDynInstPtr ii = w->instructionBuffer.front();
|
||||
|
||||
if (ii->isNop()) {
|
||||
if (gpu_dyn_inst->isNop()) {
|
||||
// S_NOP requires SALU. V_NOP requires VALU.
|
||||
// TODO: Scalar NOP does not require SALU in hardware,
|
||||
// and is executed out of IB directly.
|
||||
if (ii->isScalar() && !scalarAluRdy) {
|
||||
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
|
||||
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
return false;
|
||||
} else if (!ii->isScalar() && !vectorAluRdy) {
|
||||
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
|
||||
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
|
||||
return false;
|
||||
}
|
||||
} else if (ii->isEndOfKernel()) {
|
||||
} else if (gpu_dyn_inst->isEndOfKernel()) {
|
||||
// EndPgm instruction
|
||||
if (ii->isScalar() && !scalarAluRdy) {
|
||||
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
|
||||
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
return false;
|
||||
}
|
||||
} else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
|
||||
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
|
||||
|| gpu_dyn_inst->isALU()) {
|
||||
// Barrier, Branch, or ALU instruction
|
||||
if (ii->isScalar() && !scalarAluRdy) {
|
||||
if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
|
||||
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
|
||||
return false;
|
||||
} else if (!ii->isScalar() && !vectorAluRdy) {
|
||||
} else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
|
||||
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
|
||||
return false;
|
||||
}
|
||||
} else if (!ii->isScalar() && ii->isGlobalMem()) {
|
||||
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
|
||||
// Vector Global Memory instruction
|
||||
bool rdy = true;
|
||||
if (!glbMemIssueRdy) {
|
||||
@@ -427,18 +445,18 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
}
|
||||
} else if (ii->isScalar() && ii->isGlobalMem()) {
|
||||
} else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
|
||||
// Scalar Global Memory instruction
|
||||
bool rdy = true;
|
||||
if (!scalarMemIssueRdy) {
|
||||
@@ -449,16 +467,17 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.scalarMemoryPipe.
|
||||
isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
|
||||
w->scalarWrGmReqsInPipe)) {
|
||||
if (!computeUnit.scalarMemoryPipe
|
||||
.isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
|
||||
+ wf->scalarWrGmReqsInPipe))
|
||||
{
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
}
|
||||
} else if (!ii->isScalar() && ii->isLocalMem()) {
|
||||
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
|
||||
// Vector Local Memory instruction
|
||||
bool rdy = true;
|
||||
if (!locMemIssueRdy) {
|
||||
@@ -470,14 +489,14 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.localMemoryPipe.
|
||||
isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
|
||||
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
|
||||
}
|
||||
if (!rdy) {
|
||||
return false;
|
||||
}
|
||||
} else if (!ii->isScalar() && ii->isFlat()) {
|
||||
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
|
||||
// Vector Flat memory instruction
|
||||
bool rdy = true;
|
||||
if (!glbMemIssueRdy || !locMemIssueRdy) {
|
||||
@@ -488,16 +507,16 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
|
||||
if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
|
||||
}
|
||||
if (!computeUnit.localMemoryPipe.
|
||||
isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
|
||||
isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
|
||||
rdy = false;
|
||||
dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
|
||||
}
|
||||
@@ -505,7 +524,8 @@ ScheduleStage::dispatchReady(Wavefront *w)
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
panic("%s: unknown instr checked for readiness", ii->disassemble());
|
||||
panic("%s: unknown instr checked for readiness",
|
||||
gpu_dyn_inst->disassemble());
|
||||
return false;
|
||||
}
|
||||
dispNrdyStalls[SCH_RDY]++;
|
||||
@@ -519,7 +539,7 @@ ScheduleStage::fillDispatchList()
|
||||
checkMemResources();
|
||||
// iterate execution resources
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); j++) {
|
||||
assert(dispatchList->at(j).second == EMPTY);
|
||||
assert(toExecute.dispatchStatus(j) == EMPTY);
|
||||
|
||||
// iterate waves in schList to pick one for dispatch
|
||||
auto schIter = schList.at(j).begin();
|
||||
@@ -537,8 +557,7 @@ ScheduleStage::fillDispatchList()
|
||||
|
||||
// Acquire a coalescer token if it is a global mem
|
||||
// operation.
|
||||
GPUDynInstPtr mp = schIter->first->
|
||||
instructionBuffer.front();
|
||||
GPUDynInstPtr mp = schIter->first;
|
||||
if (!mp->isMemSync() && !mp->isScalar() &&
|
||||
(mp->isGlobalMem() || mp->isFlat())) {
|
||||
computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
|
||||
@@ -553,10 +572,10 @@ ScheduleStage::fillDispatchList()
|
||||
} else {
|
||||
// Either another wave has been dispatched, or this wave
|
||||
// was not ready, so it is stalled this cycle
|
||||
schIter->first->schStalls++;
|
||||
schIter->first->wavefront()->schStalls++;
|
||||
if (!dispRdy) {
|
||||
// not ready for dispatch, increment stall stat
|
||||
schIter->first->schResourceStalls++;
|
||||
schIter->first->wavefront()->schResourceStalls++;
|
||||
}
|
||||
// Examine next wave for this resource
|
||||
schIter++;
|
||||
@@ -589,28 +608,31 @@ ScheduleStage::arbitrateVrfToLdsBus()
|
||||
// get the GM pipe index in the dispatchList
|
||||
int gm_exe_unit = computeUnit.firstMemUnit() + i;
|
||||
// get the wave in the dispatchList
|
||||
Wavefront *w = dispatchList->at(gm_exe_unit).first;
|
||||
GPUDynInstPtr &gpu_dyn_inst
|
||||
= toExecute.readyInst(gm_exe_unit);
|
||||
// If the WF is valid, ready to execute, and the instruction
|
||||
// is a flat access, arbitrate with the WF's assigned LM pipe
|
||||
if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
|
||||
w->instructionBuffer.front()->isFlat()) {
|
||||
if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
|
||||
== EXREADY && gpu_dyn_inst->isFlat()) {
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
// If the associated LM pipe also has a wave selected, block
|
||||
// that wave and let the Flat instruction issue. The WF in the
|
||||
// LM pipe is added back to the schList for consideration next
|
||||
// cycle.
|
||||
if (dispatchList->at(w->localMem).second == EXREADY) {
|
||||
reinsertToSchList(w->localMem,
|
||||
dispatchList->at(w->localMem).first);
|
||||
if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
|
||||
reinsertToSchList(wf->localMem, toExecute
|
||||
.readyInst(wf->localMem));
|
||||
// Increment stall stats for LDS-VRF arbitration
|
||||
ldsBusArbStalls++;
|
||||
dispatchList->at(w->localMem).first->schLdsArbStalls++;
|
||||
toExecute.readyInst(wf->localMem)
|
||||
->wavefront()->schLdsArbStalls++;
|
||||
}
|
||||
// With arbitration of LM pipe complete, transition the
|
||||
// LM pipe to SKIP state in the dispatchList to inform EX stage
|
||||
// that a Flat instruction is executing next cycle
|
||||
doDispatchListTransition(w->localMem, SKIP, w);
|
||||
doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
|
||||
DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
|
||||
"EXREADY->SKIP\n", w->localMem);
|
||||
"EXREADY->SKIP\n", wf->localMem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -623,41 +645,41 @@ ScheduleStage::checkRfOperandReadComplete()
|
||||
// selection for dispatchList
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
for (auto &p : schList.at(j)) {
|
||||
Wavefront *w = p.first;
|
||||
assert(w);
|
||||
const GPUDynInstPtr &gpu_dyn_inst = p.first;
|
||||
assert(gpu_dyn_inst);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
|
||||
// Increment the number of cycles the wave spends in the
|
||||
// SCH stage, since this loop visits every wave in SCH.
|
||||
w->schCycles++;
|
||||
wf->schCycles++;
|
||||
|
||||
GPUDynInstPtr ii = w->instructionBuffer.front();
|
||||
bool vrfRdy = true;
|
||||
if (!ii->isScalar()) {
|
||||
vrfRdy =
|
||||
computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
vrfRdy = computeUnit.vrf[wf->simdId]
|
||||
->operandReadComplete(wf, gpu_dyn_inst);
|
||||
}
|
||||
bool srfRdy =
|
||||
computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
|
||||
bool srfRdy = computeUnit.srf[wf->simdId]
|
||||
->operandReadComplete(wf, gpu_dyn_inst);
|
||||
bool operandsReady = vrfRdy && srfRdy;
|
||||
if (operandsReady) {
|
||||
DPRINTF(GPUSched,
|
||||
"schList[%d]: WV[%d] operands ready for: %d: %s\n",
|
||||
j, w->wfDynId, ii->seqNum(), ii->disassemble());
|
||||
DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
|
||||
"%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
|
||||
gpu_dyn_inst->disassemble());
|
||||
DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
|
||||
j, w->wfDynId);
|
||||
j, wf->wfDynId);
|
||||
p.second = RFREADY;
|
||||
} else {
|
||||
DPRINTF(GPUSched,
|
||||
"schList[%d]: WV[%d] operands not ready for: %d: %s\n",
|
||||
j, w->wfDynId, ii->seqNum(), ii->disassemble());
|
||||
DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
|
||||
"for: %d: %s\n", j, wf->wfDynId,
|
||||
gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
|
||||
|
||||
// operands not ready yet, increment SCH stage stats
|
||||
// aggregate to all wavefronts on the CU
|
||||
p.second = RFBUSY;
|
||||
|
||||
// Increment stall stats
|
||||
w->schStalls++;
|
||||
w->schOpdNrdyStalls++;
|
||||
wf->schStalls++;
|
||||
wf->schOpdNrdyStalls++;
|
||||
|
||||
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
|
||||
if (!vrfRdy) {
|
||||
@@ -678,23 +700,21 @@ ScheduleStage::reserveResources()
|
||||
exeUnitReservations.resize(computeUnit.numExeUnits(), false);
|
||||
|
||||
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
|
||||
Wavefront *dispatchedWave = dispatchList->at(j).first;
|
||||
if (dispatchedWave) {
|
||||
DISPATCH_STATUS s = dispatchList->at(j).second;
|
||||
GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
|
||||
if (gpu_dyn_inst) {
|
||||
DISPATCH_STATUS s = toExecute.dispatchStatus(j);
|
||||
Wavefront *wf = gpu_dyn_inst->wavefront();
|
||||
if (s == EMPTY) {
|
||||
continue;
|
||||
} else if (s == EXREADY) {
|
||||
// Wave is ready for execution
|
||||
std::vector<int> execUnitIds =
|
||||
dispatchedWave->reserveResources();
|
||||
GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
|
||||
std::vector<int> execUnitIds = wf->reserveResources();
|
||||
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit.vrf[dispatchedWave->simdId]->
|
||||
dispatchInstruction(ii);
|
||||
if (!gpu_dyn_inst->isScalar()) {
|
||||
computeUnit.vrf[wf->simdId]
|
||||
->dispatchInstruction(gpu_dyn_inst);
|
||||
}
|
||||
computeUnit.srf[dispatchedWave->simdId]->
|
||||
dispatchInstruction(ii);
|
||||
computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
|
||||
|
||||
std::stringstream ss;
|
||||
for (auto id : execUnitIds) {
|
||||
@@ -702,16 +722,16 @@ ScheduleStage::reserveResources()
|
||||
}
|
||||
DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
|
||||
" Reserving ExeRes[ %s]\n",
|
||||
j, dispatchedWave->simdId, dispatchedWave->wfDynId,
|
||||
ii->seqNum(), ii->disassemble(), ss.str());
|
||||
j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
|
||||
gpu_dyn_inst->disassemble(), ss.str());
|
||||
// mark the resources as reserved for this cycle
|
||||
for (auto execUnitId : execUnitIds) {
|
||||
panic_if(exeUnitReservations.at(execUnitId),
|
||||
"Execution unit %d is reserved!!!\n"
|
||||
"SIMD[%d] WV[%d]: %d: %s",
|
||||
execUnitId, dispatchedWave->simdId,
|
||||
dispatchedWave->wfDynId,
|
||||
ii->seqNum(), ii->disassemble());
|
||||
execUnitId, wf->simdId, wf->wfDynId,
|
||||
gpu_dyn_inst->seqNum(),
|
||||
gpu_dyn_inst->disassemble());
|
||||
exeUnitReservations.at(execUnitId) = true;
|
||||
}
|
||||
|
||||
@@ -720,18 +740,20 @@ ScheduleStage::reserveResources()
|
||||
// that we've reserved a global and local memory unit. Thus,
|
||||
// we need to mark the latter execution unit as not available.
|
||||
if (execUnitIds.size() > 1) {
|
||||
int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
|
||||
assert(dispatchList->at(lm_exec_unit).second == SKIP);
|
||||
int lm_exec_unit M5_VAR_USED = wf->localMem;
|
||||
assert(toExecute.dispatchStatus(lm_exec_unit)
|
||||
== SKIP);
|
||||
}
|
||||
} else if (s == SKIP) {
|
||||
// Shared Memory pipe reserved for FLAT instruction.
|
||||
// Verify the GM pipe for this wave is ready to execute
|
||||
// and the wave in the GM pipe is the same as the wave
|
||||
// in the LM pipe
|
||||
int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
|
||||
assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
|
||||
dispatchedWave->wfDynId);
|
||||
assert(dispatchList->at(gm_exec_unit).second == EXREADY);
|
||||
int gm_exec_unit M5_VAR_USED = wf->globalMem;
|
||||
assert(wf->wfDynId == toExecute
|
||||
.readyInst(gm_exec_unit)->wfDynId);
|
||||
assert(toExecute.dispatchStatus(gm_exec_unit)
|
||||
== EXREADY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,8 +41,8 @@
|
||||
#include <vector>
|
||||
|
||||
#include "gpu-compute/exec_stage.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "gpu-compute/scheduler.hh"
|
||||
#include "gpu-compute/scoreboard_check_stage.hh"
|
||||
|
||||
// Schedule or execution arbitration stage.
|
||||
// From the pool of ready waves in the ready list,
|
||||
@@ -50,6 +50,8 @@
|
||||
// The selection is made based on a scheduling policy
|
||||
|
||||
class ComputeUnit;
|
||||
class ScheduleToExecute;
|
||||
class ScoreboardCheckToSchedule;
|
||||
class Wavefront;
|
||||
|
||||
struct ComputeUnitParams;
|
||||
@@ -57,7 +59,9 @@ struct ComputeUnitParams;
|
||||
class ScheduleStage
|
||||
{
|
||||
public:
|
||||
ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu);
|
||||
ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
|
||||
ScoreboardCheckToSchedule &from_scoreboard_check,
|
||||
ScheduleToExecute &to_execute);
|
||||
~ScheduleStage();
|
||||
void init();
|
||||
void exec();
|
||||
@@ -115,17 +119,13 @@ class ScheduleStage
|
||||
|
||||
private:
|
||||
ComputeUnit &computeUnit;
|
||||
ScoreboardCheckToSchedule &fromScoreboardCheck;
|
||||
ScheduleToExecute &toExecute;
|
||||
|
||||
// Each execution resource will have its own
|
||||
// scheduler and a dispatch list
|
||||
std::vector<Scheduler> scheduler;
|
||||
|
||||
// List of waves which will be dispatched to
|
||||
// each execution resource.
|
||||
// Currently, the dispatch list of
|
||||
// an execution resource can hold only one wave because
|
||||
// an execution resource can execute only one wave in a cycle.
|
||||
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
|
||||
|
||||
// Stats
|
||||
|
||||
// Number of cycles with empty (or not empty) readyList, per execution
|
||||
@@ -171,10 +171,10 @@ class ScheduleStage
|
||||
const std::string _name;
|
||||
|
||||
// called by exec() to add a wave to schList if the RFs can support it
|
||||
bool addToSchList(int exeType, Wavefront *w);
|
||||
bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
|
||||
// re-insert a wave to schList if wave lost arbitration
|
||||
// wave is inserted such that age order (oldest to youngest) is preserved
|
||||
void reinsertToSchList(int exeType, Wavefront *w);
|
||||
void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
|
||||
// check waves in schList to see if RF reads complete
|
||||
void checkRfOperandReadComplete();
|
||||
// check execution resources for readiness
|
||||
@@ -189,7 +189,7 @@ class ScheduleStage
|
||||
// check status of memory pipes and RF to Mem buses
|
||||
void checkMemResources();
|
||||
// resource ready check called by fillDispatchList
|
||||
bool dispatchReady(Wavefront *w);
|
||||
bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst);
|
||||
// pick waves from schList and populate dispatchList with one wave
|
||||
// per EXE resource type
|
||||
void fillDispatchList();
|
||||
@@ -199,12 +199,13 @@ class ScheduleStage
|
||||
// dispatchList
|
||||
void scheduleRfDestOperands();
|
||||
// invoked by scheduleRfDestOperands to schedule RF writes for a wave
|
||||
bool schedRfWrites(int exeType, Wavefront *w);
|
||||
bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
|
||||
// reserve resources for waves surviving arbitration in dispatchList
|
||||
void reserveResources();
|
||||
|
||||
void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
|
||||
Wavefront *w = nullptr);
|
||||
const GPUDynInstPtr &gpu_dyn_inst);
|
||||
void doDispatchListTransition(int unitId, DISPATCH_STATUS s);
|
||||
|
||||
// Set tracking wfDynId for each wave present in schedule stage
|
||||
// Used to allow only one instruction per wave in schedule
|
||||
@@ -219,7 +220,7 @@ class ScheduleStage
|
||||
// The maximum number of waves per resource can be determined by either
|
||||
// the VRF/SRF availability or limits imposed by paremeters (to be added)
|
||||
// of the SCH stage or CU.
|
||||
std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
|
||||
std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
|
||||
};
|
||||
|
||||
#endif // __SCHEDULE_STAGE_HH__
|
||||
|
||||
@@ -45,22 +45,16 @@
|
||||
#include "params/ComputeUnit.hh"
|
||||
|
||||
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p,
|
||||
ComputeUnit &cu)
|
||||
: computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage")
|
||||
ComputeUnit &cu,
|
||||
ScoreboardCheckToSchedule
|
||||
&to_schedule)
|
||||
: computeUnit(cu), toSchedule(to_schedule),
|
||||
_name(cu.name() + ".ScoreboardCheckStage")
|
||||
{
|
||||
}
|
||||
|
||||
ScoreboardCheckStage::~ScoreboardCheckStage()
|
||||
{
|
||||
readyList.clear();
|
||||
}
|
||||
|
||||
void
|
||||
ScoreboardCheckStage::init()
|
||||
{
|
||||
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
|
||||
readyList.push_back(&computeUnit.readyList[unitId]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -242,17 +236,13 @@ ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
|
||||
void
|
||||
ScoreboardCheckStage::exec()
|
||||
{
|
||||
// reset the ready list for all execution units; it will be
|
||||
// constructed every cycle since resource availability may change
|
||||
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
|
||||
// Reset wavefront pointers to nullptr so clear() on the vector
|
||||
// does not accidentally destruct the wavefront object
|
||||
for (int i = 0; i < readyList[unitId]->size(); i++) {
|
||||
readyList[unitId]->at(i) = nullptr;
|
||||
}
|
||||
readyList[unitId]->clear();
|
||||
}
|
||||
// iterate over all WF slots across all vector ALUs
|
||||
/**
|
||||
* Reset the ready list for all execution units; ready list will be
|
||||
* constructed every cycle because resource availability may change.
|
||||
*/
|
||||
toSchedule.reset();
|
||||
|
||||
// Iterate over all WF slots across all SIMDs.
|
||||
for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) {
|
||||
for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) {
|
||||
// reset the ready status of each wavefront
|
||||
@@ -269,7 +259,7 @@ ScoreboardCheckStage::exec()
|
||||
curWave->simdId, curWave->wfDynId,
|
||||
curWave->nextInstr()->seqNum(),
|
||||
curWave->nextInstr()->disassemble());
|
||||
readyList.at(exeResType)->push_back(curWave);
|
||||
toSchedule.markWFReady(curWave, exeResType);
|
||||
}
|
||||
collectStatistics(rdyStatus);
|
||||
}
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "sim/stats.hh"
|
||||
|
||||
class ComputeUnit;
|
||||
class ScoreboardCheckToSchedule;
|
||||
class Wavefront;
|
||||
|
||||
struct ComputeUnitParams;
|
||||
@@ -70,9 +71,9 @@ class ScoreboardCheckStage
|
||||
NRDY_CONDITIONS
|
||||
};
|
||||
|
||||
ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu);
|
||||
ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu,
|
||||
ScoreboardCheckToSchedule &to_schedule);
|
||||
~ScoreboardCheckStage();
|
||||
void init();
|
||||
void exec();
|
||||
|
||||
// Stats related variables and methods
|
||||
@@ -86,9 +87,12 @@ class ScoreboardCheckStage
|
||||
int *exeResType, int wfSlot);
|
||||
ComputeUnit &computeUnit;
|
||||
|
||||
// List of waves which are ready to be scheduled.
|
||||
// Each execution resource has a ready list
|
||||
std::vector<std::vector<Wavefront*>*> readyList;
|
||||
/**
|
||||
* Interface between scoreboard check and schedule stages. Each
|
||||
* cycle the scoreboard check stage populates this interface with
|
||||
* information needed by the schedule stage.
|
||||
*/
|
||||
ScoreboardCheckToSchedule &toSchedule;
|
||||
|
||||
// Stats
|
||||
Stats::Vector stallCycles;
|
||||
|
||||
Reference in New Issue
Block a user