gem5/src/gpu-compute/compute_unit.hh
vramadas95 dff879cf21 configs, gpu-compute: Add configurable L1 scalar latencies
Previously the scalar cache path used the same latency parameter as the
vector cache path for memory requests. This commit adds new parameters
for the scalar cache path latencies and modifies the model to use them
when setting the memory request latency in the scalar cache. The new
parameters are '--scalar-mem-req-latency' and '--scalar-mem-resp-latency',
with default values of 50 and 0, respectively.

Change-Id: I7483f780f2fc0cfbc320ed1fd0c2ee3e2dfc7af2
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65511
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
2022-11-12 02:23:02 +00:00
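A minimal sketch of how the new scalar latencies would plausibly be
consumed in compute_unit.cc, mirroring the existing vector-path
latencies. This is an assumption based on the flag names and the usual
gem5 parameter-to-tick conversion, not code copied from the change:

    // Hypothetical constructor-initializer excerpt; the parameter names
    // are inferred from the '--scalar-mem-req-latency' and
    // '--scalar-mem-resp-latency' flags, and each cycle count is scaled
    // by the CU's clock period to produce a Tick latency.
    ComputeUnit::ComputeUnit(const Params &p)
        : /* ... */
          req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
          resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
          scalar_req_tick_latency(
              p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
          scalar_resp_tick_latency(
              p.scalar_mem_resp_latency * p.clk_domain->clockPeriod())
          /* ... */
    {
    }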

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__
#include <deque>
#include <map>
#include <unordered_set>
#include <vector>
#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
namespace gem5
{
class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
struct ComputeUnitParams;
enum EXEC_POLICY
{
OLDEST = 0,
RR
};
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
TLB_MISS_CACHE_HIT,
TLB_HIT_CACHE_MISS,
TLB_HIT_CACHE_HIT
};
/**
* WF barrier slots. This represents the barrier resource for
* WF-level barriers (i.e., barriers to sync WFs within a WG).
*/
class WFBarrier
{
public:
WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
{
}
static const int InvalidID = -1;
int
numAtBarrier() const
{
return _numAtBarrier;
}
/**
* Number of WFs that have not yet reached the barrier.
*/
int
numYetToReachBarrier() const
{
return _maxBarrierCnt - _numAtBarrier;
}
int
maxBarrierCnt() const
{
return _maxBarrierCnt;
}
/**
* Set the maximum barrier count (i.e., the number of WFs that are
* participating in the barrier).
*/
void
setMaxBarrierCnt(int max_barrier_cnt)
{
_maxBarrierCnt = max_barrier_cnt;
}
/**
* Mark that a WF has reached the barrier.
*/
void
incNumAtBarrier()
{
assert(_numAtBarrier < _maxBarrierCnt);
++_numAtBarrier;
}
/**
* Have all WFs participating in this barrier reached the barrier?
* If so, then the barrier is satisfied and WFs may proceed past
* the barrier.
*/
bool
allAtBarrier() const
{
return _numAtBarrier == _maxBarrierCnt;
}
/**
* Decrement the number of WFs that are participating in this barrier.
* This should be called when a WF exits.
*/
void
decMaxBarrierCnt()
{
assert(_maxBarrierCnt > 0);
--_maxBarrierCnt;
}
/**
* Release this barrier resource so it can be used by other WGs. This
* is generally called when a WG has finished.
*/
void
release()
{
_numAtBarrier = 0;
_maxBarrierCnt = 0;
}
/**
* Reset the barrier, usually once a dynamic instance of the barrier
* has been satisfied.
*/
void
reset()
{
_numAtBarrier = 0;
}
private:
/**
* The number of WFs in the WG that have reached the barrier. Once
* the number of WFs that reach a barrier matches the number of WFs
* in the WG, the barrier is satisfied.
*/
int _numAtBarrier;
/**
* The maximum number of WFs that can reach this barrier. This is
* essentially the number of WFs in the WG, and a barrier is satisfied
* when the number of WFs that reach the barrier equal this value. If
* a WF exits early it must decrement this value so that it is no
* longer considered for this barrier.
*/
int _maxBarrierCnt;
};
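/**
 * Illustrative lifecycle sketch of a WFBarrier slot, using only the
 * methods declared above (hypothetical call sites; the real ones are
 * the barrier methods of ComputeUnit below):
 *
 *   WFBarrier bar;
 *   bar.setMaxBarrierCnt(wfs_in_wg);  // WG dispatch: N participating WFs
 *   bar.incNumAtBarrier();            // each WF arriving at the barrier
 *   if (bar.allAtBarrier())
 *       bar.reset();                  // dynamic barrier instance satisfied
 *   bar.decMaxBarrierCnt();           // a WF exits early
 *   bar.release();                    // WG done; slot reusable by other WGs
 */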
class ComputeUnit : public ClockedObject
{
public:
// Execution resources
//
// The ordering of units is:
// Vector ALUs
// Scalar ALUs
// GM Pipe
// LM Pipe
// Scalar Mem Pipe
//
// Note: the ordering of units is important and the code assumes the
// above ordering. However, there may be more than one resource of
// each type (e.g., 4 VALUs or 2 SALUs)
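//
// For example (hypothetical sizing): with 4 VALUs, 2 SALUs, and one
// unit each for the GM, LM, and scalar memory pipes, the global unit
// indices would be VALUs 0-3, SALUs 4-5, GM pipe 6, LM pipe 7, and
// scalar mem pipe 8; numExeUnits() would then be 9, firstMemUnit() 6,
// and lastMemUnit() 8.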
int numVectorGlobalMemUnits;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for Vector Register File->Global Memory pipe buses
WaitClass vrfToGlobalMemPipeBus;
// Resource control for Vector Global Memory execution unit
WaitClass vectorGlobalMemUnit;
int numVectorSharedMemUnits;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
// Resource control for Vector Register File->Local Memory pipe buses
WaitClass vrfToLocalMemPipeBus;
// Resource control for Vector Shared/Local Memory execution unit
WaitClass vectorSharedMemUnit;
int numScalarMemUnits;
// Resource control for scalar memory to SRF data/address bus
WaitClass scalarMemToSrfBus;
// Resource control for Scalar Register File->Scalar Memory pipe buses
WaitClass srfToScalarMemPipeBus;
// Resource control for Scalar Memory execution unit
WaitClass scalarMemUnit;
// vector ALU execution resources
int numVectorALUs;
std::vector<WaitClass> vectorALUs;
// scalar ALU execution resources
int numScalarALUs;
std::vector<WaitClass> scalarALUs;
// Return total number of execution units on this CU
int numExeUnits() const;
// index into readyList of the first memory unit
int firstMemUnit() const;
// index into readyList of the last memory unit
int lastMemUnit() const;
// index into scalarALUs vector of SALU used by the wavefront
int mapWaveToScalarAlu(Wavefront *w) const;
// index into readyList of SALU used by wavefront
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
// index into readyList of Global Memory unit used by wavefront
int mapWaveToGlobalMem(Wavefront *w) const;
// index into readyList of Local Memory unit used by wavefront
int mapWaveToLocalMem(Wavefront *w) const;
// index into readyList of Scalar Memory unit used by wavefront
int mapWaveToScalarMem(Wavefront *w) const;
int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
int numCyclesPerStoreTransfer; // number of cycles per vector store
int numCyclesPerLoadTransfer; // number of cycles per vector load
// track presence of dynamic instructions in the Schedule pipeline
// stage. This is used to check the readiness of the oldest,
// non-dispatched instruction of every WF in the Scoreboard stage.
std::unordered_set<uint64_t> pipeMap;
RegisterManager* registerManager;
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
ScalarMemPipeline scalarMemoryPipe;
EventFunctionWrapper tickEvent;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
// array of scalar register files, one per SIMD
std::vector<ScalarRegisterFile*> srf;
// Width per VALU/SIMD unit: number of work items that can be executed
// on the vector ALU simultaneously in a SIMD unit
int simdWidth;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of pipe stages for scalar ALU
int scalarPipeStages;
// number of pipe stages for operand collection & distribution network
int operandNetworkLength;
// number of cycles per instruction issue period
Cycles issuePeriod;
// VRF to GM Bus latency
Cycles vrf_gm_bus_latency;
// SRF to Scalar Mem Bus latency
Cycles srf_scm_bus_latency;
// VRF to LM Bus latency
Cycles vrf_lm_bus_latency;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
int prefetchDepth;
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
std::vector<Addr> lastVaddrCU;
std::vector<std::vector<Addr>> lastVaddrSimd;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
enums::PrefetchType prefetchType;
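// Illustrative example (a sketch of fixed-stride prefetching, not a
// statement of the exact policy): with prefetchDepth = 4 and
// prefetchStride = 1, a translation for page P would also prime the
// TLB for pages P+1 through P+4.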
EXEC_POLICY exec_policy;
bool debugSegFault;
// Idle CU timeout in ticks
Tick idleCUTimeout;
int idleWfs;
bool functionalTLB;
bool localMemBarrier;
/*
* for Counting page accesses
*/
bool countPages;
Shader *shader;
Tick req_tick_latency;
Tick resp_tick_latency;
Tick scalar_req_tick_latency;
Tick scalar_resp_tick_latency;
/**
* Number of WFs to schedule to each SIMD. This vector is populated
* by hasDispResources(), and consumed by the subsequent call to
* dispWorkgroup(), to schedule the specified number of WFs to the
* SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
*/
std::vector<int> numWfsToSched;
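// Hypothetical caller-side sketch of the dispatch handshake described
// above (the real call sites live in the dispatcher/shader code):
//
//   int num_wfs_in_wg = 0;
//   if (cu->hasDispResources(task, num_wfs_in_wg)) {
//       // numWfsToSched[i] now holds the WF count for SIMD i
//       cu->dispWorkgroup(task, num_wfs_in_wg);
//   }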
// number of currently reserved vector registers per SIMD unit
std::vector<int> vectorRegsReserved;
// number of currently reserved scalar registers per SIMD unit
std::vector<int> scalarRegsReserved;
// number of vector registers per SIMD unit
int numVecRegsPerSimd;
// number of available scalar registers per SIMD unit
int numScalarRegsPerSimd;
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
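// For example, a single vector memory instruction whose lanes fall on
// three distinct pages would add three entries here; the per-page
// counts feed updatePageDivergenceDist() and the pageDivergenceDist
// stat below.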
void insertInPipeMap(Wavefront *w);
void deleteFromPipeMap(Wavefront *w);
ComputeUnit(const Params &p);
~ComputeUnit();
// Timing Functions
int oprNetPipeLength() const { return operandNetworkLength; }
int simdUnitWidth() const { return simdWidth; }
int spBypassLength() const { return spBypassPipeLength; }
int dpBypassLength() const { return dpBypassPipeLength; }
int scalarPipeLength() const { return scalarPipeStages; }
int storeBusLength() const { return numCyclesPerStoreTransfer; }
int loadBusLength() const { return numCyclesPerLoadTransfer; }
int wfSize() const { return wavefrontSize; }
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
HSAQueueEntry *task, int bar_id,
bool fetchContext=false);
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
int cacheLineSize() const { return _cacheLineSize; }
int getCacheLineBits() const { return cacheLineBits; }
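// Example: with 64-byte cache lines, cacheLineBits is 6, so a
// request's line-aligned address can be formed as
// (addr >> cacheLineBits) << cacheLineBits. (Illustrative; the value
// is derived from _cacheLineSize.)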
void resetRegisterPool();
private:
WFBarrier&
barrierSlot(int bar_id)
{
assert(bar_id > WFBarrier::InvalidID);
return wfBarrierSlots.at(bar_id);
}
int
getFreeBarrierId()
{
assert(freeBarrierIds.size());
auto free_bar_id = freeBarrierIds.begin();
int bar_id = *free_bar_id;
freeBarrierIds.erase(free_bar_id);
return bar_id;
}
public:
int numYetToReachBarrier(int bar_id);
bool allAtBarrier(int bar_id);
void incNumAtBarrier(int bar_id);
int numAtBarrier(int bar_id);
int maxBarrierCnt(int bar_id);
void resetBarrier(int bar_id);
void decMaxBarrierCnt(int bar_id);
void releaseBarrier(int bar_id);
void releaseWFsFromBarrier(int bar_id);
int numBarrierSlots() const { return _numBarrierSlots; }
template<typename c0, typename c1>
void doSmReturn(GPUDynInstPtr gpuDynInst);
virtual void init() override;
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
bool kernelMemSync,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
void processFetchReturn(PacketPtr pkt);
void updatePageDivergenceDist(Addr addr);
RequestorID requestorId() { return _requestorId; }
RequestorID vramRequestorId();
bool isDone() const;
bool isVectorAluIdle(uint32_t simdId) const;
void handleSQCReturn(PacketPtr pkt);
protected:
RequestorID _requestorId;
LdsState &lds;
public:
LdsState &
getLds() const
{
return lds;
}
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
[[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);
typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
pageDataStruct pageAccesses;
void exitCallback();
class GMTokenPort : public TokenRequestPort
{
public:
GMTokenPort(const std::string& name, SimObject *owner,
PortID id = InvalidPortID)
: TokenRequestPort(name, owner, id)
{ }
~GMTokenPort() { }
protected:
bool recvTimingResp(PacketPtr) { return false; }
void recvReqRetry() { }
};
// Manager for the number of tokens available to this compute unit to
// send global memory request packets to the coalescer. This is only
// used between the global memory pipe and the TCP coalescer.
TokenManager *memPortTokens;
GMTokenPort gmTokenPort;
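// Illustrative flow (a sketch using the TokenRequestPort interface;
// the real checks live in the global memory pipeline):
//
//   if (gmTokenPort.haveTokens(1)) {
//       gmTokenPort.acquireTokens(1);
//       // ... issue the global memory request packet ...
//   }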
/** Data access Port **/
class DataPort : public RequestPort
{
public:
DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
: RequestPort(_name, _cu, id), computeUnit(_cu) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
GPUDynInstPtr _gpuDynInst;
PortID port_index;
Packet::SenderState *saved;
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst),
port_index(_port_index),
saved(sender_state) { }
};
class SystemHubEvent : public Event
{
DataPort *dataPort;
PacketPtr reqPkt;
public:
SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
: dataPort(_dataPort), reqPkt(pkt)
{
setFlags(Event::AutoDelete);
}
void
process()
{
// DMAs do not operate on packets and therefore do not
// convert to a response. Do that here instead.
reqPkt->makeResponse();
dataPort->handleResponse(reqPkt);
}
};
void processMemReqEvent(PacketPtr pkt);
EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
void processMemRespEvent(PacketPtr pkt);
EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
bool handleResponse(PacketPtr pkt);
protected:
ComputeUnit *computeUnit;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
// Scalar data cache access port
class ScalarDataPort : public RequestPort
{
public:
ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu)
{
}
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override;
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst), saved(sender_state)
{
}
GPUDynInstPtr _gpuDynInst;
Packet::SenderState *saved;
};
class MemReqEvent : public Event
{
private:
ScalarDataPort &scalarDataPort;
PacketPtr pkt;
public:
MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
: Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
class SystemHubEvent : public Event
{
ScalarDataPort *dataPort;
PacketPtr reqPkt;
public:
SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
: dataPort(_dataPort), reqPkt(pkt)
{
setFlags(Event::AutoDelete);
}
void
process()
{
// DMAs do not operate on packets and therefore do not
// convert to a response. Do that here instead.
reqPkt->makeResponse();
dataPort->handleResponse(reqPkt);
}
};
bool handleResponse(PacketPtr pkt);
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
};
// Instruction cache access port
class SQCPort : public RequestPort
{
public:
SQCPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu) { }
bool snoopRangeSent;
struct SenderState : public Packet::SenderState
{
Wavefront *wavefront;
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
protected:
ComputeUnit *computeUnit;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
virtual void
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
{
resp.clear();
snoop = true;
}
};
/** Data TLB port **/
class DTLBPort : public RequestPort
{
public:
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
: RequestPort(_name, _cu, id), computeUnit(_cu),
stalled(false)
{ }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// the memInst that this is associated with
GPUDynInstPtr _gpuDynInst;
// the lane in the memInst this is associated with, so we send
// the memory request down the right port
PortID portIndex;
// constructor used for packets involved in timing accesses
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
: _gpuDynInst(gpuDynInst), portIndex(port_index) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
class ScalarDTLBPort : public RequestPort
{
public:
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
{
}
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
GPUDynInstPtr _gpuDynInst;
};
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override { assert(false); }
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
bool stalled;
};
class ITLBPort : public RequestPort
{
public:
ITLBPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }
bool isStalled() { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the translation requests that were
* not successfully sent.
*/
std::deque<PacketPtr> retries;
/** SenderState is information carried along with the packet
* throughout the TLB hierarchy
*/
struct SenderState: public Packet::SenderState
{
// The wavefront associated with this request
Wavefront *wavefront;
SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
};
protected:
ComputeUnit *computeUnit;
bool stalled;
virtual bool recvTimingResp(PacketPtr pkt);
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry();
};
/**
* the port intended to communicate between the CU and its LDS
*/
class LDSPort : public RequestPort
{
public:
LDSPort(const std::string &_name, ComputeUnit *_cu)
: RequestPort(_name, _cu), computeUnit(_cu)
{
}
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
/**
* here we queue all the requests that were
* not successfully sent.
*/
std::queue<PacketPtr> retries;
/**
* SenderState is information carried along with the packet, esp. the
* GPUDynInstPtr
*/
class SenderState: public Packet::SenderState
{
protected:
// The actual read/write/atomic request that goes with this command
GPUDynInstPtr _gpuDynInst = nullptr;
public:
SenderState(GPUDynInstPtr gpuDynInst):
_gpuDynInst(gpuDynInst)
{
}
GPUDynInstPtr
getMemInst() const
{
return _gpuDynInst;
}
};
virtual bool
sendTimingReq(PacketPtr pkt);
protected:
bool stalled = false; ///< whether or not it is stalled
ComputeUnit *computeUnit;
virtual bool
recvTimingResp(PacketPtr pkt);
virtual Tick
recvAtomic(PacketPtr pkt) { return 0; }
virtual void
recvFunctional(PacketPtr pkt)
{
}
virtual void
recvRangeChange()
{
}
virtual void
recvReqRetry();
};
/** The port to access the Local Data Store
* Can be connected to a LDS object
*/
LDSPort ldsPort;
TokenManager *
getTokenManager()
{
return memPortTokens;
}
/** The memory port for SIMD data accesses.
* Can be connected to PhysMem or Ruby for timing simulations
*/
std::vector<DataPort> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort> tlbPort;
// port to the scalar data cache
ScalarDataPort scalarDataPort;
// port to the scalar data TLB
ScalarDTLBPort scalarDTLBPort;
// port to the SQC (i.e. the I-cache)
SQCPort sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
ITLBPort sqcTLBPort;
Port &
getPort(const std::string &if_name, PortID idx) override
{
if (if_name == "memory_port" && idx < memPort.size()) {
return memPort[idx];
} else if (if_name == "translation_port" && idx < tlbPort.size()) {
return tlbPort[idx];
} else if (if_name == "scalar_port") {
return scalarDataPort;
} else if (if_name == "scalar_tlb_port") {
return scalarDTLBPort;
} else if (if_name == "sqc_port") {
return sqcPort;
} else if (if_name == "sqc_tlb_port") {
return sqcTLBPort;
} else if (if_name == "ldsPort") {
return ldsPort;
} else if (if_name == "gmTokenPort") {
return gmTokenPort;
} else {
return ClockedObject::getPort(if_name, idx);
}
}
InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
private:
const int _cacheLineSize;
const int _numBarrierSlots;
int cacheLineBits;
InstSeqNum globalSeqNum;
int wavefrontSize;
/**
* TODO: Update these comments once the pipe stage interface has
* been fully refactored.
*
* Pipeline stage interfaces.
*
* Buffers used to communicate between various pipeline stages,
* including the list of waves to be dispatched to each execution
* resource. An EXREADY status implies the dispatch list is non-empty
* and the execution unit has something to execute this cycle.
* Currently, the dispatch list of an execution resource can hold only
* one wave, because an execution resource can execute only one wave
* per cycle. dispatchList is used to communicate between the schedule
* and exec stages.
*
* At a high level, the following intra-/inter-stage communication occurs:
* SCB to SCH: readyList provides per exec resource list of waves that
* passed dependency and readiness checks. If selected by
* scheduler, attempt to add wave to schList conditional on
* RF support.
* SCH: schList holds waves that are gathering operands or waiting
* for execution resource availability. Once ready, waves are
* placed on the dispatchList as candidates for execution. A wave
* may spend multiple cycles in SCH stage, on the schList due to
* RF access conflicts or execution resource contention.
* SCH to EX: dispatchList holds waves that are ready to be executed.
* LM/FLAT arbitration may remove an LM wave and place it
* back on the schList. RF model may also force a wave back
* to the schList if using the detailed model.
*/
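// Illustrative summary of the flow described above (sketch):
//   ScoreboardCheck --(readyList)--> Schedule --(schList)-->
//     --(dispatchList)--> Execute
// with LM/FLAT arbitration or detailed-RF conflicts able to bounce a
// wave from the dispatchList back onto the schList.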
ScoreboardCheckToSchedule scoreboardCheckToSchedule;
ScheduleToExecute scheduleToExecute;
/**
* The barrier slots for this CU.
*/
std::vector<WFBarrier> wfBarrierSlots;
/**
* A set used to easily retrieve a free barrier ID.
*/
std::unordered_set<int> freeBarrierIds;
// Holds the arrival time of the first cache block related to a
// particular GPUDynInst. This is used to calculate the difference
// between the first and last cache block arrival times.
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
public:
void updateInstStats(GPUDynInstPtr gpuDynInst);
int activeWaves;
struct ComputeUnitStats : public statistics::Group
{
ComputeUnitStats(statistics::Group *parent, int n_wf);
statistics::Scalar vALUInsts;
statistics::Formula vALUInstsPerWF;
statistics::Scalar sALUInsts;
statistics::Formula sALUInstsPerWF;
statistics::Scalar instCyclesVALU;
statistics::Scalar instCyclesSALU;
statistics::Scalar threadCyclesVALU;
statistics::Formula vALUUtilization;
statistics::Scalar ldsNoFlatInsts;
statistics::Formula ldsNoFlatInstsPerWF;
statistics::Scalar flatVMemInsts;
statistics::Formula flatVMemInstsPerWF;
statistics::Scalar flatLDSInsts;
statistics::Formula flatLDSInstsPerWF;
statistics::Scalar vectorMemWrites;
statistics::Formula vectorMemWritesPerWF;
statistics::Scalar vectorMemReads;
statistics::Formula vectorMemReadsPerWF;
statistics::Scalar scalarMemWrites;
statistics::Formula scalarMemWritesPerWF;
statistics::Scalar scalarMemReads;
statistics::Formula scalarMemReadsPerWF;
statistics::Formula vectorMemReadsPerKiloInst;
statistics::Formula vectorMemWritesPerKiloInst;
statistics::Formula vectorMemInstsPerKiloInst;
statistics::Formula scalarMemReadsPerKiloInst;
statistics::Formula scalarMemWritesPerKiloInst;
statistics::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
statistics::Vector instCyclesVMemPerSimd;
statistics::Vector instCyclesScMemPerSimd;
statistics::Vector instCyclesLdsPerSimd;
statistics::Scalar globalReads;
statistics::Scalar globalWrites;
statistics::Formula globalMemInsts;
statistics::Scalar argReads;
statistics::Scalar argWrites;
statistics::Formula argMemInsts;
statistics::Scalar spillReads;
statistics::Scalar spillWrites;
statistics::Formula spillMemInsts;
statistics::Scalar groupReads;
statistics::Scalar groupWrites;
statistics::Formula groupMemInsts;
statistics::Scalar privReads;
statistics::Scalar privWrites;
statistics::Formula privMemInsts;
statistics::Scalar readonlyReads;
statistics::Scalar readonlyWrites;
statistics::Formula readonlyMemInsts;
statistics::Scalar kernargReads;
statistics::Scalar kernargWrites;
statistics::Formula kernargMemInsts;
statistics::Distribution waveLevelParallelism;
// the following stats compute the avg. TLB access latency per
// uncoalesced request (only for data)
statistics::Scalar tlbRequests;
statistics::Scalar tlbCycles;
statistics::Formula tlbLatency;
// hitsPerTLBLevel[x] are the hits in Level x TLB.
// x = 0 is the page table.
statistics::Vector hitsPerTLBLevel;
statistics::Scalar ldsBankAccesses;
statistics::Distribution ldsBankConflictDist;
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
statistics::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
statistics::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
statistics::Scalar dynamicFlatMemInstrCnt;
statistics::Scalar dynamicLMemInstrCnt;
statistics::Scalar wgBlockedDueBarrierAllocation;
statistics::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
statistics::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
statistics::Distribution execRateDist;
// number of individual vector operations executed
statistics::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
statistics::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
statistics::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
statistics::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedFMA16;
statistics::Scalar numVecOpsExecutedFMA32;
statistics::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedMAC16;
statistics::Scalar numVecOpsExecutedMAC32;
statistics::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
statistics::Scalar numVecOpsExecutedMAD16;
statistics::Scalar numVecOpsExecutedMAD32;
statistics::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
statistics::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
statistics::Scalar totalCycles;
statistics::Formula vpc; // vector ops per cycle
statistics::Formula vpc_f16; // vector ops per cycle
statistics::Formula vpc_f32; // vector ops per cycle
statistics::Formula vpc_f64; // vector ops per cycle
statistics::Formula ipc; // vector instructions per cycle
statistics::Distribution controlFlowDivergenceDist;
statistics::Distribution activeLanesPerGMemInstrDist;
statistics::Distribution activeLanesPerLMemInstrDist;
// number of vector ALU instructions received
statistics::Formula numALUInstsExecuted;
// number of times a WG cannot start due to lack of free VGPRs in SIMDs
statistics::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG cannot start due to lack of free SGPRs in SIMDs
statistics::Scalar numTimesWgBlockedDueSgprAlloc;
statistics::Scalar numCASOps;
statistics::Scalar numFailedCASOps;
statistics::Scalar completedWfs;
statistics::Scalar completedWGs;
// distribution of the latency difference between the first and last
// cache block arrival ticks
statistics::Distribution headTailLatency;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number
// of instructions that have been executed on a SIMD between a WF
// executing two successive instructions.
statistics::VectorDistribution instInterleave;
} stats;
};
} // namespace gem5
#endif // __COMPUTE_UNIT_HH__