Previously the scalar cache path used the same latency parameter as the vector cache path for memory requests. This commit adds new parameters for the scalar cache path latencies. This commit also modifies the model to use the new latency parameter to set the memory request latency in the scalar cache. The new parameters are '--scalar-mem-req-latency' and '--scalar-mem-resp-latency', and they are set to default values of 50 and 0, respectively. Change-Id: I7483f780f2fc0cfbc320ed1fd0c2ee3e2dfc7af2 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65511 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Jason Lowe-Power <power.jg@gmail.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
1148 lines
37 KiB
C++
1148 lines
37 KiB
C++
/*
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef __COMPUTE_UNIT_HH__
|
|
#define __COMPUTE_UNIT_HH__
|
|
|
|
#include <deque>
|
|
#include <map>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#include "base/callback.hh"
|
|
#include "base/compiler.hh"
|
|
#include "base/statistics.hh"
|
|
#include "base/stats/group.hh"
|
|
#include "base/types.hh"
|
|
#include "config/the_gpu_isa.hh"
|
|
#include "enums/PrefetchType.hh"
|
|
#include "gpu-compute/comm.hh"
|
|
#include "gpu-compute/exec_stage.hh"
|
|
#include "gpu-compute/fetch_stage.hh"
|
|
#include "gpu-compute/global_memory_pipeline.hh"
|
|
#include "gpu-compute/hsa_queue_entry.hh"
|
|
#include "gpu-compute/local_memory_pipeline.hh"
|
|
#include "gpu-compute/register_manager.hh"
|
|
#include "gpu-compute/scalar_memory_pipeline.hh"
|
|
#include "gpu-compute/schedule_stage.hh"
|
|
#include "gpu-compute/scoreboard_check_stage.hh"
|
|
#include "mem/port.hh"
|
|
#include "mem/token_port.hh"
|
|
#include "sim/clocked_object.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
class HSAQueueEntry;
|
|
class LdsChunk;
|
|
class ScalarRegisterFile;
|
|
class Shader;
|
|
class VectorRegisterFile;
|
|
|
|
struct ComputeUnitParams;
|
|
|
|
// Wavefront selection policy used by the execution resources.
enum EXEC_POLICY
{
    OLDEST = 0, // always pick the oldest ready wavefront
    RR          // rotate round-robin among ready wavefronts
};
|
|
|
|
// Joint TLB/cache outcome classification for a memory access. The
// enumerators encode (TLB result, cache result) pairs.
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0, // missed in both TLB and cache
    TLB_MISS_CACHE_HIT,      // TLB miss, but the cache hit
    TLB_HIT_CACHE_MISS,      // TLB hit, but the cache missed
    TLB_HIT_CACHE_HIT        // hit in both TLB and cache
};
|
|
|
|
/**
 * WF barrier slots. This represents the barrier resource for
 * WF-level barriers (i.e., barriers to sync WFs within a WG).
 */
class WFBarrier
{
  public:
    WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0) { }

    // Sentinel for "no barrier assigned".
    static const int InvalidID = -1;

    /** Number of WFs that have arrived at this barrier so far. */
    int numAtBarrier() const { return _numAtBarrier; }

    /** Number of participating WFs that have not yet arrived. */
    int numYetToReachBarrier() const { return _maxBarrierCnt - _numAtBarrier; }

    /** Total number of WFs participating in this barrier. */
    int maxBarrierCnt() const { return _maxBarrierCnt; }

    /**
     * Set the maximum barrier count (i.e., the number of WFs that are
     * participating in the barrier).
     */
    void setMaxBarrierCnt(int max_barrier_cnt)
    {
        _maxBarrierCnt = max_barrier_cnt;
    }

    /** Mark that one more WF has reached the barrier. */
    void
    incNumAtBarrier()
    {
        assert(_numAtBarrier < _maxBarrierCnt);
        ++_numAtBarrier;
    }

    /**
     * Have all WFs participating in this barrier reached the barrier?
     * If so, the barrier is satisfied and WFs may proceed past it.
     */
    bool allAtBarrier() const { return _numAtBarrier == _maxBarrierCnt; }

    /**
     * Remove one WF from the set of participants. This should be called
     * when a WF exits early, so the barrier is not waiting on it.
     */
    void
    decMaxBarrierCnt()
    {
        assert(_maxBarrierCnt > 0);
        --_maxBarrierCnt;
    }

    /**
     * Release this barrier resource so it can be used by other WGs.
     * This is generally called when a WG has finished.
     */
    void
    release()
    {
        _numAtBarrier = 0;
        _maxBarrierCnt = 0;
    }

    /**
     * Re-arm the barrier, usually after a dynamic instance of the
     * barrier has been satisfied.
     */
    void reset() { _numAtBarrier = 0; }

  private:
    /**
     * WFs that have arrived so far. The barrier is satisfied once this
     * equals _maxBarrierCnt.
     */
    int _numAtBarrier;

    /**
     * Number of WFs participating in the barrier — essentially the number
     * of WFs in the WG. A WF that exits early must decrement this value
     * so that it is no longer considered for the barrier.
     */
    int _maxBarrierCnt;
};
|
|
|
|
class ComputeUnit : public ClockedObject
|
|
{
|
|
public:
|
|
|
|
|
|
// Execution resources
|
|
//
|
|
// The ordering of units is:
|
|
// Vector ALUs
|
|
// Scalar ALUs
|
|
// GM Pipe
|
|
// LM Pipe
|
|
// Scalar Mem Pipe
|
|
//
|
|
// Note: the ordering of units is important and the code assumes the
|
|
// above ordering. However, there may be more than one resource of
|
|
// each type (e.g., 4 VALUs or 2 SALUs)
|
|
|
|
int numVectorGlobalMemUnits;
|
|
// Resource control for global memory to VRF data/address bus
|
|
WaitClass glbMemToVrfBus;
|
|
// Resource control for Vector Register File->Global Memory pipe buses
|
|
WaitClass vrfToGlobalMemPipeBus;
|
|
// Resource control for Vector Global Memory execution unit
|
|
WaitClass vectorGlobalMemUnit;
|
|
|
|
int numVectorSharedMemUnits;
|
|
// Resource control for local memory to VRF data/address bus
|
|
WaitClass locMemToVrfBus;
|
|
// Resource control for Vector Register File->Local Memory pipe buses
|
|
WaitClass vrfToLocalMemPipeBus;
|
|
// Resource control for Vector Shared/Local Memory execution unit
|
|
WaitClass vectorSharedMemUnit;
|
|
|
|
int numScalarMemUnits;
|
|
// Resource control for scalar memory to SRF data/address bus
|
|
WaitClass scalarMemToSrfBus;
|
|
// Resource control for Scalar Register File->Scalar Memory pipe buses
|
|
WaitClass srfToScalarMemPipeBus;
|
|
// Resource control for Scalar Memory execution unit
|
|
WaitClass scalarMemUnit;
|
|
|
|
// vector ALU execution resources
|
|
int numVectorALUs;
|
|
std::vector<WaitClass> vectorALUs;
|
|
|
|
// scalar ALU execution resources
|
|
int numScalarALUs;
|
|
std::vector<WaitClass> scalarALUs;
|
|
|
|
// Return total number of execution units on this CU
|
|
int numExeUnits() const;
|
|
// index into readyList of the first memory unit
|
|
int firstMemUnit() const;
|
|
// index into readyList of the last memory unit
|
|
int lastMemUnit() const;
|
|
// index into scalarALUs vector of SALU used by the wavefront
|
|
int mapWaveToScalarAlu(Wavefront *w) const;
|
|
// index into readyList of SALU used by wavefront
|
|
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
|
|
// index into readyList of Global Memory unit used by wavefront
|
|
int mapWaveToGlobalMem(Wavefront *w) const;
|
|
// index into readyList of Local Memory unit used by wavefront
|
|
int mapWaveToLocalMem(Wavefront *w) const;
|
|
// index into readyList of Scalar Memory unit used by wavefront
|
|
int mapWaveToScalarMem(Wavefront *w) const;
|
|
|
|
int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
|
|
int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
|
|
int numCyclesPerStoreTransfer; // number of cycles per vector store
|
|
int numCyclesPerLoadTransfer; // number of cycles per vector load
|
|
|
|
// track presence of dynamic instructions in the Schedule pipeline
|
|
// stage. This is used to check the readiness of the oldest,
|
|
// non-dispatched instruction of every WF in the Scoreboard stage.
|
|
std::unordered_set<uint64_t> pipeMap;
|
|
|
|
RegisterManager* registerManager;
|
|
|
|
FetchStage fetchStage;
|
|
ScoreboardCheckStage scoreboardCheckStage;
|
|
ScheduleStage scheduleStage;
|
|
ExecStage execStage;
|
|
GlobalMemPipeline globalMemoryPipe;
|
|
LocalMemPipeline localMemoryPipe;
|
|
ScalarMemPipeline scalarMemoryPipe;
|
|
|
|
EventFunctionWrapper tickEvent;
|
|
|
|
typedef ComputeUnitParams Params;
|
|
std::vector<std::vector<Wavefront*>> wfList;
|
|
int cu_id;
|
|
|
|
// array of vector register files, one per SIMD
|
|
std::vector<VectorRegisterFile*> vrf;
|
|
// array of scalar register files, one per SIMD
|
|
std::vector<ScalarRegisterFile*> srf;
|
|
|
|
// Width per VALU/SIMD unit: number of work items that can be executed
|
|
// on the vector ALU simultaneously in a SIMD unit
|
|
int simdWidth;
|
|
// number of pipe stages for bypassing data to next dependent single
|
|
// precision vector instruction inside the vector ALU pipeline
|
|
int spBypassPipeLength;
|
|
// number of pipe stages for bypassing data to next dependent double
|
|
// precision vector instruction inside the vector ALU pipeline
|
|
int dpBypassPipeLength;
|
|
// number of pipe stages for scalar ALU
|
|
int scalarPipeStages;
|
|
// number of pipe stages for operand collection & distribution network
|
|
int operandNetworkLength;
|
|
// number of cycles per instruction issue period
|
|
Cycles issuePeriod;
|
|
|
|
// VRF to GM Bus latency
|
|
Cycles vrf_gm_bus_latency;
|
|
// SRF to Scalar Mem Bus latency
|
|
Cycles srf_scm_bus_latency;
|
|
// VRF to LM Bus latency
|
|
Cycles vrf_lm_bus_latency;
|
|
|
|
// tracks the last cycle a vector instruction was executed on a SIMD
|
|
std::vector<uint64_t> lastExecCycle;
|
|
|
|
// tracks the number of dyn inst executed per SIMD
|
|
std::vector<uint64_t> instExecPerSimd;
|
|
|
|
// true if we allow a separate TLB per lane
|
|
bool perLaneTLB;
|
|
// if 0, TLB prefetching is off.
|
|
int prefetchDepth;
|
|
// if fixed-stride prefetching, this is the stride.
|
|
int prefetchStride;
|
|
|
|
std::vector<Addr> lastVaddrCU;
|
|
std::vector<std::vector<Addr>> lastVaddrSimd;
|
|
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
|
|
enums::PrefetchType prefetchType;
|
|
EXEC_POLICY exec_policy;
|
|
|
|
bool debugSegFault;
|
|
// Idle CU timeout in ticks
|
|
Tick idleCUTimeout;
|
|
int idleWfs;
|
|
bool functionalTLB;
|
|
bool localMemBarrier;
|
|
|
|
/*
|
|
* for Counting page accesses
|
|
*/
|
|
bool countPages;
|
|
|
|
Shader *shader;
|
|
|
|
Tick req_tick_latency;
|
|
Tick resp_tick_latency;
|
|
Tick scalar_req_tick_latency;
|
|
Tick scalar_resp_tick_latency;
|
|
|
|
/**
|
|
* Number of WFs to schedule to each SIMD. This vector is populated
|
|
* by hasDispResources(), and consumed by the subsequent call to
|
|
* dispWorkgroup(), to schedule the specified number of WFs to the
|
|
* SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
|
|
*/
|
|
std::vector<int> numWfsToSched;
|
|
|
|
// number of currently reserved vector registers per SIMD unit
|
|
std::vector<int> vectorRegsReserved;
|
|
// number of currently reserved scalar registers per SIMD unit
|
|
std::vector<int> scalarRegsReserved;
|
|
// number of vector registers per SIMD unit
|
|
int numVecRegsPerSimd;
|
|
// number of available scalar registers per SIMD unit
|
|
int numScalarRegsPerSimd;
|
|
|
|
// this hash map will keep track of page divergence
|
|
// per memory instruction per wavefront. The hash map
|
|
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
|
|
std::map<Addr, int> pagesTouched;
|
|
|
|
void insertInPipeMap(Wavefront *w);
|
|
void deleteFromPipeMap(Wavefront *w);
|
|
|
|
ComputeUnit(const Params &p);
|
|
~ComputeUnit();
|
|
|
|
// Timing Functions
|
|
int oprNetPipeLength() const { return operandNetworkLength; }
|
|
int simdUnitWidth() const { return simdWidth; }
|
|
int spBypassLength() const { return spBypassPipeLength; }
|
|
int dpBypassLength() const { return dpBypassPipeLength; }
|
|
int scalarPipeLength() const { return scalarPipeStages; }
|
|
int storeBusLength() const { return numCyclesPerStoreTransfer; }
|
|
int loadBusLength() const { return numCyclesPerLoadTransfer; }
|
|
int wfSize() const { return wavefrontSize; }
|
|
|
|
void exec();
|
|
void initiateFetch(Wavefront *wavefront);
|
|
void fetch(PacketPtr pkt, Wavefront *wavefront);
|
|
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
|
|
|
|
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
|
|
HSAQueueEntry *task, int bar_id,
|
|
bool fetchContext=false);
|
|
|
|
void doInvalidate(RequestPtr req, int kernId);
|
|
void doFlush(GPUDynInstPtr gpuDynInst);
|
|
|
|
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
|
|
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
|
|
|
|
int cacheLineSize() const { return _cacheLineSize; }
|
|
int getCacheLineBits() const { return cacheLineBits; }
|
|
|
|
void resetRegisterPool();
|
|
|
|
private:
|
|
/** Fetch the barrier slot for a valid (non-negative) barrier ID. */
WFBarrier&
barrierSlot(int bar_id)
{
    assert(bar_id > WFBarrier::InvalidID);
    // at() keeps bounds checking for out-of-range IDs.
    return wfBarrierSlots.at(bar_id);
}
|
|
|
|
int
|
|
getFreeBarrierId()
|
|
{
|
|
assert(freeBarrierIds.size());
|
|
auto free_bar_id = freeBarrierIds.begin();
|
|
int bar_id = *free_bar_id;
|
|
freeBarrierIds.erase(free_bar_id);
|
|
return bar_id;
|
|
}
|
|
|
|
public:
|
|
int numYetToReachBarrier(int bar_id);
|
|
bool allAtBarrier(int bar_id);
|
|
void incNumAtBarrier(int bar_id);
|
|
int numAtBarrier(int bar_id);
|
|
int maxBarrierCnt(int bar_id);
|
|
void resetBarrier(int bar_id);
|
|
void decMaxBarrierCnt(int bar_id);
|
|
void releaseBarrier(int bar_id);
|
|
void releaseWFsFromBarrier(int bar_id);
|
|
int numBarrierSlots() const { return _numBarrierSlots; }
|
|
|
|
template<typename c0, typename c1>
|
|
void doSmReturn(GPUDynInstPtr gpuDynInst);
|
|
|
|
virtual void init() override;
|
|
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
|
|
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
|
|
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
|
|
bool kernelMemSync,
|
|
RequestPtr req=nullptr);
|
|
void handleMemPacket(PacketPtr pkt, int memport_index);
|
|
bool processTimingPacket(PacketPtr pkt);
|
|
void processFetchReturn(PacketPtr pkt);
|
|
void updatePageDivergenceDist(Addr addr);
|
|
|
|
RequestorID requestorId() { return _requestorId; }
|
|
RequestorID vramRequestorId();
|
|
|
|
bool isDone() const;
|
|
bool isVectorAluIdle(uint32_t simdId) const;
|
|
|
|
void handleSQCReturn(PacketPtr pkt);
|
|
|
|
protected:
|
|
RequestorID _requestorId;
|
|
|
|
LdsState &lds;
|
|
|
|
public:
|
|
/** Accessor for the LDS (local data share) attached to this CU. */
LdsState &
getLds() const
{
    return lds;
}
|
|
|
|
int32_t
|
|
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
|
|
|
|
[[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);
|
|
|
|
typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
|
|
pageDataStruct pageAccesses;
|
|
|
|
void exitCallback();
|
|
|
|
class GMTokenPort : public TokenRequestPort
|
|
{
|
|
public:
|
|
GMTokenPort(const std::string& name, SimObject *owner,
|
|
PortID id = InvalidPortID)
|
|
: TokenRequestPort(name, owner, id)
|
|
{ }
|
|
~GMTokenPort() { }
|
|
|
|
protected:
|
|
bool recvTimingResp(PacketPtr) { return false; }
|
|
void recvReqRetry() { }
|
|
};
|
|
|
|
// Manager for the number of tokens available to this compute unit to
|
|
// send global memory request packets to the coalescer this is only used
|
|
// between global memory pipe and TCP coalescer.
|
|
TokenManager *memPortTokens;
|
|
GMTokenPort gmTokenPort;
|
|
|
|
/** Data access Port **/
|
|
class DataPort : public RequestPort
|
|
{
|
|
public:
|
|
DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
|
|
: RequestPort(_name, _cu, id), computeUnit(_cu) { }
|
|
|
|
bool snoopRangeSent;
|
|
|
|
struct SenderState : public Packet::SenderState
|
|
{
|
|
GPUDynInstPtr _gpuDynInst;
|
|
PortID port_index;
|
|
Packet::SenderState *saved;
|
|
|
|
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
|
|
Packet::SenderState *sender_state=nullptr)
|
|
: _gpuDynInst(gpuDynInst),
|
|
port_index(_port_index),
|
|
saved(sender_state) { }
|
|
};
|
|
|
|
class SystemHubEvent : public Event
|
|
{
|
|
DataPort *dataPort;
|
|
PacketPtr reqPkt;
|
|
|
|
public:
|
|
SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
|
|
: dataPort(_dataPort), reqPkt(pkt)
|
|
{
|
|
setFlags(Event::AutoDelete);
|
|
}
|
|
|
|
void
|
|
process()
|
|
{
|
|
// DMAs do not operate on packets and therefore do not
|
|
// convert to a response. Do that here instead.
|
|
reqPkt->makeResponse();
|
|
dataPort->handleResponse(reqPkt);
|
|
}
|
|
};
|
|
|
|
void processMemReqEvent(PacketPtr pkt);
|
|
EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
|
|
|
|
void processMemRespEvent(PacketPtr pkt);
|
|
EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
|
|
|
|
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
|
|
|
|
bool handleResponse(PacketPtr pkt);
|
|
|
|
protected:
|
|
ComputeUnit *computeUnit;
|
|
|
|
virtual bool recvTimingResp(PacketPtr pkt);
|
|
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
|
virtual void recvFunctional(PacketPtr pkt) { }
|
|
virtual void recvRangeChange() { }
|
|
virtual void recvReqRetry();
|
|
|
|
virtual void
|
|
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
|
|
{
|
|
resp.clear();
|
|
snoop = true;
|
|
}
|
|
|
|
};
|
|
|
|
// Scalar data cache access port
|
|
class ScalarDataPort : public RequestPort
|
|
{
|
|
public:
|
|
ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
|
|
: RequestPort(_name, _cu), computeUnit(_cu)
|
|
{
|
|
}
|
|
|
|
bool recvTimingResp(PacketPtr pkt) override;
|
|
void recvReqRetry() override;
|
|
|
|
struct SenderState : public Packet::SenderState
|
|
{
|
|
SenderState(GPUDynInstPtr gpuDynInst,
|
|
Packet::SenderState *sender_state=nullptr)
|
|
: _gpuDynInst(gpuDynInst), saved(sender_state)
|
|
{
|
|
}
|
|
|
|
GPUDynInstPtr _gpuDynInst;
|
|
Packet::SenderState *saved;
|
|
};
|
|
|
|
class MemReqEvent : public Event
|
|
{
|
|
private:
|
|
ScalarDataPort &scalarDataPort;
|
|
PacketPtr pkt;
|
|
|
|
public:
|
|
MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
|
|
: Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
|
|
{
|
|
setFlags(Event::AutoDelete);
|
|
}
|
|
|
|
void process();
|
|
const char *description() const;
|
|
};
|
|
|
|
class SystemHubEvent : public Event
|
|
{
|
|
ScalarDataPort *dataPort;
|
|
PacketPtr reqPkt;
|
|
|
|
public:
|
|
SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
|
|
: dataPort(_dataPort), reqPkt(pkt)
|
|
{
|
|
setFlags(Event::AutoDelete);
|
|
}
|
|
|
|
void
|
|
process()
|
|
{
|
|
// DMAs do not operate on packets and therefore do not
|
|
// convert to a response. Do that here instead.
|
|
reqPkt->makeResponse();
|
|
dataPort->handleResponse(reqPkt);
|
|
}
|
|
};
|
|
|
|
bool handleResponse(PacketPtr pkt);
|
|
|
|
std::deque<PacketPtr> retries;
|
|
|
|
private:
|
|
ComputeUnit *computeUnit;
|
|
};
|
|
|
|
// Instruction cache access port
|
|
class SQCPort : public RequestPort
|
|
{
|
|
public:
|
|
SQCPort(const std::string &_name, ComputeUnit *_cu)
|
|
: RequestPort(_name, _cu), computeUnit(_cu) { }
|
|
|
|
bool snoopRangeSent;
|
|
|
|
struct SenderState : public Packet::SenderState
|
|
{
|
|
Wavefront *wavefront;
|
|
Packet::SenderState *saved;
|
|
// kernel id to be used in handling I-Cache invalidate response
|
|
int kernId;
|
|
|
|
SenderState(Wavefront *_wavefront, Packet::SenderState
|
|
*sender_state=nullptr, int _kernId=-1)
|
|
: wavefront(_wavefront), saved(sender_state),
|
|
kernId(_kernId){ }
|
|
};
|
|
|
|
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
|
|
|
|
protected:
|
|
ComputeUnit *computeUnit;
|
|
|
|
virtual bool recvTimingResp(PacketPtr pkt);
|
|
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
|
virtual void recvFunctional(PacketPtr pkt) { }
|
|
virtual void recvRangeChange() { }
|
|
virtual void recvReqRetry();
|
|
|
|
virtual void
|
|
getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
|
|
{
|
|
resp.clear();
|
|
snoop = true;
|
|
}
|
|
};
|
|
|
|
/** Data TLB port **/
|
|
class DTLBPort : public RequestPort
|
|
{
|
|
public:
|
|
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
|
|
: RequestPort(_name, _cu, id), computeUnit(_cu),
|
|
stalled(false)
|
|
{ }
|
|
|
|
bool isStalled() { return stalled; }
|
|
void stallPort() { stalled = true; }
|
|
void unstallPort() { stalled = false; }
|
|
|
|
/**
|
|
* here we queue all the translation requests that were
|
|
* not successfully sent.
|
|
*/
|
|
std::deque<PacketPtr> retries;
|
|
|
|
/** SenderState is information carried along with the packet
|
|
* throughout the TLB hierarchy
|
|
*/
|
|
struct SenderState: public Packet::SenderState
|
|
{
|
|
// the memInst that this is associated with
|
|
GPUDynInstPtr _gpuDynInst;
|
|
|
|
// the lane in the memInst this is associated with, so we send
|
|
// the memory request down the right port
|
|
PortID portIndex;
|
|
|
|
// constructor used for packets involved in timing accesses
|
|
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
|
|
: _gpuDynInst(gpuDynInst), portIndex(port_index) { }
|
|
|
|
};
|
|
|
|
protected:
|
|
ComputeUnit *computeUnit;
|
|
bool stalled;
|
|
|
|
virtual bool recvTimingResp(PacketPtr pkt);
|
|
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
|
virtual void recvFunctional(PacketPtr pkt) { }
|
|
virtual void recvRangeChange() { }
|
|
virtual void recvReqRetry();
|
|
};
|
|
|
|
class ScalarDTLBPort : public RequestPort
|
|
{
|
|
public:
|
|
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
|
|
: RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
|
|
{
|
|
}
|
|
|
|
struct SenderState : public Packet::SenderState
|
|
{
|
|
SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
|
|
GPUDynInstPtr _gpuDynInst;
|
|
};
|
|
|
|
bool recvTimingResp(PacketPtr pkt) override;
|
|
void recvReqRetry() override { assert(false); }
|
|
|
|
bool isStalled() const { return stalled; }
|
|
void stallPort() { stalled = true; }
|
|
void unstallPort() { stalled = false; }
|
|
|
|
std::deque<PacketPtr> retries;
|
|
|
|
private:
|
|
ComputeUnit *computeUnit;
|
|
bool stalled;
|
|
};
|
|
|
|
class ITLBPort : public RequestPort
|
|
{
|
|
public:
|
|
ITLBPort(const std::string &_name, ComputeUnit *_cu)
|
|
: RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }
|
|
|
|
|
|
bool isStalled() { return stalled; }
|
|
void stallPort() { stalled = true; }
|
|
void unstallPort() { stalled = false; }
|
|
|
|
/**
|
|
* here we queue all the translation requests that were
|
|
* not successfully sent.
|
|
*/
|
|
std::deque<PacketPtr> retries;
|
|
|
|
/** SenderState is information carried along with the packet
|
|
* throughout the TLB hierarchy
|
|
*/
|
|
struct SenderState: public Packet::SenderState
|
|
{
|
|
// The wavefront associated with this request
|
|
Wavefront *wavefront;
|
|
|
|
SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
|
|
};
|
|
|
|
protected:
|
|
ComputeUnit *computeUnit;
|
|
bool stalled;
|
|
|
|
virtual bool recvTimingResp(PacketPtr pkt);
|
|
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
|
|
virtual void recvFunctional(PacketPtr pkt) { }
|
|
virtual void recvRangeChange() { }
|
|
virtual void recvReqRetry();
|
|
};
|
|
|
|
/**
|
|
* the port intended to communicate between the CU and its LDS
|
|
*/
|
|
class LDSPort : public RequestPort
|
|
{
|
|
public:
|
|
LDSPort(const std::string &_name, ComputeUnit *_cu)
|
|
: RequestPort(_name, _cu), computeUnit(_cu)
|
|
{
|
|
}
|
|
|
|
bool isStalled() const { return stalled; }
|
|
void stallPort() { stalled = true; }
|
|
void unstallPort() { stalled = false; }
|
|
|
|
/**
|
|
* here we queue all the requests that were
|
|
* not successfully sent.
|
|
*/
|
|
std::queue<PacketPtr> retries;
|
|
|
|
/**
|
|
* SenderState is information carried along with the packet, esp. the
|
|
* GPUDynInstPtr
|
|
*/
|
|
class SenderState: public Packet::SenderState
|
|
{
|
|
protected:
|
|
// The actual read/write/atomic request that goes with this command
|
|
GPUDynInstPtr _gpuDynInst = nullptr;
|
|
|
|
public:
|
|
SenderState(GPUDynInstPtr gpuDynInst):
|
|
_gpuDynInst(gpuDynInst)
|
|
{
|
|
}
|
|
|
|
GPUDynInstPtr
|
|
getMemInst() const
|
|
{
|
|
return _gpuDynInst;
|
|
}
|
|
};
|
|
|
|
virtual bool
|
|
sendTimingReq(PacketPtr pkt);
|
|
|
|
protected:
|
|
|
|
bool stalled = false; ///< whether or not it is stalled
|
|
|
|
ComputeUnit *computeUnit;
|
|
|
|
virtual bool
|
|
recvTimingResp(PacketPtr pkt);
|
|
|
|
virtual Tick
|
|
recvAtomic(PacketPtr pkt) { return 0; }
|
|
|
|
virtual void
|
|
recvFunctional(PacketPtr pkt)
|
|
{
|
|
}
|
|
|
|
virtual void
|
|
recvRangeChange()
|
|
{
|
|
}
|
|
|
|
virtual void
|
|
recvReqRetry();
|
|
};
|
|
|
|
/** The port to access the Local Data Store
|
|
* Can be connected to a LDS object
|
|
*/
|
|
LDSPort ldsPort;
|
|
|
|
/** Accessor for the token manager that throttles memory-port traffic. */
TokenManager *
getTokenManager()
{
    return memPortTokens;
}
|
|
|
|
/** The memory port for SIMD data accesses.
|
|
* Can be connected to PhysMem for Ruby for timing simulations
|
|
*/
|
|
std::vector<DataPort> memPort;
|
|
// port to the TLB hierarchy (i.e., the L1 TLB)
|
|
std::vector<DTLBPort> tlbPort;
|
|
// port to the scalar data cache
|
|
ScalarDataPort scalarDataPort;
|
|
// port to the scalar data TLB
|
|
ScalarDTLBPort scalarDTLBPort;
|
|
// port to the SQC (i.e. the I-cache)
|
|
SQCPort sqcPort;
|
|
// port to the SQC TLB (there's a separate TLB for each I-cache)
|
|
ITLBPort sqcTLBPort;
|
|
|
|
/**
 * Resolve a named port of this CU. Vectored ports ("memory_port",
 * "translation_port") also validate the requested index; anything
 * unrecognized is delegated to the base class.
 */
Port &
getPort(const std::string &if_name, PortID idx) override
{
    if (if_name == "memory_port" && idx < memPort.size()) {
        return memPort[idx];
    } else if (if_name == "translation_port" && idx < tlbPort.size()) {
        return tlbPort[idx];
    } else if (if_name == "scalar_port") {
        return scalarDataPort;
    } else if (if_name == "scalar_tlb_port") {
        return scalarDTLBPort;
    } else if (if_name == "sqc_port") {
        return sqcPort;
    } else if (if_name == "sqc_tlb_port") {
        return sqcTLBPort;
    } else if (if_name == "ldsPort") {
        return ldsPort;
    } else if (if_name == "gmTokenPort") {
        return gmTokenPort;
    } else {
        return ClockedObject::getPort(if_name, idx);
    }
}
|
|
|
|
InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
|
|
|
|
private:
|
|
const int _cacheLineSize;
|
|
const int _numBarrierSlots;
|
|
int cacheLineBits;
|
|
InstSeqNum globalSeqNum;
|
|
int wavefrontSize;
|
|
|
|
/**
|
|
* TODO: Update these comments once the pipe stage interface has
|
|
* been fully refactored.
|
|
*
|
|
* Pipeline stage interfaces.
|
|
*
|
|
* Buffers used to communicate between various pipeline stages
|
|
* List of waves which will be dispatched to
|
|
* each execution resource. An EXREADY implies
|
|
* dispatch list is non-empty and
|
|
* execution unit has something to execute
|
|
* this cycle. Currently, the dispatch list of
|
|
* an execution resource can hold only one wave because
|
|
* an execution resource can execute only one wave in a cycle.
|
|
* dispatchList is used to communicate between schedule
|
|
* and exec stage
|
|
*
|
|
* At a high level, the following intra-/inter-stage communication occurs:
|
|
* SCB to SCH: readyList provides per exec resource list of waves that
|
|
* passed dependency and readiness checks. If selected by
|
|
* scheduler, attempt to add wave to schList conditional on
|
|
* RF support.
|
|
* SCH: schList holds waves that are gathering operands or waiting
|
|
* for execution resource availability. Once ready, waves are
|
|
* placed on the dispatchList as candidates for execution. A wave
|
|
* may spend multiple cycles in SCH stage, on the schList due to
|
|
* RF access conflicts or execution resource contention.
|
|
* SCH to EX: dispatchList holds waves that are ready to be executed.
|
|
* LM/FLAT arbitration may remove an LM wave and place it
|
|
* back on the schList. RF model may also force a wave back
|
|
* to the schList if using the detailed model.
|
|
*/
|
|
ScoreboardCheckToSchedule scoreboardCheckToSchedule;
|
|
ScheduleToExecute scheduleToExecute;
|
|
|
|
/**
|
|
* The barrier slots for this CU.
|
|
*/
|
|
std::vector<WFBarrier> wfBarrierSlots;
|
|
/**
|
|
* A set used to easily retrieve a free barrier ID.
|
|
*/
|
|
std::unordered_set<int> freeBarrierIds;
|
|
|
|
// hold the time of the arrival of the first cache block related to
|
|
// a particular GPUDynInst. This is used to calculate the difference
|
|
// between the first and last chace block arrival times.
|
|
std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
|
|
|
|
public:
|
|
void updateInstStats(GPUDynInstPtr gpuDynInst);
|
|
int activeWaves;
|
|
|
|
struct ComputeUnitStats : public statistics::Group
|
|
{
|
|
ComputeUnitStats(statistics::Group *parent, int n_wf);
|
|
|
|
statistics::Scalar vALUInsts;
|
|
statistics::Formula vALUInstsPerWF;
|
|
statistics::Scalar sALUInsts;
|
|
statistics::Formula sALUInstsPerWF;
|
|
statistics::Scalar instCyclesVALU;
|
|
statistics::Scalar instCyclesSALU;
|
|
statistics::Scalar threadCyclesVALU;
|
|
statistics::Formula vALUUtilization;
|
|
statistics::Scalar ldsNoFlatInsts;
|
|
statistics::Formula ldsNoFlatInstsPerWF;
|
|
statistics::Scalar flatVMemInsts;
|
|
statistics::Formula flatVMemInstsPerWF;
|
|
statistics::Scalar flatLDSInsts;
|
|
statistics::Formula flatLDSInstsPerWF;
|
|
statistics::Scalar vectorMemWrites;
|
|
statistics::Formula vectorMemWritesPerWF;
|
|
statistics::Scalar vectorMemReads;
|
|
statistics::Formula vectorMemReadsPerWF;
|
|
statistics::Scalar scalarMemWrites;
|
|
statistics::Formula scalarMemWritesPerWF;
|
|
statistics::Scalar scalarMemReads;
|
|
statistics::Formula scalarMemReadsPerWF;
|
|
|
|
statistics::Formula vectorMemReadsPerKiloInst;
|
|
statistics::Formula vectorMemWritesPerKiloInst;
|
|
statistics::Formula vectorMemInstsPerKiloInst;
|
|
statistics::Formula scalarMemReadsPerKiloInst;
|
|
statistics::Formula scalarMemWritesPerKiloInst;
|
|
statistics::Formula scalarMemInstsPerKiloInst;
|
|
|
|
// Cycles required to send register source (addr and data) from
|
|
// register files to memory pipeline, per SIMD.
|
|
statistics::Vector instCyclesVMemPerSimd;
|
|
statistics::Vector instCyclesScMemPerSimd;
|
|
statistics::Vector instCyclesLdsPerSimd;
|
|
|
|
statistics::Scalar globalReads;
|
|
statistics::Scalar globalWrites;
|
|
statistics::Formula globalMemInsts;
|
|
statistics::Scalar argReads;
|
|
statistics::Scalar argWrites;
|
|
statistics::Formula argMemInsts;
|
|
statistics::Scalar spillReads;
|
|
statistics::Scalar spillWrites;
|
|
statistics::Formula spillMemInsts;
|
|
statistics::Scalar groupReads;
|
|
statistics::Scalar groupWrites;
|
|
statistics::Formula groupMemInsts;
|
|
statistics::Scalar privReads;
|
|
statistics::Scalar privWrites;
|
|
statistics::Formula privMemInsts;
|
|
statistics::Scalar readonlyReads;
|
|
statistics::Scalar readonlyWrites;
|
|
statistics::Formula readonlyMemInsts;
|
|
statistics::Scalar kernargReads;
|
|
statistics::Scalar kernargWrites;
|
|
statistics::Formula kernargMemInsts;
|
|
|
|
statistics::Distribution waveLevelParallelism;
|
|
|
|
// the following stats compute the avg. TLB accesslatency per
|
|
// uncoalesced request (only for data)
|
|
statistics::Scalar tlbRequests;
|
|
statistics::Scalar tlbCycles;
|
|
statistics::Formula tlbLatency;
|
|
// hitsPerTLBLevel[x] are the hits in Level x TLB.
|
|
// x = 0 is the page table.
|
|
statistics::Vector hitsPerTLBLevel;
|
|
|
|
statistics::Scalar ldsBankAccesses;
|
|
statistics::Distribution ldsBankConflictDist;
|
|
|
|
// over all memory instructions executed over all wavefronts
|
|
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
|
|
statistics::Distribution pageDivergenceDist;
|
|
// count of non-flat global memory vector instructions executed
|
|
statistics::Scalar dynamicGMemInstrCnt;
|
|
// count of flat global memory vector instructions executed
|
|
statistics::Scalar dynamicFlatMemInstrCnt;
|
|
statistics::Scalar dynamicLMemInstrCnt;
|
|
|
|
statistics::Scalar wgBlockedDueBarrierAllocation;
|
|
statistics::Scalar wgBlockedDueLdsAllocation;
|
|
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
|
|
// active when the instruction is committed, this number is still
|
|
// incremented by 1
|
|
statistics::Scalar numInstrExecuted;
|
|
// Number of cycles among successive instruction executions across all
|
|
// wavefronts of the same CU
|
|
statistics::Distribution execRateDist;
|
|
// number of individual vector operations executed
|
|
statistics::Scalar numVecOpsExecuted;
|
|
// number of individual f16 vector operations executed
|
|
statistics::Scalar numVecOpsExecutedF16;
|
|
// number of individual f32 vector operations executed
|
|
statistics::Scalar numVecOpsExecutedF32;
|
|
// number of individual f64 vector operations executed
|
|
statistics::Scalar numVecOpsExecutedF64;
|
|
// number of individual FMA 16,32,64 vector operations executed
|
|
statistics::Scalar numVecOpsExecutedFMA16;
|
|
statistics::Scalar numVecOpsExecutedFMA32;
|
|
statistics::Scalar numVecOpsExecutedFMA64;
|
|
// number of individual MAC 16,32,64 vector operations executed
|
|
statistics::Scalar numVecOpsExecutedMAC16;
|
|
statistics::Scalar numVecOpsExecutedMAC32;
|
|
statistics::Scalar numVecOpsExecutedMAC64;
|
|
// number of individual MAD 16,32,64 vector operations executed
|
|
statistics::Scalar numVecOpsExecutedMAD16;
|
|
statistics::Scalar numVecOpsExecutedMAD32;
|
|
statistics::Scalar numVecOpsExecutedMAD64;
|
|
// total number of two op FP vector operations executed
|
|
statistics::Scalar numVecOpsExecutedTwoOpFP;
|
|
// Total cycles that something is running on the GPU
|
|
statistics::Scalar totalCycles;
|
|
statistics::Formula vpc; // vector ops per cycle
|
|
statistics::Formula vpc_f16; // vector ops per cycle
|
|
statistics::Formula vpc_f32; // vector ops per cycle
|
|
statistics::Formula vpc_f64; // vector ops per cycle
|
|
statistics::Formula ipc; // vector instructions per cycle
|
|
statistics::Distribution controlFlowDivergenceDist;
|
|
statistics::Distribution activeLanesPerGMemInstrDist;
|
|
statistics::Distribution activeLanesPerLMemInstrDist;
|
|
// number of vector ALU instructions received
|
|
statistics::Formula numALUInstsExecuted;
|
|
// number of times a WG cannot start due to lack of free VGPRs in SIMDs
|
|
statistics::Scalar numTimesWgBlockedDueVgprAlloc;
|
|
// number of times a WG cannot start due to lack of free SGPRs in SIMDs
|
|
statistics::Scalar numTimesWgBlockedDueSgprAlloc;
|
|
statistics::Scalar numCASOps;
|
|
statistics::Scalar numFailedCASOps;
|
|
statistics::Scalar completedWfs;
|
|
statistics::Scalar completedWGs;
|
|
|
|
// distrubtion in latency difference between first and last cache block
|
|
// arrival ticks
|
|
statistics::Distribution headTailLatency;
|
|
|
|
// Track the amount of interleaving between wavefronts on each SIMD.
|
|
// This stat is sampled using instExecPerSimd to compute the number
|
|
// of instructions that have been executed on a SIMD between a WF
|
|
// executing two successive instructions.
|
|
statistics::VectorDistribution instInterleave;
|
|
} stats;
|
|
};
|
|
|
|
} // namespace gem5
|
|
|
|
#endif // __COMPUTE_UNIT_HH__
|