// Change-Id: Ib129e82be5348c641a8ae18093324bcedfb38abe
// Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29939
// Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
// Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
// Tested-by: kokoro <noreply+kokoro@google.com>
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__

#include <cstdint>
#include <deque>
#include <iostream>
#include <list>
#include <map>
#include <sstream>
#include <unordered_map>
#include <vector>

#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
#include "mem/ruby/protocol/SequencerRequestType.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "mem/token_port.hh"

class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;

class RubyGPUCoalescerParams;

// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
class UncoalescedTable
|
|
{
|
|
public:
|
|
UncoalescedTable(GPUCoalescer *gc);
|
|
~UncoalescedTable() {}
|
|
|
|
void insertPacket(PacketPtr pkt);
|
|
bool packetAvailable();
|
|
void printRequestTable(std::stringstream& ss);
|
|
|
|
// Returns a pointer to the list of packets corresponding to an
|
|
// instruction in the instruction map or nullptr if there are no
|
|
// instructions at the offset.
|
|
PerInstPackets* getInstPackets(int offset);
|
|
void updateResources();
|
|
bool areRequestsDone(const uint64_t instSeqNum);
|
|
|
|
// Check if a packet hasn't been removed from instMap in too long.
|
|
// Panics if a deadlock is detected and returns nothing otherwise.
|
|
void checkDeadlock(Tick threshold);
|
|
|
|
private:
|
|
GPUCoalescer *coalescer;
|
|
|
|
// Maps an instructions unique sequence number to a queue of packets
|
|
// which need responses. This data structure assumes the sequence number
|
|
// is monotonically increasing (which is true for CU class) in order to
|
|
// issue packets in age order.
|
|
std::map<uint64_t, PerInstPackets> instMap;
|
|
};
|
|
|
|
class CoalescedRequest
|
|
{
|
|
public:
|
|
CoalescedRequest(uint64_t _seqNum)
|
|
: seqNum(_seqNum), issueTime(Cycles(0)),
|
|
rubyType(RubyRequestType_NULL)
|
|
{}
|
|
~CoalescedRequest() {}
|
|
|
|
void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
|
|
void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
|
|
void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
|
|
void setRubyType(RubyRequestType type) { rubyType = type; }
|
|
|
|
uint64_t getSeqNum() const { return seqNum; }
|
|
PacketPtr getFirstPkt() const { return pkts[0]; }
|
|
Cycles getIssueTime() const { return issueTime; }
|
|
RubyRequestType getRubyType() const { return rubyType; }
|
|
std::vector<PacketPtr>& getPackets() { return pkts; }
|
|
|
|
private:
|
|
uint64_t seqNum;
|
|
Cycles issueTime;
|
|
RubyRequestType rubyType;
|
|
std::vector<PacketPtr> pkts;
|
|
};
|
|
|
|
// PendingWriteInst tracks the number of outstanding Ruby requests
|
|
// per write instruction. Once all requests associated with one instruction
|
|
// are completely done in Ruby, we call back the requester to mark
|
|
// that this instruction is complete.
|
|
class PendingWriteInst
|
|
{
|
|
public:
|
|
PendingWriteInst()
|
|
: numPendingStores(0),
|
|
originalPort(nullptr),
|
|
gpuDynInstPtr(nullptr)
|
|
{}
|
|
|
|
~PendingWriteInst()
|
|
{}
|
|
|
|
void
|
|
addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
|
|
bool usingRubyTester)
|
|
{
|
|
assert(port);
|
|
originalPort = port;
|
|
|
|
if (!usingRubyTester) {
|
|
gpuDynInstPtr = inst;
|
|
}
|
|
|
|
numPendingStores++;
|
|
}
|
|
|
|
// return true if no more ack is expected
|
|
bool
|
|
receiveWriteCompleteAck()
|
|
{
|
|
assert(numPendingStores > 0);
|
|
numPendingStores--;
|
|
return (numPendingStores == 0) ? true : false;
|
|
}
|
|
|
|
// ack the original requester that this write instruction is complete
|
|
void
|
|
ackWriteCompletion(bool usingRubyTester)
|
|
{
|
|
assert(numPendingStores == 0);
|
|
|
|
// make a response packet
|
|
PacketPtr pkt = new Packet(std::make_shared<Request>(),
|
|
MemCmd::WriteCompleteResp);
|
|
|
|
if (!usingRubyTester) {
|
|
assert(gpuDynInstPtr);
|
|
ComputeUnit::DataPort::SenderState* ss =
|
|
new ComputeUnit::DataPort::SenderState
|
|
(gpuDynInstPtr, 0, nullptr);
|
|
pkt->senderState = ss;
|
|
}
|
|
|
|
// send the ack response to the requester
|
|
originalPort->sendTimingResp(pkt);
|
|
}
|
|
|
|
int
|
|
getNumPendingStores() {
|
|
return numPendingStores;
|
|
}
|
|
|
|
private:
|
|
// the number of stores waiting for writeCompleteCallback
|
|
int numPendingStores;
|
|
// The original port that sent one of packets associated with this
|
|
// write instruction. We may have more than one packet per instruction,
|
|
// which implies multiple ports per instruction. However, we need
|
|
// only 1 of the ports to call back the CU. Therefore, here we keep
|
|
// track the port that sent the first packet of this instruction.
|
|
RubyPort::MemSlavePort* originalPort;
|
|
// similar to the originalPort, this gpuDynInstPtr is set only for
|
|
// the first packet of this instruction.
|
|
GPUDynInstPtr gpuDynInstPtr;
|
|
};
|
|
|
|
class GPUCoalescer : public RubyPort
|
|
{
|
|
public:
|
|
class GMTokenPort : public TokenSlavePort
|
|
{
|
|
public:
|
|
GMTokenPort(const std::string& name, ClockedObject *owner,
|
|
PortID id = InvalidPortID)
|
|
: TokenSlavePort(name, owner, id)
|
|
{ }
|
|
~GMTokenPort() { }
|
|
|
|
protected:
|
|
Tick recvAtomic(PacketPtr) { return Tick(0); }
|
|
void recvFunctional(PacketPtr) { }
|
|
bool recvTimingReq(PacketPtr) { return false; }
|
|
AddrRangeList getAddrRanges() const
|
|
{
|
|
AddrRangeList ranges;
|
|
return ranges;
|
|
}
|
|
};
|
|
|
|
typedef RubyGPUCoalescerParams Params;
|
|
GPUCoalescer(const Params *);
|
|
~GPUCoalescer();
|
|
|
|
Port &getPort(const std::string &if_name,
|
|
PortID idx = InvalidPortID) override;
|
|
|
|
// Public Methods
|
|
void wakeup(); // Used only for deadlock detection
|
|
void printRequestTable(std::stringstream& ss);
|
|
|
|
void printProgress(std::ostream& out) const;
|
|
void resetStats() override;
|
|
void collateStats();
|
|
void regStats() override;
|
|
|
|
// each store request needs two callbacks:
|
|
// (1) writeCallback is called when the store is received and processed
|
|
// by TCP. This writeCallback does not guarantee the store is actually
|
|
// completed at its destination cache or memory. writeCallback helps
|
|
// release hardware resources (e.g., its entry in coalescedTable)
|
|
// allocated for the store so that subsequent requests will not be
|
|
// blocked unnecessarily due to hardware resource constraints.
|
|
// (2) writeCompleteCallback is called when the store is fully completed
|
|
// at its destination cache or memory. writeCompleteCallback
|
|
// guarantees that the store is fully completed. This callback
|
|
// will decrement hardware counters in CU
|
|
void writeCallback(Addr address, DataBlock& data);
|
|
|
|
void writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data);
|
|
|
|
void writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion);
|
|
|
|
void writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime);
|
|
|
|
void writeCompleteCallback(Addr address,
|
|
uint64_t instSeqNum,
|
|
MachineType mach);
|
|
|
|
void readCallback(Addr address, DataBlock& data);
|
|
|
|
void readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data);
|
|
|
|
void readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime);
|
|
|
|
void readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion);
|
|
|
|
/* atomics need their own callback because the data
|
|
might be const coming from SLICC */
|
|
virtual void atomicCallback(Addr address,
|
|
MachineType mach,
|
|
const DataBlock& data);
|
|
|
|
RequestStatus makeRequest(PacketPtr pkt) override;
|
|
int outstandingCount() const override { return m_outstanding_count; }
|
|
|
|
bool
|
|
isDeadlockEventScheduled() const override
|
|
{
|
|
return deadlockCheckEvent.scheduled();
|
|
}
|
|
|
|
void
|
|
descheduleDeadlockEvent() override
|
|
{
|
|
deschedule(deadlockCheckEvent);
|
|
}
|
|
|
|
bool empty() const;
|
|
|
|
void print(std::ostream& out) const;
|
|
|
|
void evictionCallback(Addr address);
|
|
void completeIssue();
|
|
|
|
void insertKernel(int wavefront_id, PacketPtr pkt);
|
|
|
|
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
|
|
|
|
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
|
|
|
|
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
|
|
Stats::Histogram& getTypeLatencyHist(uint32_t t)
|
|
{ return *m_typeLatencyHist[t]; }
|
|
|
|
Stats::Histogram& getMissLatencyHist()
|
|
{ return m_missLatencyHist; }
|
|
Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
|
|
{ return *m_missTypeLatencyHist[t]; }
|
|
|
|
Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
|
|
{ return *m_missMachLatencyHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
|
|
{ return *m_missTypeMachLatencyHist[r][t]; }
|
|
|
|
Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
|
|
{ return *m_IssueToInitialDelayHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getInitialToForwardDelayHist(const MachineType t) const
|
|
{ return *m_InitialToForwardDelayHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getForwardRequestToFirstResponseHist(const MachineType t) const
|
|
{ return *m_ForwardToFirstResponseDelayHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getFirstResponseToCompletionDelayHist(const MachineType t) const
|
|
{ return *m_FirstResponseToCompletionDelayHist[t]; }
|
|
|
|
protected:
|
|
bool tryCacheAccess(Addr addr, RubyRequestType type,
|
|
Addr pc, RubyAccessMode access_mode,
|
|
int size, DataBlock*& data_ptr);
|
|
|
|
// since the two following issue functions are protocol-specific,
|
|
// they must be implemented in a derived coalescer
|
|
virtual void issueRequest(CoalescedRequest* crequest) = 0;
|
|
virtual void issueMemSyncRequest(PacketPtr pkt) {}
|
|
|
|
void kernelCallback(int wavefront_id);
|
|
|
|
void hitCallback(CoalescedRequest* crequest,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
bool success,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion);
|
|
void recordMissLatency(CoalescedRequest* crequest,
|
|
MachineType mach,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool success, bool isRegion);
|
|
void completeHitCallback(std::vector<PacketPtr> & mylist);
|
|
|
|
virtual RubyRequestType getRequestType(PacketPtr pkt);
|
|
|
|
// Attempt to remove a packet from the uncoalescedTable and coalesce
|
|
// with a previous request from the same instruction. If there is no
|
|
// previous instruction and the max number of outstanding requests has
|
|
// not be reached, a new coalesced request is created and added to the
|
|
// "target" list of the coalescedTable.
|
|
bool coalescePacket(PacketPtr pkt);
|
|
|
|
EventFunctionWrapper issueEvent;
|
|
|
|
protected:
|
|
int m_max_outstanding_requests;
|
|
Cycles m_deadlock_threshold;
|
|
|
|
CacheMemory* m_dataCache_ptr;
|
|
CacheMemory* m_instCache_ptr;
|
|
|
|
// coalescingWindow is the maximum number of instructions that are
|
|
// allowed to be coalesced in a single cycle.
|
|
int coalescingWindow;
|
|
|
|
// The uncoalescedTable contains several "columns" which hold memory
|
|
// request packets for an instruction. The maximum size is the number of
|
|
// columns * the wavefront size.
|
|
UncoalescedTable uncoalescedTable;
|
|
|
|
// An MSHR-like struct for holding coalesced requests. The requests in
|
|
// this table may or may not be outstanding in the memory hierarchy. The
|
|
// maximum size is equal to the maximum outstanding requests for a CU
|
|
// (typically the number of blocks in TCP). If there are duplicates of
|
|
// an address, the are serviced in age order.
|
|
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
|
|
|
|
// a map btw an instruction sequence number and PendingWriteInst
|
|
// this is used to do a final call back for each write when it is
|
|
// completely done in the memory system
|
|
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
|
|
|
|
// Global outstanding request count, across all request tables
|
|
int m_outstanding_count;
|
|
bool m_deadlock_check_scheduled;
|
|
std::unordered_map<int, PacketPtr> kernelEndList;
|
|
std::vector<int> newKernelEnds;
|
|
|
|
int m_store_waiting_on_load_cycles;
|
|
int m_store_waiting_on_store_cycles;
|
|
int m_load_waiting_on_store_cycles;
|
|
int m_load_waiting_on_load_cycles;
|
|
|
|
bool m_runningGarnetStandalone;
|
|
|
|
EventFunctionWrapper deadlockCheckEvent;
|
|
bool assumingRfOCoherence;
|
|
|
|
// TODO - Need to update the following stats once the VIPER protocol
|
|
// is re-integrated.
|
|
// // m5 style stats for TCP hit/miss counts
|
|
// Stats::Scalar GPU_TCPLdHits;
|
|
// Stats::Scalar GPU_TCPLdTransfers;
|
|
// Stats::Scalar GPU_TCCLdHits;
|
|
// Stats::Scalar GPU_LdMiss;
|
|
//
|
|
// Stats::Scalar GPU_TCPStHits;
|
|
// Stats::Scalar GPU_TCPStTransfers;
|
|
// Stats::Scalar GPU_TCCStHits;
|
|
// Stats::Scalar GPU_StMiss;
|
|
//
|
|
// Stats::Scalar CP_TCPLdHits;
|
|
// Stats::Scalar CP_TCPLdTransfers;
|
|
// Stats::Scalar CP_TCCLdHits;
|
|
// Stats::Scalar CP_LdMiss;
|
|
//
|
|
// Stats::Scalar CP_TCPStHits;
|
|
// Stats::Scalar CP_TCPStTransfers;
|
|
// Stats::Scalar CP_TCCStHits;
|
|
// Stats::Scalar CP_StMiss;
|
|
|
|
//! Histogram for number of outstanding requests per cycle.
|
|
Stats::Histogram m_outstandReqHist;
|
|
|
|
//! Histogram for holding latency profile of all requests.
|
|
Stats::Histogram m_latencyHist;
|
|
std::vector<Stats::Histogram *> m_typeLatencyHist;
|
|
|
|
//! Histogram for holding latency profile of all requests that
|
|
//! miss in the controller connected to this sequencer.
|
|
Stats::Histogram m_missLatencyHist;
|
|
std::vector<Stats::Histogram *> m_missTypeLatencyHist;
|
|
|
|
//! Histograms for profiling the latencies for requests that
|
|
//! required external messages.
|
|
std::vector<Stats::Histogram *> m_missMachLatencyHist;
|
|
std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
|
|
|
|
//! Histograms for recording the breakdown of miss latency
|
|
std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
|
|
std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
|
|
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
|
|
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
|
|
|
|
// TODO - Need to update the following stats once the VIPER protocol
|
|
// is re-integrated.
|
|
// Stats::Distribution numHopDelays;
|
|
// Stats::Distribution tcpToTccDelay;
|
|
// Stats::Distribution tccToSdDelay;
|
|
// Stats::Distribution sdToSdDelay;
|
|
// Stats::Distribution sdToTccDelay;
|
|
// Stats::Distribution tccToTcpDelay;
|
|
//
|
|
// Stats::Average avgTcpToTcc;
|
|
// Stats::Average avgTccToSd;
|
|
// Stats::Average avgSdToSd;
|
|
// Stats::Average avgSdToTcc;
|
|
// Stats::Average avgTccToTcp;
|
|
|
|
private:
|
|
// Token port is used to send/receive tokens to/from GPU's global memory
|
|
// pipeline across the port boundary. There is one per <wave size> data
|
|
// ports in the CU.
|
|
GMTokenPort gmTokenPort;
|
|
|
|
// Private copy constructor and assignment operator
|
|
GPUCoalescer(const GPUCoalescer& obj);
|
|
GPUCoalescer& operator=(const GPUCoalescer& obj);
|
|
};
|
|
|
|
inline std::ostream&
|
|
operator<<(std::ostream& out, const GPUCoalescer& obj)
|
|
{
|
|
obj.print(out);
|
|
out << std::flush;
|
|
return out;
|
|
}
|
|
|
|
#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|