There is a bug in the GPUCoalescer which occurs in the following situation: 1) An instruction crosses a page boundary, causing multiple TLB requests to be sent. 2) The TLB responses arrive at different times, causing the vector memory requests to be sent at different times. 3) The first vector memory request completes before the second vector memory request arrives at the coalescer. This caused the coalescer to consider the instruction sequence number done and return its token. Then the second request would arrive and complete, sending back another token. Eventually this increases the token count beyond the maximum, tripping an assert. This change keeps track of the number of per-lane requests which are expected to be sent in the vector memory request by looking at the exec mask of the instruction. The token is not returned until the expected number of per-lane requests have been coalesced. This fixes "#7" in the list of issues in JIRA-300. There are also style fixes for local variables in code near the changes in this CL. Change-Id: I152fd9397920ad82ba6079112908387e71ff3cce JIRA: https://gem5.atlassian.net/browse/GEM5-300 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/35176 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Reviewed-by: Kyle Roarty <kyleroarty1716@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
536 lines
18 KiB
C++
536 lines
18 KiB
C++
/*
|
|
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|
|
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|
|
|
|
#include <iostream>
|
|
#include <unordered_map>
|
|
|
|
#include "base/statistics.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/misc.hh"
|
|
#include "mem/request.hh"
|
|
#include "mem/ruby/common/Address.hh"
|
|
#include "mem/ruby/common/Consumer.hh"
|
|
#include "mem/ruby/protocol/PrefetchBit.hh"
|
|
#include "mem/ruby/protocol/RubyAccessMode.hh"
|
|
#include "mem/ruby/protocol/RubyRequestType.hh"
|
|
#include "mem/ruby/protocol/SequencerRequestType.hh"
|
|
#include "mem/ruby/system/Sequencer.hh"
|
|
#include "mem/token_port.hh"
|
|
|
|
class DataBlock;
|
|
class CacheMsg;
|
|
class MachineID;
|
|
class CacheMemory;
|
|
|
|
class RubyGPUCoalescerParams;
|
|
|
|
// List of packets that belongs to a specific instruction.
|
|
typedef std::list<PacketPtr> PerInstPackets;
|
|
|
|
class UncoalescedTable
|
|
{
|
|
public:
|
|
UncoalescedTable(GPUCoalescer *gc);
|
|
~UncoalescedTable() {}
|
|
|
|
void insertPacket(PacketPtr pkt);
|
|
bool packetAvailable();
|
|
void printRequestTable(std::stringstream& ss);
|
|
|
|
// Modify packets remaining map. Init sets value iff the seqNum has not
|
|
// yet been seen before. get/set act as a regular getter/setter.
|
|
void initPacketsRemaining(InstSeqNum seqNum, int count);
|
|
int getPacketsRemaining(InstSeqNum seqNum);
|
|
void setPacketsRemaining(InstSeqNum seqNum, int count);
|
|
|
|
// Returns a pointer to the list of packets corresponding to an
|
|
// instruction in the instruction map or nullptr if there are no
|
|
// instructions at the offset.
|
|
PerInstPackets* getInstPackets(int offset);
|
|
void updateResources();
|
|
bool areRequestsDone(const InstSeqNum instSeqNum);
|
|
|
|
// Check if a packet hasn't been removed from instMap in too long.
|
|
// Panics if a deadlock is detected and returns nothing otherwise.
|
|
void checkDeadlock(Tick threshold);
|
|
|
|
private:
|
|
GPUCoalescer *coalescer;
|
|
|
|
// Maps an instructions unique sequence number to a queue of packets
|
|
// which need responses. This data structure assumes the sequence number
|
|
// is monotonically increasing (which is true for CU class) in order to
|
|
// issue packets in age order.
|
|
std::map<InstSeqNum, PerInstPackets> instMap;
|
|
|
|
std::map<InstSeqNum, int> instPktsRemaining;
|
|
};
|
|
|
|
class CoalescedRequest
|
|
{
|
|
public:
|
|
CoalescedRequest(uint64_t _seqNum)
|
|
: seqNum(_seqNum), issueTime(Cycles(0)),
|
|
rubyType(RubyRequestType_NULL)
|
|
{}
|
|
~CoalescedRequest() {}
|
|
|
|
void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
|
|
void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
|
|
void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
|
|
void setRubyType(RubyRequestType type) { rubyType = type; }
|
|
|
|
uint64_t getSeqNum() const { return seqNum; }
|
|
PacketPtr getFirstPkt() const { return pkts[0]; }
|
|
Cycles getIssueTime() const { return issueTime; }
|
|
RubyRequestType getRubyType() const { return rubyType; }
|
|
std::vector<PacketPtr>& getPackets() { return pkts; }
|
|
|
|
private:
|
|
uint64_t seqNum;
|
|
Cycles issueTime;
|
|
RubyRequestType rubyType;
|
|
std::vector<PacketPtr> pkts;
|
|
};
|
|
|
|
// PendingWriteInst tracks the number of outstanding Ruby requests
|
|
// per write instruction. Once all requests associated with one instruction
|
|
// are completely done in Ruby, we call back the requestor to mark
|
|
// that this instruction is complete.
|
|
class PendingWriteInst
|
|
{
|
|
public:
|
|
PendingWriteInst()
|
|
: numPendingStores(0),
|
|
originalPort(nullptr),
|
|
gpuDynInstPtr(nullptr)
|
|
{}
|
|
|
|
~PendingWriteInst()
|
|
{}
|
|
|
|
void
|
|
addPendingReq(RubyPort::MemResponsePort* port, GPUDynInstPtr inst,
|
|
bool usingRubyTester)
|
|
{
|
|
assert(port);
|
|
originalPort = port;
|
|
|
|
if (!usingRubyTester) {
|
|
gpuDynInstPtr = inst;
|
|
}
|
|
|
|
numPendingStores++;
|
|
}
|
|
|
|
// return true if no more ack is expected
|
|
bool
|
|
receiveWriteCompleteAck()
|
|
{
|
|
assert(numPendingStores > 0);
|
|
numPendingStores--;
|
|
return (numPendingStores == 0) ? true : false;
|
|
}
|
|
|
|
// ack the original requestor that this write instruction is complete
|
|
void
|
|
ackWriteCompletion(bool usingRubyTester)
|
|
{
|
|
assert(numPendingStores == 0);
|
|
|
|
// make a response packet
|
|
PacketPtr pkt = new Packet(std::make_shared<Request>(),
|
|
MemCmd::WriteCompleteResp);
|
|
|
|
if (!usingRubyTester) {
|
|
assert(gpuDynInstPtr);
|
|
ComputeUnit::DataPort::SenderState* ss =
|
|
new ComputeUnit::DataPort::SenderState
|
|
(gpuDynInstPtr, 0, nullptr);
|
|
pkt->senderState = ss;
|
|
}
|
|
|
|
// send the ack response to the requestor
|
|
originalPort->sendTimingResp(pkt);
|
|
}
|
|
|
|
int
|
|
getNumPendingStores() {
|
|
return numPendingStores;
|
|
}
|
|
|
|
private:
|
|
// the number of stores waiting for writeCompleteCallback
|
|
int numPendingStores;
|
|
// The original port that sent one of packets associated with this
|
|
// write instruction. We may have more than one packet per instruction,
|
|
// which implies multiple ports per instruction. However, we need
|
|
// only 1 of the ports to call back the CU. Therefore, here we keep
|
|
// track the port that sent the first packet of this instruction.
|
|
RubyPort::MemResponsePort* originalPort;
|
|
// similar to the originalPort, this gpuDynInstPtr is set only for
|
|
// the first packet of this instruction.
|
|
GPUDynInstPtr gpuDynInstPtr;
|
|
};
|
|
|
|
class GPUCoalescer : public RubyPort
|
|
{
|
|
public:
|
|
class GMTokenPort : public TokenResponsePort
|
|
{
|
|
public:
|
|
GMTokenPort(const std::string& name, ClockedObject *owner,
|
|
PortID id = InvalidPortID)
|
|
: TokenResponsePort(name, owner, id)
|
|
{ }
|
|
~GMTokenPort() { }
|
|
|
|
protected:
|
|
Tick recvAtomic(PacketPtr) { return Tick(0); }
|
|
void recvFunctional(PacketPtr) { }
|
|
bool recvTimingReq(PacketPtr) { return false; }
|
|
AddrRangeList getAddrRanges() const
|
|
{
|
|
AddrRangeList ranges;
|
|
return ranges;
|
|
}
|
|
};
|
|
|
|
typedef RubyGPUCoalescerParams Params;
|
|
GPUCoalescer(const Params *);
|
|
~GPUCoalescer();
|
|
|
|
Port &getPort(const std::string &if_name,
|
|
PortID idx = InvalidPortID) override;
|
|
|
|
// Public Methods
|
|
void wakeup(); // Used only for deadlock detection
|
|
void printRequestTable(std::stringstream& ss);
|
|
|
|
void printProgress(std::ostream& out) const;
|
|
void resetStats() override;
|
|
void collateStats();
|
|
void regStats() override;
|
|
|
|
// each store request needs two callbacks:
|
|
// (1) writeCallback is called when the store is received and processed
|
|
// by TCP. This writeCallback does not guarantee the store is actually
|
|
// completed at its destination cache or memory. writeCallback helps
|
|
// release hardware resources (e.g., its entry in coalescedTable)
|
|
// allocated for the store so that subsequent requests will not be
|
|
// blocked unnecessarily due to hardware resource constraints.
|
|
// (2) writeCompleteCallback is called when the store is fully completed
|
|
// at its destination cache or memory. writeCompleteCallback
|
|
// guarantees that the store is fully completed. This callback
|
|
// will decrement hardware counters in CU
|
|
void writeCallback(Addr address, DataBlock& data);
|
|
|
|
void writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data);
|
|
|
|
void writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion);
|
|
|
|
void writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime);
|
|
|
|
void writeCompleteCallback(Addr address,
|
|
uint64_t instSeqNum,
|
|
MachineType mach);
|
|
|
|
void readCallback(Addr address, DataBlock& data);
|
|
|
|
void readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data);
|
|
|
|
void readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime);
|
|
|
|
void readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion);
|
|
|
|
/* atomics need their own callback because the data
|
|
might be const coming from SLICC */
|
|
virtual void atomicCallback(Addr address,
|
|
MachineType mach,
|
|
const DataBlock& data);
|
|
|
|
RequestStatus makeRequest(PacketPtr pkt) override;
|
|
int outstandingCount() const override { return m_outstanding_count; }
|
|
|
|
bool
|
|
isDeadlockEventScheduled() const override
|
|
{
|
|
return deadlockCheckEvent.scheduled();
|
|
}
|
|
|
|
void
|
|
descheduleDeadlockEvent() override
|
|
{
|
|
deschedule(deadlockCheckEvent);
|
|
}
|
|
|
|
bool empty() const;
|
|
|
|
void print(std::ostream& out) const;
|
|
|
|
void evictionCallback(Addr address);
|
|
void completeIssue();
|
|
|
|
void insertKernel(int wavefront_id, PacketPtr pkt);
|
|
|
|
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
|
|
|
|
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
|
|
|
|
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
|
|
Stats::Histogram& getTypeLatencyHist(uint32_t t)
|
|
{ return *m_typeLatencyHist[t]; }
|
|
|
|
Stats::Histogram& getMissLatencyHist()
|
|
{ return m_missLatencyHist; }
|
|
Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
|
|
{ return *m_missTypeLatencyHist[t]; }
|
|
|
|
Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
|
|
{ return *m_missMachLatencyHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
|
|
{ return *m_missTypeMachLatencyHist[r][t]; }
|
|
|
|
Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
|
|
{ return *m_IssueToInitialDelayHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getInitialToForwardDelayHist(const MachineType t) const
|
|
{ return *m_InitialToForwardDelayHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getForwardRequestToFirstResponseHist(const MachineType t) const
|
|
{ return *m_ForwardToFirstResponseDelayHist[t]; }
|
|
|
|
Stats::Histogram&
|
|
getFirstResponseToCompletionDelayHist(const MachineType t) const
|
|
{ return *m_FirstResponseToCompletionDelayHist[t]; }
|
|
|
|
protected:
|
|
bool tryCacheAccess(Addr addr, RubyRequestType type,
|
|
Addr pc, RubyAccessMode access_mode,
|
|
int size, DataBlock*& data_ptr);
|
|
|
|
// since the two following issue functions are protocol-specific,
|
|
// they must be implemented in a derived coalescer
|
|
virtual void issueRequest(CoalescedRequest* crequest) = 0;
|
|
virtual void issueMemSyncRequest(PacketPtr pkt) {}
|
|
|
|
void kernelCallback(int wavefront_id);
|
|
|
|
void hitCallback(CoalescedRequest* crequest,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
bool success,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion);
|
|
void recordMissLatency(CoalescedRequest* crequest,
|
|
MachineType mach,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool success, bool isRegion);
|
|
void completeHitCallback(std::vector<PacketPtr> & mylist);
|
|
|
|
virtual RubyRequestType getRequestType(PacketPtr pkt);
|
|
|
|
GPUDynInstPtr getDynInst(PacketPtr pkt) const;
|
|
|
|
// Attempt to remove a packet from the uncoalescedTable and coalesce
|
|
// with a previous request from the same instruction. If there is no
|
|
// previous instruction and the max number of outstanding requests has
|
|
// not be reached, a new coalesced request is created and added to the
|
|
// "target" list of the coalescedTable.
|
|
bool coalescePacket(PacketPtr pkt);
|
|
|
|
EventFunctionWrapper issueEvent;
|
|
|
|
protected:
|
|
int m_max_outstanding_requests;
|
|
Cycles m_deadlock_threshold;
|
|
|
|
CacheMemory* m_dataCache_ptr;
|
|
CacheMemory* m_instCache_ptr;
|
|
|
|
// coalescingWindow is the maximum number of instructions that are
|
|
// allowed to be coalesced in a single cycle.
|
|
int coalescingWindow;
|
|
|
|
// The uncoalescedTable contains several "columns" which hold memory
|
|
// request packets for an instruction. The maximum size is the number of
|
|
// columns * the wavefront size.
|
|
UncoalescedTable uncoalescedTable;
|
|
|
|
// An MSHR-like struct for holding coalesced requests. The requests in
|
|
// this table may or may not be outstanding in the memory hierarchy. The
|
|
// maximum size is equal to the maximum outstanding requests for a CU
|
|
// (typically the number of blocks in TCP). If there are duplicates of
|
|
// an address, the are serviced in age order.
|
|
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
|
|
|
|
// a map btw an instruction sequence number and PendingWriteInst
|
|
// this is used to do a final call back for each write when it is
|
|
// completely done in the memory system
|
|
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
|
|
|
|
// Global outstanding request count, across all request tables
|
|
int m_outstanding_count;
|
|
bool m_deadlock_check_scheduled;
|
|
std::unordered_map<int, PacketPtr> kernelEndList;
|
|
std::vector<int> newKernelEnds;
|
|
|
|
int m_store_waiting_on_load_cycles;
|
|
int m_store_waiting_on_store_cycles;
|
|
int m_load_waiting_on_store_cycles;
|
|
int m_load_waiting_on_load_cycles;
|
|
|
|
bool m_runningGarnetStandalone;
|
|
|
|
EventFunctionWrapper deadlockCheckEvent;
|
|
bool assumingRfOCoherence;
|
|
|
|
// TODO - Need to update the following stats once the VIPER protocol
|
|
// is re-integrated.
|
|
// // m5 style stats for TCP hit/miss counts
|
|
// Stats::Scalar GPU_TCPLdHits;
|
|
// Stats::Scalar GPU_TCPLdTransfers;
|
|
// Stats::Scalar GPU_TCCLdHits;
|
|
// Stats::Scalar GPU_LdMiss;
|
|
//
|
|
// Stats::Scalar GPU_TCPStHits;
|
|
// Stats::Scalar GPU_TCPStTransfers;
|
|
// Stats::Scalar GPU_TCCStHits;
|
|
// Stats::Scalar GPU_StMiss;
|
|
//
|
|
// Stats::Scalar CP_TCPLdHits;
|
|
// Stats::Scalar CP_TCPLdTransfers;
|
|
// Stats::Scalar CP_TCCLdHits;
|
|
// Stats::Scalar CP_LdMiss;
|
|
//
|
|
// Stats::Scalar CP_TCPStHits;
|
|
// Stats::Scalar CP_TCPStTransfers;
|
|
// Stats::Scalar CP_TCCStHits;
|
|
// Stats::Scalar CP_StMiss;
|
|
|
|
//! Histogram for number of outstanding requests per cycle.
|
|
Stats::Histogram m_outstandReqHist;
|
|
|
|
//! Histogram for holding latency profile of all requests.
|
|
Stats::Histogram m_latencyHist;
|
|
std::vector<Stats::Histogram *> m_typeLatencyHist;
|
|
|
|
//! Histogram for holding latency profile of all requests that
|
|
//! miss in the controller connected to this sequencer.
|
|
Stats::Histogram m_missLatencyHist;
|
|
std::vector<Stats::Histogram *> m_missTypeLatencyHist;
|
|
|
|
//! Histograms for profiling the latencies for requests that
|
|
//! required external messages.
|
|
std::vector<Stats::Histogram *> m_missMachLatencyHist;
|
|
std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
|
|
|
|
//! Histograms for recording the breakdown of miss latency
|
|
std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
|
|
std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
|
|
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
|
|
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
|
|
|
|
// TODO - Need to update the following stats once the VIPER protocol
|
|
// is re-integrated.
|
|
// Stats::Distribution numHopDelays;
|
|
// Stats::Distribution tcpToTccDelay;
|
|
// Stats::Distribution tccToSdDelay;
|
|
// Stats::Distribution sdToSdDelay;
|
|
// Stats::Distribution sdToTccDelay;
|
|
// Stats::Distribution tccToTcpDelay;
|
|
//
|
|
// Stats::Average avgTcpToTcc;
|
|
// Stats::Average avgTccToSd;
|
|
// Stats::Average avgSdToSd;
|
|
// Stats::Average avgSdToTcc;
|
|
// Stats::Average avgTccToTcp;
|
|
|
|
private:
|
|
// Token port is used to send/receive tokens to/from GPU's global memory
|
|
// pipeline across the port boundary. There is one per <wave size> data
|
|
// ports in the CU.
|
|
GMTokenPort gmTokenPort;
|
|
|
|
// Private copy constructor and assignment operator
|
|
GPUCoalescer(const GPUCoalescer& obj);
|
|
GPUCoalescer& operator=(const GPUCoalescer& obj);
|
|
};
|
|
|
|
inline std::ostream&
|
|
operator<<(std::ostream& out, const GPUCoalescer& obj)
|
|
{
|
|
obj.print(out);
|
|
out << std::flush;
|
|
return out;
|
|
}
|
|
|
|
#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|