The GPUDynInst for sending memory requests through the CU's data port is required but only used for DPRINTFs. Relax this constraint so that the methods can be reused for requests such as probes generated by the GPU device. Change-Id: I16094e400968225596370b684d6471580888d98a
1030 lines
35 KiB
C++
1030 lines
35 KiB
C++
/*
|
|
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "mem/ruby/system/GPUCoalescer.hh"
|
|
|
|
#include "base/compiler.hh"
|
|
#include "base/logging.hh"
|
|
#include "base/str.hh"
|
|
#include "cpu/testers/rubytest/RubyTester.hh"
|
|
#include "debug/GPUCoalescer.hh"
|
|
#include "debug/MemoryAccess.hh"
|
|
#include "debug/ProtocolTrace.hh"
|
|
#include "debug/RubyPort.hh"
|
|
#include "debug/RubyStats.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "mem/packet.hh"
|
|
#include "mem/ruby/common/DataBlock.hh"
|
|
#include "mem/ruby/common/SubBlock.hh"
|
|
#include "mem/ruby/network/MessageBuffer.hh"
|
|
#include "mem/ruby/profiler/Profiler.hh"
|
|
#include "mem/ruby/slicc_interface/AbstractController.hh"
|
|
#include "mem/ruby/slicc_interface/RubyRequest.hh"
|
|
#include "mem/ruby/structures/CacheMemory.hh"
|
|
#include "mem/ruby/system/RubySystem.hh"
|
|
#include "params/RubyGPUCoalescer.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
namespace ruby
|
|
{
|
|
|
|
// Build an uncoalesced table bound to the coalescer that owns it.
// The pointer is kept in `coalescer`; other methods of this class use it to
// obtain the coalescer's name, id and GM token port.
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc) : coalescer(gc) {}
|
|
|
|
void
|
|
UncoalescedTable::insertPacket(PacketPtr pkt)
|
|
{
|
|
uint64_t seqNum = pkt->req->getReqInstSeqNum();
|
|
|
|
instMap[seqNum].push_back(pkt);
|
|
DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
|
|
pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
|
|
}
|
|
|
|
void
|
|
UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
|
|
{
|
|
uint64_t seqNum = pkt->req->getReqInstSeqNum();
|
|
|
|
reqTypeMap[seqNum] = type;
|
|
}
|
|
|
|
bool
|
|
UncoalescedTable::packetAvailable()
|
|
{
|
|
return !instMap.empty();
|
|
}
|
|
|
|
void
|
|
UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
|
|
{
|
|
if (!instPktsRemaining.count(seqNum)) {
|
|
instPktsRemaining[seqNum] = count;
|
|
}
|
|
}
|
|
|
|
int
|
|
UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
|
|
{
|
|
return instPktsRemaining[seqNum];
|
|
}
|
|
|
|
void
|
|
UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
|
|
{
|
|
instPktsRemaining[seqNum] = count;
|
|
}
|
|
|
|
// Return a pointer to the packet list of the instruction found `offset`
// positions from the beginning of instMap, or nullptr when the table holds
// no instruction at that position. The pointer refers to storage owned by
// instMap.
PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset < instMap.size()) {
        auto entry = std::next(instMap.begin(), offset);
        return &(entry->second);
    }

    return nullptr;
}
|
|
|
|
void
|
|
UncoalescedTable::updateResources()
|
|
{
|
|
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
|
|
InstSeqNum seq_num = iter->first;
|
|
DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
|
|
coalescer->name().c_str(), seq_num);
|
|
assert(instPktsRemaining.count(seq_num));
|
|
|
|
if (instPktsRemaining[seq_num] == 0) {
|
|
assert(iter->second.empty());
|
|
|
|
// Remove from both maps
|
|
instMap.erase(iter++);
|
|
instPktsRemaining.erase(seq_num);
|
|
|
|
// Release the token if the Ruby system is not in cooldown
|
|
// or warmup phases. When in these phases, the RubyPorts
|
|
// are accessed directly using the makeRequest() command
|
|
// instead of accessing through the port. This makes
|
|
// sending tokens through the port unnecessary
|
|
if (!RubySystem::getWarmupEnabled()
|
|
&& !RubySystem::getCooldownEnabled()) {
|
|
if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
|
|
DPRINTF(GPUCoalescer,
|
|
"Returning token seqNum %d\n", seq_num);
|
|
coalescer->getGMTokenPort().sendTokens(1);
|
|
}
|
|
}
|
|
|
|
reqTypeMap.erase(seq_num);
|
|
} else {
|
|
++iter;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
|
|
// iterate the instructions held in UncoalescedTable to see whether there
|
|
// are more requests to issue; if yes, not yet done; otherwise, done
|
|
for (auto& inst : instMap) {
|
|
DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
|
|
,inst.first, inst.second.size());
|
|
if (inst.first == instSeqNum) { return false; }
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
UncoalescedTable::printRequestTable(std::stringstream& ss)
|
|
{
|
|
ss << "Listing pending packets from " << instMap.size() << " instructions";
|
|
|
|
for (auto& inst : instMap) {
|
|
ss << "\tAddr: " << printAddress(inst.first) << " with "
|
|
<< inst.second.size() << " pending packets" << std::endl;
|
|
}
|
|
}
|
|
|
|
void
|
|
UncoalescedTable::checkDeadlock(Tick threshold)
|
|
{
|
|
Tick current_time = curTick();
|
|
|
|
for (auto &it : instMap) {
|
|
for (auto &pkt : it.second) {
|
|
if (current_time - pkt->req->time() > threshold) {
|
|
std::stringstream ss;
|
|
printRequestTable(ss);
|
|
|
|
panic("Possible Deadlock detected. Aborting!\n"
|
|
"version: %d request.paddr: 0x%x uncoalescedTable: %d "
|
|
"current time: %u issue_time: %d difference: %d\n"
|
|
"Request Tables:\n\n%s", coalescer->getId(),
|
|
pkt->getAddr(), instMap.size(), current_time,
|
|
pkt->req->time(), current_time - pkt->req->time(),
|
|
ss.str());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Construct the coalescer from its parameters: bind the issue event to
// completeIssue() and the deadlock-check event to wakeup(), create the
// uncoalesced table and the GM token port, copy the configuration values out
// of `p`, and allocate the latency histograms.
GPUCoalescer::GPUCoalescer(const Params &p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort")
{
    // Counters start from zero.
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;
    m_outstanding_count = 0;

    // Configuration taken from the parameter object.
    coalescingWindow = p.max_coalesces_per_cycle;
    m_instCache_ptr = p.icache;
    m_dataCache_ptr = p.dcache;
    m_max_outstanding_requests = p.max_outstanding_requests;
    m_deadlock_threshold = p.deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p.garnet_standalone;

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    // All per-type / per-machine histograms are heap-allocated and then
    // passed the same init(10) call.
    auto make_hist = [] {
        auto *hist = new statistics::Histogram();
        hist->init(10);
        return hist;
    };

    // One pair of histograms per Ruby request type.
    for (int type = 0; type < RubyRequestType_NUM; ++type) {
        m_typeLatencyHist.push_back(make_hist());
        m_missTypeLatencyHist.push_back(make_hist());
    }

    // One set of histograms per machine type.
    for (int mach = 0; mach < MachineType_NUM; ++mach) {
        m_missMachLatencyHist.push_back(make_hist());
        m_IssueToInitialDelayHist.push_back(make_hist());
        m_InitialToForwardDelayHist.push_back(make_hist());
        m_ForwardToFirstResponseDelayHist.push_back(make_hist());
        m_FirstResponseToCompletionDelayHist.push_back(make_hist());
    }

    // One histogram per (request type, machine type) combination.
    for (int type = 0; type < RubyRequestType_NUM; ++type) {
        m_missTypeMachLatencyHist.push_back(
            std::vector<statistics::Histogram *>());
        auto &per_mach = m_missTypeMachLatencyHist.back();

        for (int mach = 0; mach < MachineType_NUM; ++mach) {
            per_mach.push_back(make_hist());
        }
    }
}
|
|
|
|
// No explicit teardown is performed here; members are destroyed by their own
// destructors.
// NOTE(review): the histograms allocated with `new` in the constructor are
// not released here - confirm that this is intended.
GPUCoalescer::~GPUCoalescer() = default;
|
|
|
|
// Resolve a port by interface name. "gmTokenPort" maps to this coalescer's
// token port; any other name is delegated to RubyPort::getPort().
Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name != "gmTokenPort") {
        // Not ours: let the RubyPort base class look it up.
        return RubyPort::getPort(if_name, idx);
    }

    return gmTokenPort;
}
|
|
|
|
void
|
|
GPUCoalescer::wakeup()
|
|
{
|
|
Cycles current_time = curCycle();
|
|
for (auto& requestList : coalescedTable) {
|
|
for (auto& req : requestList.second) {
|
|
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
|
|
std::stringstream ss;
|
|
printRequestTable(ss);
|
|
warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
|
|
m_version, ss.str());
|
|
panic("Aborting due to deadlock!\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
|
|
uncoalescedTable.checkDeadlock(tick_threshold);
|
|
|
|
if (m_outstanding_count > 0) {
|
|
schedule(deadlockCheckEvent,
|
|
m_deadlock_threshold * clockPeriod() +
|
|
curTick());
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::printRequestTable(std::stringstream& ss)
|
|
{
|
|
ss << "Printing out " << coalescedTable.size()
|
|
<< " outstanding requests in the coalesced table\n";
|
|
|
|
for (auto& requestList : coalescedTable) {
|
|
for (auto& request : requestList.second) {
|
|
ss << "\tAddr: " << printAddress(requestList.first) << "\n"
|
|
<< "\tInstruction sequence number: "
|
|
<< request->getSeqNum() << "\n"
|
|
<< "\t\tType: "
|
|
<< RubyRequestType_to_string(request->getRubyType()) << "\n"
|
|
<< "\t\tNumber of associated packets: "
|
|
<< request->getPackets().size() << "\n"
|
|
<< "\t\tIssue time: "
|
|
<< request->getIssueTime() * clockPeriod() << "\n"
|
|
<< "\t\tDifference from current tick: "
|
|
<< (curCycle() - request->getIssueTime()) * clockPeriod()
|
|
<< "\n";
|
|
}
|
|
}
|
|
|
|
// print out packets waiting to be issued in uncoalesced table
|
|
uncoalescedTable.printRequestTable(ss);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::resetStats()
|
|
{
|
|
m_latencyHist.reset();
|
|
m_missLatencyHist.reset();
|
|
for (int i = 0; i < RubyRequestType_NUM; i++) {
|
|
m_typeLatencyHist[i]->reset();
|
|
m_missTypeLatencyHist[i]->reset();
|
|
for (int j = 0; j < MachineType_NUM; j++) {
|
|
m_missTypeMachLatencyHist[i][j]->reset();
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < MachineType_NUM; i++) {
|
|
m_missMachLatencyHist[i]->reset();
|
|
|
|
m_IssueToInitialDelayHist[i]->reset();
|
|
m_InitialToForwardDelayHist[i]->reset();
|
|
m_ForwardToFirstResponseDelayHist[i]->reset();
|
|
m_FirstResponseToCompletionDelayHist[i]->reset();
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::printProgress(std::ostream& out) const
|
|
{
|
|
}
|
|
|
|
// sets the kernelEndList
|
|
void
|
|
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
|
|
{
|
|
// Don't know if this will happen or is possible
|
|
// but I just want to be careful and not have it become
|
|
// simulator hang in the future
|
|
DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
|
|
assert(kernelEndList.count(wavefront_id) == 0);
|
|
|
|
kernelEndList[wavefront_id] = pkt;
|
|
DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
|
|
kernelEndList.size());
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
|
|
{
|
|
writeCallback(address, MachineType_NULL, data);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data)
|
|
{
|
|
writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime)
|
|
{
|
|
writeCallback(address, mach, data,
|
|
initialRequestTime, forwardRequestTime, firstResponseTime,
|
|
false);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::writeCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion)
|
|
{
|
|
assert(address == makeLineAddress(address));
|
|
assert(coalescedTable.count(address));
|
|
|
|
auto crequest = coalescedTable.at(address).front();
|
|
|
|
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
|
|
forwardRequestTime, firstResponseTime, isRegion);
|
|
|
|
// remove this crequest in coalescedTable
|
|
delete crequest;
|
|
coalescedTable.at(address).pop_front();
|
|
|
|
if (coalescedTable.at(address).empty()) {
|
|
coalescedTable.erase(address);
|
|
} else {
|
|
auto nextRequest = coalescedTable.at(address).front();
|
|
issueRequest(nextRequest);
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::writeCompleteCallback(Addr address,
|
|
uint64_t instSeqNum,
|
|
MachineType mach)
|
|
{
|
|
DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
|
|
" instSeqNum = %d\n", address, instSeqNum);
|
|
|
|
assert(pendingWriteInsts.count(instSeqNum) == 1);
|
|
PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
|
|
|
|
// check the uncoalescedTable to see whether all requests for the inst
|
|
// have been issued or not
|
|
bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
|
|
DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
|
|
"reqsAllIssued=%d\n", reqsAllIssued,
|
|
inst.getNumPendingStores()-1, reqsAllIssued);
|
|
|
|
if (inst.receiveWriteCompleteAck() && reqsAllIssued ) {
|
|
// if the pending write instruction has received all write completion
|
|
// callbacks for its issued Ruby requests, we can now start respond
|
|
// the requesting CU in one response packet.
|
|
inst.ackWriteCompletion(m_usingRubyTester);
|
|
|
|
DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
|
|
instSeqNum);
|
|
pendingWriteInsts.erase(instSeqNum);
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::readCallback(Addr address, DataBlock& data)
|
|
{
|
|
readCallback(address, MachineType_NULL, data);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data)
|
|
{
|
|
readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime)
|
|
{
|
|
|
|
readCallback(address, mach, data,
|
|
initialRequestTime, forwardRequestTime, firstResponseTime,
|
|
false);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::readCallback(Addr address,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion)
|
|
{
|
|
assert(address == makeLineAddress(address));
|
|
assert(coalescedTable.count(address));
|
|
|
|
auto crequest = coalescedTable.at(address).front();
|
|
fatal_if(crequest->getRubyType() != RubyRequestType_LD,
|
|
"readCallback received non-read type response\n");
|
|
|
|
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
|
|
forwardRequestTime, firstResponseTime, isRegion);
|
|
|
|
delete crequest;
|
|
coalescedTable.at(address).pop_front();
|
|
if (coalescedTable.at(address).empty()) {
|
|
coalescedTable.erase(address);
|
|
} else {
|
|
auto nextRequest = coalescedTable.at(address).front();
|
|
issueRequest(nextRequest);
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
|
|
MachineType mach,
|
|
DataBlock& data,
|
|
bool success,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool isRegion)
|
|
{
|
|
PacketPtr pkt = crequest->getFirstPkt();
|
|
Addr request_address = pkt->getAddr();
|
|
[[maybe_unused]] Addr request_line_address =
|
|
makeLineAddress(request_address);
|
|
|
|
RubyRequestType type = crequest->getRubyType();
|
|
|
|
DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
|
|
|
|
recordMissLatency(crequest, mach,
|
|
initialRequestTime,
|
|
forwardRequestTime,
|
|
firstResponseTime,
|
|
success, isRegion);
|
|
// update the data
|
|
//
|
|
// MUST ADD DOING THIS FOR EACH REQUEST IN COALESCER
|
|
std::vector<PacketPtr> pktList = crequest->getPackets();
|
|
|
|
uint8_t* log = nullptr;
|
|
DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
|
|
pktList.size(), request_line_address);
|
|
uint32_t offset;
|
|
int pkt_size;
|
|
for (auto& pkt : pktList) {
|
|
offset = getOffset(pkt->getAddr());
|
|
pkt_size = pkt->getSize();
|
|
request_address = pkt->getAddr();
|
|
|
|
// When the Ruby system is cooldown phase, the requests come from
|
|
// the cache recorder. These requests do not get coalesced and
|
|
// do not return valid data.
|
|
if (RubySystem::getCooldownEnabled())
|
|
continue;
|
|
|
|
if (pkt->getPtr<uint8_t>()) {
|
|
switch(type) {
|
|
// Store and AtomicNoReturns follow the same path, as the
|
|
// data response is not needed.
|
|
case RubyRequestType_ATOMIC_NO_RETURN:
|
|
assert(pkt->isAtomicOp());
|
|
break;
|
|
case RubyRequestType_ST:
|
|
break;
|
|
case RubyRequestType_LD:
|
|
pkt->setData(data.getData(offset, pkt_size));
|
|
break;
|
|
case RubyRequestType_ATOMIC_RETURN:
|
|
assert(pkt->isAtomicOp());
|
|
// Atomic operations are performed by the WriteMask
|
|
// in packet order, set by the crequest. Thus, when
|
|
// unpacking the changes from the log, we read from
|
|
// the front of the log to correctly map response
|
|
// data into the packets.
|
|
|
|
// Log entry contains the old value before the current
|
|
// atomic operation occurred.
|
|
log = data.popAtomicLogEntryFront();
|
|
pkt->setData(&log[offset]);
|
|
delete [] log;
|
|
log = nullptr;
|
|
break;
|
|
default:
|
|
panic("Unsupported ruby packet type:%s\n",
|
|
RubyRequestType_to_string(type));
|
|
break;
|
|
}
|
|
} else {
|
|
DPRINTF(MemoryAccess,
|
|
"WARNING. Data not transfered from Ruby to M5 for type " \
|
|
"%s\n",
|
|
RubyRequestType_to_string(type));
|
|
}
|
|
}
|
|
assert(data.numAtomicLogEntries() == 0);
|
|
|
|
m_outstanding_count--;
|
|
assert(m_outstanding_count >= 0);
|
|
|
|
completeHitCallback(pktList);
|
|
}
|
|
|
|
bool
|
|
GPUCoalescer::empty() const
|
|
{
|
|
return coalescedTable.empty();
|
|
}
|
|
|
|
// Map a packet to the Ruby request type used by the coalescer.
// Atomic flags on the request take precedence over the packet's
// read/write/flush classification. Panics for any packet that matches none
// of the handled categories.
RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    const auto &req = pkt->req;

    // These types are not supported or not used in GPU caches.
    assert(!req->isLLSC());
    assert(!req->isLockedRMW());
    assert(!req->isInstFetch());

    if (req->isAtomicReturn()) {
        return RubyRequestType_ATOMIC_RETURN;
    }
    if (req->isAtomicNoReturn()) {
        return RubyRequestType_ATOMIC_NO_RETURN;
    }
    if (pkt->isRead()) {
        return RubyRequestType_LD;
    }
    if (pkt->isWrite()) {
        return RubyRequestType_ST;
    }
    if (pkt->isFlush()) {
        return RubyRequestType_FLUSH;
    }

    panic("Unsupported ruby packet type\n");

    // Same value the function would yield if panic() ever returned.
    return RubyRequestType_NULL;
}
|
|
|
|
// Places an uncoalesced packet in uncoalescedTable. If the packet is a
|
|
// special type (MemFence, scoping, etc), it is issued immediately.
|
|
RequestStatus
|
|
GPUCoalescer::makeRequest(PacketPtr pkt)
|
|
{
|
|
if (pkt->cmd == MemCmd::MemSyncReq) {
|
|
// issue mem_sync requests immediately to the cache system without
|
|
// going through uncoalescedTable like normal LD/ST/Atomic requests
|
|
issueMemSyncRequest(pkt);
|
|
} else {
|
|
// all packets must have valid instruction sequence numbers
|
|
assert(pkt->req->hasInstSeqNum());
|
|
|
|
// otherwise, this must be either read or write command
|
|
assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());
|
|
|
|
InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
|
|
|
|
// in the case of protocol tester, there is one packet per sequence
|
|
// number. The number of packets during simulation depends on the
|
|
// number of lanes actives for that vmem request (i.e., the popcnt
|
|
// of the exec_mask.
|
|
int num_packets = 1;
|
|
|
|
// When Ruby is in warmup or cooldown phase, the requests come from
|
|
// the cache recorder. There is no dynamic instruction associated
|
|
// with these requests either
|
|
if (!RubySystem::getWarmupEnabled()
|
|
&& !RubySystem::getCooldownEnabled()) {
|
|
if (!m_usingRubyTester) {
|
|
num_packets = 0;
|
|
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
|
|
num_packets += getDynInst(pkt)->getLaneStatus(i);
|
|
}
|
|
}
|
|
}
|
|
|
|
// the pkt is temporarily stored in the uncoalesced table until
|
|
// it's picked for coalescing process later in this cycle or in a
|
|
// future cycle. Packets remaining is set to the number of excepted
|
|
// requests from the instruction based on its exec_mask.
|
|
uncoalescedTable.insertPacket(pkt);
|
|
uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
|
|
uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
|
|
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
|
|
pkt->getAddr());
|
|
|
|
// we schedule an issue event here to process the uncoalesced table
|
|
// and try to issue Ruby request to cache system
|
|
if (!issueEvent.scheduled()) {
|
|
DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
|
|
seq_num);
|
|
schedule(issueEvent, curTick());
|
|
}
|
|
}
|
|
|
|
// we always return RequestStatus_Issued in this coalescer
|
|
// b/c the coalescer's resouce was checked ealier and the coalescer is
|
|
// queueing up aliased requets in its coalesced table
|
|
return RequestStatus_Issued;
|
|
}
|
|
|
|
// Stream an unordered_map as "[ key=value key=value ]", visiting the
// elements in the container's own iteration order. An empty map prints as
// "[ ]".
template <class KEY, class VALUE>
std::ostream &
operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (const auto &entry : map) {
        out << " " << entry.first << "=" << entry.second;
    }
    out << " ]";

    return out;
}
|
|
|
|
void
|
|
GPUCoalescer::print(std::ostream& out) const
|
|
{
|
|
out << "[GPUCoalescer: " << m_version
|
|
<< ", outstanding requests: " << m_outstanding_count
|
|
<< "]";
|
|
}
|
|
|
|
// Recover the GPU dynamic instruction attached to a packet. The packet's
// sender state must be a RubyPort::SenderState whose predecessor is a
// ComputeUnit::DataPort::SenderState; both conversions go through
// safe_cast.
GPUDynInstPtr
GPUCoalescer::getDynInst(PacketPtr pkt) const
{
    auto *ruby_state =
        safe_cast<RubyPort::SenderState*>(pkt->senderState);
    auto *cu_state =
        safe_cast<ComputeUnit::DataPort::SenderState*>(
            ruby_state->predecessor);

    return cu_state->_gpuDynInst;
}
|
|
|
|
bool
|
|
GPUCoalescer::coalescePacket(PacketPtr pkt)
|
|
{
|
|
uint64_t seqNum = pkt->req->getReqInstSeqNum();
|
|
Addr line_addr = makeLineAddress(pkt->getAddr());
|
|
|
|
// If the packet has the same line address as a request already in the
|
|
// coalescedTable and has the same sequence number, it can be coalesced.
|
|
if (coalescedTable.count(line_addr)) {
|
|
// Search for a previous coalesced request with the same seqNum.
|
|
auto& creqQueue = coalescedTable.at(line_addr);
|
|
auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
|
|
[&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
|
|
);
|
|
if (citer != creqQueue.end()) {
|
|
(*citer)->insertPacket(pkt);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (m_outstanding_count < m_max_outstanding_requests) {
|
|
// This is an "aliased" or new request. Create a RubyRequest and
|
|
// append it to the list of "targets" in the coalescing table.
|
|
DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
|
|
line_addr);
|
|
|
|
CoalescedRequest *creq = new CoalescedRequest(seqNum);
|
|
creq->insertPacket(pkt);
|
|
creq->setRubyType(getRequestType(pkt));
|
|
creq->setIssueTime(curCycle());
|
|
|
|
if (!coalescedTable.count(line_addr)) {
|
|
// If there is no outstanding request for this line address,
|
|
// create a new coalecsed request and issue it immediately.
|
|
auto reqList = std::deque<CoalescedRequest*> { creq };
|
|
coalescedTable.insert(std::make_pair(line_addr, reqList));
|
|
if (!coalescedReqs.count(seqNum)) {
|
|
coalescedReqs.insert(std::make_pair(seqNum, reqList));
|
|
} else {
|
|
coalescedReqs.at(seqNum).push_back(creq);
|
|
}
|
|
} else {
|
|
// The request is for a line address that is already outstanding
|
|
// but for a different instruction. Add it as a new request to be
|
|
// issued when the current outstanding request is completed.
|
|
coalescedTable.at(line_addr).push_back(creq);
|
|
DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
|
|
line_addr, seqNum);
|
|
}
|
|
|
|
// In both cases, requests are added to the coalescing table and will
|
|
// be counted as outstanding requests.
|
|
m_outstanding_count++;
|
|
|
|
// We track all issued or to-be-issued Ruby requests associated with
|
|
// write instructions. An instruction may have multiple Ruby
|
|
// requests.
|
|
if (pkt->cmd == MemCmd::WriteReq) {
|
|
DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
|
|
" the pending write instruction list\n", seqNum,
|
|
line_addr);
|
|
|
|
RubyPort::SenderState* ss =
|
|
safe_cast<RubyPort::SenderState*>(pkt->senderState);
|
|
|
|
// we need to save this port because it will be used to call
|
|
// back the requesting CU when we receive write
|
|
// complete callbacks for all issued Ruby requests of this
|
|
// instruction.
|
|
RubyPort::MemResponsePort* mem_response_port = ss->port;
|
|
|
|
GPUDynInstPtr gpuDynInst = nullptr;
|
|
|
|
if (!m_usingRubyTester) {
|
|
// If this coalescer is connected to a real CU, we need
|
|
// to save the corresponding gpu dynamic instruction.
|
|
// CU will use that instruction to decrement wait counters
|
|
// in the issuing wavefront.
|
|
// For Ruby tester, gpuDynInst == nullptr
|
|
gpuDynInst = getDynInst(pkt);
|
|
}
|
|
|
|
PendingWriteInst& inst = pendingWriteInsts[seqNum];
|
|
inst.addPendingReq(mem_response_port, gpuDynInst,
|
|
m_usingRubyTester);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// The maximum number of outstanding requests have been issued.
|
|
return false;
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::completeIssue()
|
|
{
|
|
// Iterate over the maximum number of instructions we can coalesce
|
|
// per cycle (coalescingWindow).
|
|
for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
|
|
PerInstPackets *pkt_list =
|
|
uncoalescedTable.getInstPackets(instIdx);
|
|
|
|
// getInstPackets will return nullptr if no instruction
|
|
// exists at the current offset.
|
|
if (!pkt_list) {
|
|
break;
|
|
} else if (pkt_list->empty()) {
|
|
// Found something, but it has not been cleaned up by update
|
|
// resources yet. See if there is anything else to coalesce.
|
|
// Assume we can't check anymore if the coalescing window is 1.
|
|
continue;
|
|
} else {
|
|
// All packets in the list have the same seqNum, use first.
|
|
InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();
|
|
|
|
// The difference in list size before and after tells us the
|
|
// number of packets which were coalesced.
|
|
size_t pkt_list_size = pkt_list->size();
|
|
|
|
// Since we have a pointer to the list of packets in the inst,
|
|
// erase them from the list if coalescing is successful and
|
|
// leave them in the list otherwise. This aggressively attempts
|
|
// to coalesce as many packets as possible from the current inst.
|
|
pkt_list->remove_if(
|
|
[&](PacketPtr pkt) { return coalescePacket(pkt); }
|
|
);
|
|
|
|
if (coalescedReqs.count(seq_num)) {
|
|
auto& creqs = coalescedReqs.at(seq_num);
|
|
for (auto creq : creqs) {
|
|
DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
|
|
RubyRequestType_to_string(creq->getRubyType()),
|
|
seq_num);
|
|
issueRequest(creq);
|
|
}
|
|
coalescedReqs.erase(seq_num);
|
|
}
|
|
|
|
assert(pkt_list_size >= pkt_list->size());
|
|
size_t pkt_list_diff = pkt_list_size - pkt_list->size();
|
|
|
|
int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
|
|
num_remaining -= pkt_list_diff;
|
|
assert(num_remaining >= 0);
|
|
|
|
uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
|
|
DPRINTF(GPUCoalescer,
|
|
"Coalesced %d pkts for seqNum %d, %d remaining\n",
|
|
pkt_list_diff, seq_num, num_remaining);
|
|
}
|
|
}
|
|
|
|
// Clean up any instructions in the uncoalesced table that have had
|
|
// all of their packets coalesced and return a token for that column.
|
|
uncoalescedTable.updateResources();
|
|
|
|
// have Kernel End releases been issued this cycle
|
|
int len = newKernelEnds.size();
|
|
for (int i = 0; i < len; i++) {
|
|
kernelCallback(newKernelEnds[i]);
|
|
}
|
|
newKernelEnds.clear();
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::evictionCallback(Addr address)
|
|
{
|
|
ruby_eviction_callback(address);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::kernelCallback(int wavefront_id)
|
|
{
|
|
assert(kernelEndList.count(wavefront_id));
|
|
|
|
ruby_hit_callback(kernelEndList[wavefront_id]);
|
|
|
|
kernelEndList.erase(wavefront_id);
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::atomicCallback(Addr address,
|
|
MachineType mach,
|
|
const DataBlock& data)
|
|
{
|
|
assert(address == makeLineAddress(address));
|
|
assert(coalescedTable.count(address));
|
|
|
|
auto crequest = coalescedTable.at(address).front();
|
|
|
|
fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
|
|
crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
|
|
crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
|
|
"atomicCallback saw non-atomic type response\n");
|
|
|
|
hitCallback(crequest, mach, (DataBlock&)data, true,
|
|
crequest->getIssueTime(), Cycles(0), Cycles(0), false);
|
|
|
|
delete crequest;
|
|
coalescedTable.at(address).pop_front();
|
|
|
|
if (coalescedTable.at(address).empty()) {
|
|
coalescedTable.erase(address);
|
|
} else {
|
|
auto nextRequest = coalescedTable.at(address).front();
|
|
issueRequest(nextRequest);
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
|
|
{
|
|
for (auto& pkt : mylist) {
|
|
// When Ruby is in warmup or cooldown phase, the requests come
|
|
// from the cache recorder. They do not track which port to use
|
|
// and do not need to send the response back
|
|
if (!RubySystem::getWarmupEnabled()
|
|
&& !RubySystem::getCooldownEnabled()) {
|
|
RubyPort::SenderState *ss =
|
|
safe_cast<RubyPort::SenderState *>(pkt->senderState);
|
|
MemResponsePort *port = ss->port;
|
|
assert(port != NULL);
|
|
|
|
pkt->senderState = ss->predecessor;
|
|
|
|
if (pkt->cmd != MemCmd::WriteReq) {
|
|
// for WriteReq, we keep the original senderState until
|
|
// writeCompleteCallback
|
|
delete ss;
|
|
}
|
|
|
|
port->hitCallback(pkt);
|
|
trySendRetries();
|
|
}
|
|
}
|
|
|
|
// We schedule an event in the same tick as hitCallback (similar to
|
|
// makeRequest) rather than calling completeIssue directly to reduce
|
|
// function calls to complete issue. This can only happen if the max
|
|
// outstanding requests is less than the number of slots in the
|
|
// uncoalesced table and makeRequest is not called again.
|
|
if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
|
|
schedule(issueEvent, curTick());
|
|
}
|
|
|
|
RubySystem *rs = m_ruby_system;
|
|
if (RubySystem::getWarmupEnabled()) {
|
|
rs->m_cache_recorder->enqueueNextFetchRequest();
|
|
} else if (RubySystem::getCooldownEnabled()) {
|
|
rs->m_cache_recorder->enqueueNextFlushRequest();
|
|
} else {
|
|
testDrainComplete();
|
|
}
|
|
}
|
|
|
|
void
|
|
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
|
|
MachineType mach,
|
|
Cycles initialRequestTime,
|
|
Cycles forwardRequestTime,
|
|
Cycles firstResponseTime,
|
|
bool success, bool isRegion)
|
|
{
|
|
}
|
|
|
|
} // namespace ruby
|
|
} // namespace gem5
|