gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

View File

@@ -86,6 +86,14 @@ MemCmd::commandInfo[] =
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
/* WriteCompleteResp - The WriteCompleteResp command is needed
* because in the GPU memory model we use a WriteResp to indicate
* that a write has reached the cache controller so we can free
* resources at the coalescer. Later, when the write successfully
* completes we send a WriteCompleteResp to the CU so its wait
* counters can be updated. Wait counters in the CU is how memory
* dependences are handled in the GPU ISA. */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },

View File

@@ -83,6 +83,7 @@ class MemCmd
ReadRespWithInvalidate,
WriteReq,
WriteResp,
WriteCompleteResp,
WritebackDirty,
WritebackClean,
WriteClean, // writes dirty data below without evicting

View File

@@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (in_msg.segment == HSASegment:SPILL) {
trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
} else if (WB) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);

View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
// Interface declaration for the C++ GPUCoalescer object. Marked
// external = "yes" so SLICC does not generate a definition; protocol
// state machines only need the callback signatures listed here.
structure (GPUCoalescer, external = "yes") {
// read callback overloads: (address, data), optionally with the
// responding machine type, three Cycles timing values, and a bool flag.
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// write callback overloads, mirroring the read callback variants.
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// notify the coalescer that a line was evicted from the cache.
void evictionCallback(Addr);
// record command-processor read/write hit/transfer statistics, keyed
// by the requesting and responding machine IDs.
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}
// Interface declaration for the C++ VIPERCoalescer object (VIPER
// protocol's coalescer). Marked external = "yes" so SLICC does not
// generate a definition; only these callback signatures are exposed.
structure (VIPERCoalescer, external = "yes") {
// read callback overloads: (address, data), optionally with the
// responding machine type, three Cycles timing values, and a bool flag.
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// write callback overloads, mirroring the read callback variants.
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// VIPER-specific callbacks for invalidation and writeback completion.
void invCallback(Addr);
void wbCallback(Addr);
// notify the coalescer that a line was evicted from the cache.
void evictionCallback(Addr);
}

View File

@@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc";
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";

View File

@@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") {
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";

View File

@@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
//HSA scopes
enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
UNSPECIFIED, desc="Unspecified scope";
NOSCOPE, desc="Explictly unscoped";
WAVEFRONT, desc="Wavefront scope";
WORKGROUP, desc="Workgroup scope";
DEVICE, desc="Device scope";
SYSTEM, desc="System scope";
}
// HSA segment types
enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
GLOBAL, desc="Global segment";
GROUP, desc="Group segment";
PRIVATE, desc="Private segment";
KERNARG, desc="Kernarg segment";
READONLY, desc="Readonly segment";
SPILL, desc="Spill segment";
ARG, desc="Arg segment";
}
// TesterStatus
enumeration(TesterStatus, desc="...") {

View File

@@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") {
bool checkResourceAvailable(CacheResourceType, Addr);
}
structure (GPUCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void evictionCallback(Addr);
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}
structure (VIPERCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void evictionCallback(Addr);
}
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
HSAScope scope, desc="HSA scope";
HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}

View File

@@ -43,7 +43,6 @@
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"

View File

@@ -35,8 +35,6 @@
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"

View File

@@ -61,58 +61,6 @@
using namespace std;
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
return new GPUCoalescer(this);
}
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
HSAScope accessScope = HSAScope_UNSPECIFIED;
if (req->isScoped()) {
if (req->isWavefrontScope()) {
accessScope = HSAScope_WAVEFRONT;
} else if (req->isWorkgroupScope()) {
accessScope = HSAScope_WORKGROUP;
} else if (req->isDeviceScope()) {
accessScope = HSAScope_DEVICE;
} else if (req->isSystemScope()) {
accessScope = HSAScope_SYSTEM;
} else {
fatal("Bad scope type");
}
}
return accessScope;
}
HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
HSASegment accessSegment = HSASegment_GLOBAL;
if (req->isGlobalSegment()) {
accessSegment = HSASegment_GLOBAL;
} else if (req->isGroupSegment()) {
accessSegment = HSASegment_GROUP;
} else if (req->isPrivateSegment()) {
accessSegment = HSASegment_PRIVATE;
} else if (req->isKernargSegment()) {
accessSegment = HSASegment_KERNARG;
} else if (req->isReadonlySegment()) {
accessSegment = HSASegment_READONLY;
} else if (req->isSpillSegment()) {
accessSegment = HSASegment_SPILL;
} else if (req->isArgSegment()) {
accessSegment = HSASegment_ARG;
} else {
fatal("Bad segment type");
}
return accessSegment;
}
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
@@ -152,6 +100,7 @@ UncoalescedTable::updateResources()
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
instMap.erase(iter++);
coalescer->getGMTokenPort().sendTokens(1);
} else {
@@ -160,15 +109,27 @@ UncoalescedTable::updateResources()
}
}
bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
// iterate the instructions held in UncoalescedTable to see whether there
// are more requests to issue; if yes, not yet done; otherwise, done
for (auto& inst : instMap) {
DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
,inst.first, inst.second.size());
if (inst.first == instSeqNum) { return false; }
}
return true;
}
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
ss << "UncoalescedTable contains " << instMap.size()
<< " address entries." << std::endl;
ss << "Listing pending packets from " << instMap.size() << " instructions";
for (auto& inst : instMap) {
ss << "Addr 0x" << std::hex << inst.first << std::dec
<< " with " << inst.second.size() << " packets"
<< std::endl;
ss << "\tAddr: " << printAddress(inst.first) << " with "
<< inst.second.size() << " pending packets" << std::endl;
}
}
@@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p)
assert(m_dataCache_ptr);
m_runningGarnetStandalone = p->garnet_standalone;
assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
@@ -254,18 +214,9 @@ GPUCoalescer::wakeup()
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
std::stringstream ss;
printRequestTable(ss);
ss << "Outstanding requests: " << m_outstanding_count
<< std::endl;
panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x coalescedTable: %d "
"current time: %u issue_time: %d difference: %d\n"
"Request Tables:\n %s", m_version,
req->getFirstPkt()->getAddr(),
coalescedTable.size(), cyclesToTicks(current_time),
cyclesToTicks(req->getIssueTime()),
cyclesToTicks(current_time - req->getIssueTime()),
ss.str());
warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
m_version, ss.str());
panic("Aborting due to deadlock!\n");
}
}
}
@@ -283,21 +234,27 @@ GPUCoalescer::wakeup()
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
uncoalescedTable.printRequestTable(ss);
ss << "Printing out " << coalescedTable.size()
<< " outstanding requests in the coalesced table\n";
ss << "CoalescedTable contains " << coalescedTable.size()
<< " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
ss << "Addr 0x" << std::hex << requestList.first << std::dec
<< ": type-";
for (auto& request : requestList.second) {
ss << RubyRequestType_to_string(request->getRubyType())
<< " pkts-" << request->getPackets().size()
<< " issued-" << request->getIssueTime() << " seqNum-"
<< request->getSeqNum() << "; ";
ss << "\tAddr: " << printAddress(requestList.first) << "\n"
<< "\tInstruction sequence number: "
<< request->getSeqNum() << "\n"
<< "\t\tType: "
<< RubyRequestType_to_string(request->getRubyType()) << "\n"
<< "\t\tNumber of associated packets: "
<< request->getPackets().size() << "\n"
<< "\t\tIssue time: "
<< request->getIssueTime() * clockPeriod() << "\n"
<< "\t\tDifference from current tick: "
<< (curCycle() - request->getIssueTime()) * clockPeriod();
}
ss << std::endl;
}
// print out packets waiting to be issued in uncoalesced table
uncoalescedTable.printRequestTable(ss);
}
void
@@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address,
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);
// remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();
@@ -398,6 +356,36 @@ GPUCoalescer::writeCallback(Addr address,
}
}
void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    // every write-complete ack must match a tracked write instruction
    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    // NOTE: fixed argument list — the first argument printed for
    // "instSeqNum" was previously reqsAllIssued (passed twice), so the
    // trace showed a bogus sequence number.
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write completion
        // callbacks for its issued Ruby requests, we can now respond to
        // the requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
@@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(request_address);
Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
RubyRequestType type = crequest->getRubyType();
@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
"%s\n",
RubyRequestType_to_string(type));
}
// If using the RubyTester, update the RubyTester sender state's
// subBlock with the recieved data. The tester will later access
// this state.
// Note: RubyPort will access it's sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>
(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
}
@@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
// Acquire and release packets will have been issued by
// makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}
@@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// Check for GPU Barrier Kernel End or Kernel Begin
// Leave these to be handled by the child class
// Kernel End/Barrier = isFlush + isRelease
// Kernel Begin = isFlush + isAcquire
if (pkt->req->isKernel()) {
if (pkt->req->isAcquire()){
// This is a Kernel Begin leave handling to
// virtual xCoalescer::makeRequest
return RequestStatus_Issued;
}else if (pkt->req->isRelease()) {
// This is a Kernel End leave handling to
// virtual xCoalescer::makeRequest
// If we are here then we didn't call
// a virtual version of this function
// so we will also schedule the callback
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
}
if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
!pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
(pkt->req->isRelease() || pkt->req->isAcquire())) {
if (assumingRfOCoherence) {
// If we reached here, this request must be a memFence
// and the protocol implements RfO, the coalescer can
// assume sequentially consistency and schedule the callback
// immediately.
// Currently the code implements fence callbacks
// by reusing the mechanism for kernel completions.
// This should be fixed.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
} else {
// If not RfO, return issued here and let the child coalescer
// take care of it.
return RequestStatus_Issued;
}
}
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
if (!issueEvent.scheduled())
schedule(issueEvent, curTick());
// TODO: issue hardware prefetches here
// we always return RequestStatus_Issued in this coalescer
// b/c the coalescer's resource was checked earlier and the coalescer is
// queueing up aliased requests in its coalesced table
return RequestStatus_Issued;
}
/**
* TODO: Figure out what do with this code. This code may go away
* and/or be merged into the VIPER coalescer once the VIPER
* protocol is re-integrated with GCN3 codes.
*/
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
@@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest)
}
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/
template <class KEY, class VALUE>
std::ostream &
@@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const
}
void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
DPRINTF(RubyStats, "Recorded statistic: %s\n",
SequencerRequestType_to_string(requestType));
}
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
// be counted as outstanding requests.
m_outstanding_count++;
// We track all issued or to-be-issued Ruby requests associated with
// write instructions. An instruction may have multiple Ruby
// requests.
if (pkt->cmd == MemCmd::WriteReq) {
DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
" the pending write instruction list\n", seqNum,
line_addr);
RubyPort::SenderState* ss =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
// we need to save this port because it will be used to call
// back the requesting CU when we receive write
// complete callbacks for all issued Ruby requests of this
// instruction.
RubyPort::MemSlavePort* mem_slave_port = ss->port;
GPUDynInstPtr gpuDynInst = nullptr;
if (!m_usingRubyTester) {
// If this coalescer is connected to a real CU, we need
// to save the corresponding gpu dynamic instruction.
// CU will use that instruction to decrement wait counters
// in the issuing wavefront.
// For Ruby tester, gpuDynInst == nullptr
ComputeUnit::DataPort::SenderState* cu_state =
safe_cast<ComputeUnit::DataPort::SenderState*>
(ss->predecessor);
gpuDynInst = cu_state->_gpuDynInst;
}
PendingWriteInst& inst = pendingWriteInsts[seqNum];
inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
}
return true;
}
@@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address,
}
}
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPLdHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPLdTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCLdHits++;
} else {
CP_LdMiss++;
}
}
void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPStHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPStTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCStHits++;
} else {
CP_StMiss++;
}
}
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
@@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
Cycles firstResponseTime,
bool success, bool isRegion)
{
RubyRequestType type = crequest->getRubyType();
Cycles issued_time = crequest->getIssueTime();
Cycles completion_time = curCycle();
assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;
// cache stats (valid for RfO protocol only)
if (mach == MachineType_TCP) {
if (type == RubyRequestType_LD) {
GPU_TCPLdHits++;
} else {
GPU_TCPStHits++;
}
} else if (mach == MachineType_L1Cache_wCC) {
if (type == RubyRequestType_LD) {
GPU_TCPLdTransfers++;
} else {
GPU_TCPStTransfers++;
}
} else if (mach == MachineType_TCC) {
if (type == RubyRequestType_LD) {
GPU_TCCLdHits++;
} else {
GPU_TCCStHits++;
}
} else {
if (type == RubyRequestType_LD) {
GPU_LdMiss++;
} else {
GPU_StMiss++;
}
}
// Profile all access latency, even zero latency accesses
m_latencyHist.sample(total_lat);
m_typeLatencyHist[type]->sample(total_lat);
// Profile the miss latency for all non-zero demand misses
if (total_lat != Cycles(0)) {
m_missLatencyHist.sample(total_lat);
m_missTypeLatencyHist[type]->sample(total_lat);
if (mach != MachineType_NUM) {
m_missMachLatencyHist[mach]->sample(total_lat);
m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
if ((issued_time <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completion_time)) {
m_IssueToInitialDelayHist[mach]->sample(
initialRequestTime - issued_time);
m_InitialToForwardDelayHist[mach]->sample(
forwardRequestTime - initialRequestTime);
m_ForwardToFirstResponseDelayHist[mach]->sample(
firstResponseTime - forwardRequestTime);
m_FirstResponseToCompletionDelayHist[mach]->sample(
completion_time - firstResponseTime);
}
}
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
curTick(), m_version, "Coal",
success ? "Done" : "SC_Failed", "", "",
printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
void
@@ -1085,74 +962,4 @@ GPUCoalescer::regStats()
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
// GPU cache stats
GPU_TCPLdHits
.name(name() + ".gpu_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
GPU_TCPLdTransfers
.name(name() + ".gpu_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
GPU_TCCLdHits
.name(name() + ".gpu_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
GPU_LdMiss
.name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU")
;
GPU_TCPStHits
.name(name() + ".gpu_tcp_st_hits")
.desc("stores that hit in the TCP")
;
GPU_TCPStTransfers
.name(name() + ".gpu_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
GPU_TCCStHits
.name(name() + ".gpu_tcc_st_hits")
.desc("stores that hit in the TCC")
;
GPU_StMiss
.name(name() + ".gpu_st_misses")
.desc("stores that miss in the GPU")
;
// CP cache stats
CP_TCPLdHits
.name(name() + ".cp_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
CP_TCPLdTransfers
.name(name() + ".cp_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
CP_TCCLdHits
.name(name() + ".cp_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
CP_LdMiss
.name(name() + ".cp_ld_misses")
.desc("loads that miss in the GPU")
;
CP_TCPStHits
.name(name() + ".cp_tcp_st_hits")
.desc("stores that hit in the TCP")
;
CP_TCPStTransfers
.name(name() + ".cp_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
CP_TCCStHits
.name(name() + ".cp_tcc_st_hits")
.desc("stores that hit in the TCC")
;
CP_StMiss
.name(name() + ".cp_st_misses")
.desc("stores that miss in the GPU")
;
}

View File

@@ -38,11 +38,11 @@
#include <unordered_map>
#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -57,9 +57,6 @@ class CacheMemory;
class RubyGPUCoalescerParams;
HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
@@ -78,6 +75,7 @@ class UncoalescedTable
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
bool areRequestsDone(const uint64_t instSeqNum);
// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
@@ -120,6 +118,86 @@ class CoalescedRequest
std::vector<PacketPtr> pkts;
};
// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete.
// Tracks the outstanding Ruby requests of one write instruction. Once
// every request has received its write-complete ack, the requester (CU
// or Ruby tester port) is notified with a single WriteCompleteResp.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    // Register one more outstanding Ruby request for this instruction.
    // Saves the responding port and, when connected to a real CU, the
    // dynamic instruction used later to decrement the wavefront's wait
    // counters.
    void
    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }

        numPendingStores++;
    }

    // return true if no more ack is expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        // direct boolean result instead of the redundant (x == 0) ? true
        // : false expression
        return numPendingStores == 0;
    }

    // ack the original requester that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            // attach the dynamic instruction so the CU can update its
            // wait counters when it receives this response
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requester
        originalPort->sendTimingResp(pkt);
    }

    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need
    // only 1 of the ports to call back the CU. Therefore, here we keep
    // track of the port that sent the first packet of this instruction.
    RubyPort::MemSlavePort* originalPort;
    // similar to the originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};
class GPUCoalescer : public RubyPort
{
public:
@@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort
void collateStats();
void regStats() override;
// each store request needs two callbacks:
// (1) writeCallback is called when the store is received and processed
// by TCP. This writeCallback does not guarantee the store is actually
// completed at its destination cache or memory. writeCallback helps
// release hardware resources (e.g., its entry in coalescedTable)
// allocated for the store so that subsequent requests will not be
// blocked unnecessarily due to hardware resource constraints.
// (2) writeCompleteCallback is called when the store is fully completed
// at its destination cache or memory. writeCompleteCallback
// guarantees that the store is fully completed. This callback
// will decrement hardware counters in CU
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
@@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime);
void writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach);
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
@@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
/* atomics need their own callback because the data
might be const coming from SLICC */
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
// Alternate implementations in VIPER Coalescer
virtual RequestStatus makeRequest(PacketPtr pkt) override;
RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
bool
@@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Alternate implementations in VIPER Coalescer
virtual void issueRequest(CoalescedRequest* crequest);
void kernelCallback(int wavfront_id);
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
@@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper issueEvent;
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
Cycles m_deadlock_threshold;
@@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort
// an address, they are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
// a map between an instruction sequence number and PendingWriteInst
// this is used to do a final call back for each write when it is
// completely done in the memory system
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
// m5 style stats for TCP hit/miss counts
Stats::Scalar GPU_TCPLdHits;
Stats::Scalar GPU_TCPLdTransfers;
Stats::Scalar GPU_TCCLdHits;
Stats::Scalar GPU_LdMiss;
Stats::Scalar GPU_TCPStHits;
Stats::Scalar GPU_TCPStTransfers;
Stats::Scalar GPU_TCCStHits;
Stats::Scalar GPU_StMiss;
Stats::Scalar CP_TCPLdHits;
Stats::Scalar CP_TCPLdTransfers;
Stats::Scalar CP_TCCLdHits;
Stats::Scalar CP_LdMiss;
Stats::Scalar CP_TCPStHits;
Stats::Scalar CP_TCPStTransfers;
Stats::Scalar CP_TCCStHits;
Stats::Scalar CP_StMiss;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// // m5 style stats for TCP hit/miss counts
// Stats::Scalar GPU_TCPLdHits;
// Stats::Scalar GPU_TCPLdTransfers;
// Stats::Scalar GPU_TCCLdHits;
// Stats::Scalar GPU_LdMiss;
//
// Stats::Scalar GPU_TCPStHits;
// Stats::Scalar GPU_TCPStTransfers;
// Stats::Scalar GPU_TCCStHits;
// Stats::Scalar GPU_StMiss;
//
// Stats::Scalar CP_TCPLdHits;
// Stats::Scalar CP_TCPLdTransfers;
// Stats::Scalar CP_TCCLdHits;
// Stats::Scalar CP_LdMiss;
//
// Stats::Scalar CP_TCPStHits;
// Stats::Scalar CP_TCPStTransfers;
// Stats::Scalar CP_TCCStHits;
// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// Stats::Distribution numHopDelays;
// Stats::Distribution tcpToTccDelay;
// Stats::Distribution tccToSdDelay;
// Stats::Distribution sdToSdDelay;
// Stats::Distribution sdToTccDelay;
// Stats::Distribution tccToTcpDelay;
//
// Stats::Average avgTcpToTcc;
// Stats::Average avgTccToSd;
// Stats::Average avgSdToSd;
// Stats::Average avgSdToTcc;
// Stats::Average avgTccToTcp;
private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data

View File

@@ -36,6 +36,7 @@ from m5.objects.Sequencer import *
class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"
@@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort):
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
"Ownership coherence");
icache = Param.RubyCache("")
dcache = Param.RubyCache("")

View File

@@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer
VIPERCoalescer(const Params *);
~VIPERCoalescer();
void issueMemSyncRequest(PacketPtr pkt);
void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);

View File

@@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer):
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
assume_rfo = False