gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
@@ -86,6 +86,14 @@ MemCmd::commandInfo[] =
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
/* WriteCompleteResp - The WriteCompleteResp command is needed
 * because in the GPU memory model we use a WriteResp to indicate
 * that a write has reached the cache controller so we can free
 * resources at the coalescer. Later, when the write successfully
 * completes we send a WriteCompleteResp to the CU so its wait
 * counters can be updated. Wait counters in the CU are how memory
 * dependences are handled in the GPU ISA. */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },

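To make the two-phase handshake described in that comment concrete, here is a minimal sketch (hypothetical names, not gem5 code: RespKind, WaitCounter, and onResponse are illustrative) of how a receiver can treat WriteResp as a resource-release event and WriteCompleteResp as the memory-ordering event:

    #include <cassert>

    // Sketch of the two-phase write handshake, assuming one wait counter
    // per wavefront and a pool of coalescer entries.
    enum class RespKind { WriteResp, WriteCompleteResp };

    struct WaitCounter { int pendingWrites = 0; };  // stand-in for CU wait counters

    void onResponse(RespKind kind, WaitCounter &wc, int &freeCoalescerEntries)
    {
        if (kind == RespKind::WriteResp) {
            // phase 1: the write reached the cache controller; the coalescer
            // slot can be recycled, but the write is not globally complete yet
            ++freeCoalescerEntries;
        } else {
            // phase 2: the write is fully complete; only now is it safe to
            // release the memory dependence tracked via wait counters
            assert(wc.pendingWrites > 0);
            --wc.pendingWrites;
        }
    }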
@@ -83,6 +83,7 @@ class MemCmd
ReadRespWithInvalidate,
WriteReq,
WriteResp,
WriteCompleteResp,
WritebackDirty,
WritebackClean,
WriteClean, // writes dirty data below without evicting

@@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (in_msg.segment == HSASegment:SPILL) {
trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
} else if (WB) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);

src/mem/ruby/protocol/GPU_VIPER-msg.sm (new file, 68 lines)
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

structure (GPUCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void evictionCallback(Addr);
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}

structure (VIPERCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void evictionCallback(Addr);
}
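These `external = "yes"` declarations tell the SLICC compiler that the types are implemented in C++; each overload listed in the .sm file must line up with a method on the corresponding C++ class so the generated protocol code can call into the coalescer. A rough sketch of the C++ side (simplified stand-in types, not the full gem5 class):

    #include <cstdint>

    using Addr = std::uint64_t;   // stand-ins for the Ruby types
    struct DataBlock {};

    // Sketch of the C++ counterpart implied by the SLICC declarations above:
    // every declared overload needs a matching C++ method, or the generated
    // protocol code will not link.
    class VIPERCoalescerSketch
    {
      public:
        void readCallback(Addr addr, DataBlock &data) { /* deliver load data */ }
        void writeCallback(Addr addr, DataBlock &data) { /* release resources */ }
        void invCallback(Addr addr) { /* an invalidate finished */ }
        void wbCallback(Addr addr) { /* a writeback finished */ }
        void evictionCallback(Addr addr) { /* line evicted */ }
    };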
@@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc";
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";

@@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") {
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses this block";

@@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
// HSA scopes
enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
UNSPECIFIED, desc="Unspecified scope";
NOSCOPE, desc="Explicitly unscoped";
WAVEFRONT, desc="Wavefront scope";
WORKGROUP, desc="Workgroup scope";
DEVICE, desc="Device scope";
SYSTEM, desc="System scope";
}

// HSA segment types
enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
GLOBAL, desc="Global segment";
GROUP, desc="Group segment";
PRIVATE, desc="Private segment";
KERNARG, desc="Kernarg segment";
READONLY, desc="Readonly segment";
SPILL, desc="Spill segment";
ARG, desc="Arg segment";
}

// TesterStatus
enumeration(TesterStatus, desc="...") {

@@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") {
bool checkResourceAvailable(CacheResourceType, Addr);
}

structure (GPUCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void evictionCallback(Addr);
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}

structure (VIPERCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void evictionCallback(Addr);
}

structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
HSAScope scope, desc="HSA scope";
HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}

@@ -43,7 +43,6 @@
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"

@@ -35,8 +35,6 @@
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"

@@ -61,58 +61,6 @@

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
HSAScope accessScope = HSAScope_UNSPECIFIED;
if (req->isScoped()) {
if (req->isWavefrontScope()) {
accessScope = HSAScope_WAVEFRONT;
} else if (req->isWorkgroupScope()) {
accessScope = HSAScope_WORKGROUP;
} else if (req->isDeviceScope()) {
accessScope = HSAScope_DEVICE;
} else if (req->isSystemScope()) {
accessScope = HSAScope_SYSTEM;
} else {
fatal("Bad scope type");
}
}
return accessScope;
}

HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
HSASegment accessSegment = HSASegment_GLOBAL;

if (req->isGlobalSegment()) {
accessSegment = HSASegment_GLOBAL;
} else if (req->isGroupSegment()) {
accessSegment = HSASegment_GROUP;
} else if (req->isPrivateSegment()) {
accessSegment = HSASegment_PRIVATE;
} else if (req->isKernargSegment()) {
accessSegment = HSASegment_KERNARG;
} else if (req->isReadonlySegment()) {
accessSegment = HSASegment_READONLY;
} else if (req->isSpillSegment()) {
accessSegment = HSASegment_SPILL;
} else if (req->isArgSegment()) {
accessSegment = HSASegment_ARG;
} else {
fatal("Bad segment type");
}

return accessSegment;
}

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
@@ -152,6 +100,7 @@ UncoalescedTable::updateResources()
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
instMap.erase(iter++);
coalescer->getGMTokenPort().sendTokens(1);
} else {
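The sendTokens(1) call above returns one token to the GPU's global memory pipeline once an instruction's packet list has drained from the uncoalesced table; the token count bounds how many memory instructions the pipeline may have in flight at the coalescer. A minimal accounting sketch of this flow control (hypothetical TokenPool class; gem5's real mechanism is the GMTokenPort shown above):

    #include <cassert>

    // Token-based flow control: acquire before issuing an instruction's
    // packets, release when the coalescer drains them.
    class TokenPool
    {
        int available;
      public:
        explicit TokenPool(int n) : available(n) {}

        // the GM pipeline takes a token before sending an instruction
        bool tryAcquire()
        {
            if (available == 0) return false;
            --available;
            return true;
        }

        // the coalescer returns the token once the instruction's packet
        // list empties, as in updateResources() above
        void release() { ++available; assert(available > 0); }
    };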
@@ -160,15 +109,27 @@ UncoalescedTable::updateResources()
}
}

bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
// iterate the instructions held in UncoalescedTable to see whether there
// are more requests to issue; if yes, not yet done; otherwise, done
for (auto& inst : instMap) {
DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
inst.first, inst.second.size());
if (inst.first == instSeqNum) { return false; }
}

return true;
}

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
ss << "UncoalescedTable contains " << instMap.size()
<< " address entries." << std::endl;
ss << "Listing pending packets from " << instMap.size() << " instructions";

for (auto& inst : instMap) {
ss << "Addr 0x" << std::hex << inst.first << std::dec
<< " with " << inst.second.size() << " packets"
<< std::endl;
ss << "\tAddr: " << printAddress(inst.first) << " with "
<< inst.second.size() << " pending packets" << std::endl;
}
}

@@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p)
assert(m_dataCache_ptr);

m_runningGarnetStandalone = p->garnet_standalone;
assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
@@ -254,18 +214,9 @@ GPUCoalescer::wakeup()
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
std::stringstream ss;
printRequestTable(ss);
ss << "Outstanding requests: " << m_outstanding_count
<< std::endl;

panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x coalescedTable: %d "
"current time: %u issue_time: %d difference: %d\n"
"Request Tables:\n %s", m_version,
req->getFirstPkt()->getAddr(),
coalescedTable.size(), cyclesToTicks(current_time),
cyclesToTicks(req->getIssueTime()),
cyclesToTicks(current_time - req->getIssueTime()),
ss.str());
warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
m_version, ss.str());
panic("Aborting due to deadlock!\n");
}
}
}
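wakeup() here acts as a periodic watchdog: any outstanding request older than m_deadlock_threshold trips a panic with a dump of the request tables. A condensed sketch of the pattern (illustrative names, not gem5's types):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Watchdog sketch: walk outstanding requests and abort if any has
    // waited longer than a threshold, mirroring the check above.
    struct Pending { std::uint64_t issueTime; };

    void deadlockCheck(const std::vector<Pending> &outstanding,
                       std::uint64_t now, std::uint64_t threshold)
    {
        for (const auto &req : outstanding) {
            if (now - req.issueTime > threshold) {
                std::fprintf(stderr, "possible deadlock: waited %llu ticks\n",
                             static_cast<unsigned long long>(now - req.issueTime));
                std::abort();   // corresponds to panic() in the hunk above
            }
        }
    }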
@@ -283,21 +234,27 @@ GPUCoalescer::wakeup()
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
uncoalescedTable.printRequestTable(ss);
ss << "Printing out " << coalescedTable.size()
<< " outstanding requests in the coalesced table\n";

ss << "CoalescedTable contains " << coalescedTable.size()
<< " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
ss << "Addr 0x" << std::hex << requestList.first << std::dec
<< ": type-";
for (auto& request : requestList.second) {
ss << RubyRequestType_to_string(request->getRubyType())
<< " pkts-" << request->getPackets().size()
<< " issued-" << request->getIssueTime() << " seqNum-"
<< request->getSeqNum() << "; ";
ss << "\tAddr: " << printAddress(requestList.first) << "\n"
<< "\tInstruction sequence number: "
<< request->getSeqNum() << "\n"
<< "\t\tType: "
<< RubyRequestType_to_string(request->getRubyType()) << "\n"
<< "\t\tNumber of associated packets: "
<< request->getPackets().size() << "\n"
<< "\t\tIssue time: "
<< request->getIssueTime() * clockPeriod() << "\n"
<< "\t\tDifference from current tick: "
<< (curCycle() - request->getIssueTime()) * clockPeriod();
}
ss << std::endl;
}

// print out packets waiting to be issued in uncoalesced table
uncoalescedTable.printRequestTable(ss);
}

void
@@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address,
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);

// remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();

@@ -398,6 +356,36 @@ GPUCoalescer::writeCallback(Addr address,
}
}

void
GPUCoalescer::writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach)
{
DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
" instSeqNum = %d\n", address, instSeqNum);

assert(pendingWriteInsts.count(instSeqNum) == 1);
PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

// check the uncoalescedTable to see whether all requests for the inst
// have been issued or not
bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
"reqsAllIssued=%d\n", instSeqNum,
inst.getNumPendingStores()-1, reqsAllIssued);

if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
// if the pending write instruction has received all write completion
// callbacks for its issued Ruby requests, we can now respond to
// the requesting CU in one response packet.
inst.ackWriteCompletion(m_usingRubyTester);

DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
instSeqNum);
pendingWriteInsts.erase(instSeqNum);
}
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
@@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(request_address);
Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);

RubyRequestType type = crequest->getRubyType();

@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
"%s\n",
RubyRequestType_to_string(type));
}

// If using the RubyTester, update the RubyTester sender state's
// subBlock with the received data. The tester will later access
// this state.
// Note: RubyPort will access its sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>
(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
}

@@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
// Acquire and release packets will have been issued by
// makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}

@@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// Check for GPU Barrier Kernel End or Kernel Begin
// Leave these to be handled by the child class
// Kernel End/Barrier = isFlush + isRelease
// Kernel Begin = isFlush + isAcquire
if (pkt->req->isKernel()) {
if (pkt->req->isAcquire()){
// This is a Kernel Begin leave handling to
// virtual xCoalescer::makeRequest
return RequestStatus_Issued;
}else if (pkt->req->isRelease()) {
// This is a Kernel End leave handling to
// virtual xCoalescer::makeRequest
// If we are here then we didn't call
// a virtual version of this function
// so we will also schedule the callback
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());

if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());

// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());

// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
}

if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
!pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
(pkt->req->isRelease() || pkt->req->isAcquire())) {
if (assumingRfOCoherence) {
// If we reached here, this request must be a memFence
// and the protocol implements RfO, the coalescer can
// assume sequential consistency and schedule the callback
// immediately.
// Currently the code implements fence callbacks
// by reusing the mechanism for kernel completions.
// This should be fixed.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
} else {
// If not RfO, return issued here and let the child coalescer
// take care of it.
return RequestStatus_Issued;
}
}

uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());

if (!issueEvent.scheduled())
schedule(issueEvent, curTick());
// TODO: issue hardware prefetches here
// we always return RequestStatus_Issued in this coalescer
// b/c the coalescer's resource was checked earlier and the coalescer is
// queueing up aliased requests in its coalesced table
return RequestStatus_Issued;
}

/**
* TODO: Figure out what to do with this code. This code may go away
* and/or be merged into the VIPER coalescer once the VIPER
* protocol is re-integrated with GCN3 codes.
*/
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
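The rewritten makeRequest() boils down to a simple dispatch: every packet must carry an instruction sequence number, mem-sync requests bypass coalescing, and everything else parks in the uncoalesced table until the issue event runs. A condensed sketch of that contract (hypothetical Pkt/Cmd types standing in for PacketPtr and MemCmd):

    // Dispatch sketch mirroring the makeRequest() logic above.
    enum class Cmd { MemSyncReq, ReadReq, WriteReq };
    struct Pkt { Cmd cmd; bool hasInstSeqNum; };

    enum class Status { Issued };

    Status makeRequestSketch(Pkt &pkt)
    {
        // contract: every packet carries an instruction sequence number
        // (the real code asserts this)
        if (!pkt.hasInstSeqNum) { /* assert(...) in gem5 */ }

        if (pkt.cmd == Cmd::MemSyncReq) {
            // fast path: hand mem-sync straight to the protocol,
            // i.e. issueMemSyncRequest(pkt) above
        } else {
            // normal path: park the packet for coalescing this cycle or a
            // later one, i.e. uncoalescedTable.insertPacket(pkt) plus
            // scheduling issueEvent
        }
        // always Issued: the coalescer's resources were checked earlier
        return Status::Issued;
    }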
@@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest)
}

assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/

template <class KEY, class VALUE>
std::ostream &
@@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const
}


void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
DPRINTF(RubyStats, "Recorded statistic: %s\n",
SequencerRequestType_to_string(requestType));
}

bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
// be counted as outstanding requests.
m_outstanding_count++;

// We track all issued or to-be-issued Ruby requests associated with
// write instructions. An instruction may have multiple Ruby
// requests.
if (pkt->cmd == MemCmd::WriteReq) {
DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
" the pending write instruction list\n", seqNum,
line_addr);

RubyPort::SenderState* ss =
safe_cast<RubyPort::SenderState*>(pkt->senderState);

// we need to save this port because it will be used to call
// back the requesting CU when we receive write
// complete callbacks for all issued Ruby requests of this
// instruction.
RubyPort::MemSlavePort* mem_slave_port = ss->port;

GPUDynInstPtr gpuDynInst = nullptr;

if (!m_usingRubyTester) {
// If this coalescer is connected to a real CU, we need
// to save the corresponding gpu dynamic instruction.
// CU will use that instruction to decrement wait counters
// in the issuing wavefront.
// For Ruby tester, gpuDynInst == nullptr
ComputeUnit::DataPort::SenderState* cu_state =
safe_cast<ComputeUnit::DataPort::SenderState*>
(ss->predecessor);
gpuDynInst = cu_state->_gpuDynInst;
}

PendingWriteInst& inst = pendingWriteInsts[seqNum];
inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
}

return true;
}

@@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address,
}
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPLdHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPLdTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCLdHits++;
} else {
CP_LdMiss++;
}
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPStHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPStTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCStHits++;
} else {
CP_StMiss++;
}
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
@@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
Cycles firstResponseTime,
bool success, bool isRegion)
{
RubyRequestType type = crequest->getRubyType();
Cycles issued_time = crequest->getIssueTime();
Cycles completion_time = curCycle();
assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;

// cache stats (valid for RfO protocol only)
if (mach == MachineType_TCP) {
if (type == RubyRequestType_LD) {
GPU_TCPLdHits++;
} else {
GPU_TCPStHits++;
}
} else if (mach == MachineType_L1Cache_wCC) {
if (type == RubyRequestType_LD) {
GPU_TCPLdTransfers++;
} else {
GPU_TCPStTransfers++;
}
} else if (mach == MachineType_TCC) {
if (type == RubyRequestType_LD) {
GPU_TCCLdHits++;
} else {
GPU_TCCStHits++;
}
} else {
if (type == RubyRequestType_LD) {
GPU_LdMiss++;
} else {
GPU_StMiss++;
}
}

// Profile all access latency, even zero latency accesses
m_latencyHist.sample(total_lat);
m_typeLatencyHist[type]->sample(total_lat);

// Profile the miss latency for all non-zero demand misses
if (total_lat != Cycles(0)) {
m_missLatencyHist.sample(total_lat);
m_missTypeLatencyHist[type]->sample(total_lat);

if (mach != MachineType_NUM) {
m_missMachLatencyHist[mach]->sample(total_lat);
m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

if ((issued_time <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completion_time)) {

m_IssueToInitialDelayHist[mach]->sample(
initialRequestTime - issued_time);
m_InitialToForwardDelayHist[mach]->sample(
forwardRequestTime - initialRequestTime);
m_ForwardToFirstResponseDelayHist[mach]->sample(
firstResponseTime - forwardRequestTime);
m_FirstResponseToCompletionDelayHist[mach]->sample(
completion_time - firstResponseTime);
}
}

}

DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
curTick(), m_version, "Coal",
success ? "Done" : "SC_Failed", "", "",
printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}

void
@@ -1085,74 +962,4 @@ GPUCoalescer::regStats()
m_missTypeMachLatencyHist[i][j]->init(10);
}
}

// GPU cache stats
GPU_TCPLdHits
.name(name() + ".gpu_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
GPU_TCPLdTransfers
.name(name() + ".gpu_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
GPU_TCCLdHits
.name(name() + ".gpu_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
GPU_LdMiss
.name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU")
;

GPU_TCPStHits
.name(name() + ".gpu_tcp_st_hits")
.desc("stores that hit in the TCP")
;
GPU_TCPStTransfers
.name(name() + ".gpu_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
GPU_TCCStHits
.name(name() + ".gpu_tcc_st_hits")
.desc("stores that hit in the TCC")
;
GPU_StMiss
.name(name() + ".gpu_st_misses")
.desc("stores that miss in the GPU")
;

// CP cache stats
CP_TCPLdHits
.name(name() + ".cp_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
CP_TCPLdTransfers
.name(name() + ".cp_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
CP_TCCLdHits
.name(name() + ".cp_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
CP_LdMiss
.name(name() + ".cp_ld_misses")
.desc("loads that miss in the GPU")
;

CP_TCPStHits
.name(name() + ".cp_tcp_st_hits")
.desc("stores that hit in the TCP")
;
CP_TCPStTransfers
.name(name() + ".cp_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
CP_TCCStHits
.name(name() + ".cp_tcc_st_hits")
.desc("stores that hit in the TCC")
;
CP_StMiss
.name(name() + ".cp_st_misses")
.desc("stores that miss in the GPU")
;
}

@@ -38,11 +38,11 @@
#include <unordered_map>

#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -57,9 +57,6 @@ class CacheMemory;

class RubyGPUCoalescerParams;

HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);

// List of packets that belong to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;

@@ -78,6 +75,7 @@ class UncoalescedTable
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
bool areRequestsDone(const uint64_t instSeqNum);

// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
@@ -120,6 +118,86 @@ class CoalescedRequest
std::vector<PacketPtr> pkts;
};

// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete.
class PendingWriteInst
{
public:
PendingWriteInst()
: numPendingStores(0),
originalPort(nullptr),
gpuDynInstPtr(nullptr)
{}

~PendingWriteInst()
{}

void
addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
bool usingRubyTester)
{
assert(port);
originalPort = port;

if (!usingRubyTester) {
gpuDynInstPtr = inst;
}

numPendingStores++;
}

// return true if no more acks are expected
bool
receiveWriteCompleteAck()
{
assert(numPendingStores > 0);
numPendingStores--;
return (numPendingStores == 0) ? true : false;
}

// ack the original requester that this write instruction is complete
void
ackWriteCompletion(bool usingRubyTester)
{
assert(numPendingStores == 0);

// make a response packet
PacketPtr pkt = new Packet(std::make_shared<Request>(),
MemCmd::WriteCompleteResp);

if (!usingRubyTester) {
assert(gpuDynInstPtr);
ComputeUnit::DataPort::SenderState* ss =
new ComputeUnit::DataPort::SenderState
(gpuDynInstPtr, 0, nullptr);
pkt->senderState = ss;
}

// send the ack response to the requester
originalPort->sendTimingResp(pkt);
}

int
getNumPendingStores() {
return numPendingStores;
}

private:
// the number of stores waiting for writeCompleteCallback
int numPendingStores;
// The original port that sent one of the packets associated with this
// write instruction. We may have more than one packet per instruction,
// which implies multiple ports per instruction. However, we need
// only one of the ports to call back the CU. Therefore, here we keep
// track of the port that sent the first packet of this instruction.
RubyPort::MemSlavePort* originalPort;
// similar to the originalPort, this gpuDynInstPtr is set only for
// the first packet of this instruction.
GPUDynInstPtr gpuDynInstPtr;
};

class GPUCoalescer : public RubyPort
{
public:
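A toy model of the PendingWriteInst bookkeeping above makes the lifecycle easy to see: the counter is bumped once per issued Ruby request and drained once per write-complete ack, and the requester is acked only when it reaches zero (a self-contained sketch, not the gem5 class):

    #include <cassert>

    // One counter per write instruction: addPendingReq() per issued Ruby
    // request, receiveWriteCompleteAck() per completion callback.
    struct PendingWrites
    {
        int numPendingStores = 0;
        void addPendingReq() { ++numPendingStores; }
        bool receiveWriteCompleteAck()
        {
            assert(numPendingStores > 0);
            return --numPendingStores == 0;
        }
    };

    int main()
    {
        PendingWrites inst;
        inst.addPendingReq();   // a store split into two Ruby requests
        inst.addPendingReq();
        bool done = inst.receiveWriteCompleteAck();   // first ack: not done
        done = inst.receiveWriteCompleteAck();        // second ack: done
        // here the real class would call ackWriteCompletion() to send a
        // WriteCompleteResp back through the saved port
        return done ? 0 : 1;
    }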
@@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort
void collateStats();
void regStats() override;

// each store request needs two callbacks:
// (1) writeCallback is called when the store is received and processed
// by TCP. This writeCallback does not guarantee the store is actually
// completed at its destination cache or memory. writeCallback helps
// release hardware resources (e.g., its entry in coalescedTable)
// allocated for the store so that subsequent requests will not be
// blocked unnecessarily due to hardware resource constraints.
// (2) writeCompleteCallback is called when the store is fully completed
// at its destination cache or memory. writeCompleteCallback
// guarantees that the store is fully completed. This callback
// will decrement hardware counters in the CU.
void writeCallback(Addr address, DataBlock& data);

void writeCallback(Addr address,
@@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime);

void writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach);

void readCallback(Addr address, DataBlock& data);

void readCallback(Addr address,
@@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
/* atomics need their own callback because the data
might be const coming from SLICC */

void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);

void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);

// Alternate implementations in VIPER Coalescer
virtual RequestStatus makeRequest(PacketPtr pkt) override;

RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }

bool
@@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort

GMTokenPort& getGMTokenPort() { return gmTokenPort; }

void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }

// Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Alternate implementations in VIPER Coalescer
virtual void issueRequest(CoalescedRequest* crequest);

void kernelCallback(int wavfront_id);
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;

void kernelCallback(int wavefront_id);

void hitCallback(CoalescedRequest* crequest,
MachineType mach,
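With the two issue functions now pure virtual, every concrete coalescer (the VIPERCoalescer further below, for instance) must supply both protocol-specific hooks. A minimal sketch of that inheritance contract (simplified stand-in types, not gem5's):

    // Sketch of the contract set up above: a concrete coalescer implements
    // both protocol-specific issue hooks.
    struct CoalescedRequest;
    using PacketPtr = void*;   // stand-in for gem5's PacketPtr

    class GPUCoalescerIface
    {
      public:
        virtual ~GPUCoalescerIface() = default;
      protected:
        virtual void issueRequest(CoalescedRequest *crequest) = 0;
        virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
    };

    class ViperLikeCoalescer : public GPUCoalescerIface
    {
      protected:
        // build and enqueue the protocol message for a coalesced LD/ST
        void issueRequest(CoalescedRequest *crequest) override { /* ... */ }
        // handle acquire/release-style memory sync packets
        void issueMemSyncRequest(PacketPtr pkt) override { /* ... */ }
    };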
@@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);


virtual RubyRequestType getRequestType(PacketPtr pkt);

// Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort

EventFunctionWrapper issueEvent;


// Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
Cycles m_deadlock_threshold;
@@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort
// an address, they are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;

// a map between an instruction sequence number and PendingWriteInst;
// this is used to do a final callback for each write when it is
// completely done in the memory system
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;

// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;

// m5 style stats for TCP hit/miss counts
Stats::Scalar GPU_TCPLdHits;
Stats::Scalar GPU_TCPLdTransfers;
Stats::Scalar GPU_TCCLdHits;
Stats::Scalar GPU_LdMiss;

Stats::Scalar GPU_TCPStHits;
Stats::Scalar GPU_TCPStTransfers;
Stats::Scalar GPU_TCCStHits;
Stats::Scalar GPU_StMiss;

Stats::Scalar CP_TCPLdHits;
Stats::Scalar CP_TCPLdTransfers;
Stats::Scalar CP_TCCLdHits;
Stats::Scalar CP_LdMiss;

Stats::Scalar CP_TCPStHits;
Stats::Scalar CP_TCPStTransfers;
Stats::Scalar CP_TCCStHits;
Stats::Scalar CP_StMiss;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// // m5 style stats for TCP hit/miss counts
// Stats::Scalar GPU_TCPLdHits;
// Stats::Scalar GPU_TCPLdTransfers;
// Stats::Scalar GPU_TCCLdHits;
// Stats::Scalar GPU_LdMiss;
//
// Stats::Scalar GPU_TCPStHits;
// Stats::Scalar GPU_TCPStTransfers;
// Stats::Scalar GPU_TCCStHits;
// Stats::Scalar GPU_StMiss;
//
// Stats::Scalar CP_TCPLdHits;
// Stats::Scalar CP_TCPLdTransfers;
// Stats::Scalar CP_TCCLdHits;
// Stats::Scalar CP_LdMiss;
//
// Stats::Scalar CP_TCPStHits;
// Stats::Scalar CP_TCPStTransfers;
// Stats::Scalar CP_TCCStHits;
// Stats::Scalar CP_StMiss;

//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;

// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// Stats::Distribution numHopDelays;
// Stats::Distribution tcpToTccDelay;
// Stats::Distribution tccToSdDelay;
// Stats::Distribution sdToSdDelay;
// Stats::Distribution sdToTccDelay;
// Stats::Distribution tccToTcpDelay;
//
// Stats::Average avgTcpToTcc;
// Stats::Average avgTccToSd;
// Stats::Average avgSdToSd;
// Stats::Average avgSdToTcc;
// Stats::Average avgTccToTcp;

private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data

@@ -36,6 +36,7 @@ from m5.objects.Sequencer import *

class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"

@@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort):
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
assume_rfo = Param.Bool(True, "assume protocol implements Read for "
"Ownership coherence");

icache = Param.RubyCache("")
dcache = Param.RubyCache("")

@@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer
VIPERCoalescer(const Params *);
~VIPERCoalescer();

void issueMemSyncRequest(PacketPtr pkt);
void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);

@@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer):
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
assume_rfo = False