mem-ruby: GCN3 and VIPER integration

This patch modifies the Coalescer and VIPER protocol to support the memory
synchronization requests and write-completion responses required by the
upcoming GCN3 implementation.

The VIPER protocol is simplified to a purely write-through protocol.
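
As a rough, standalone sketch (not gem5 code; all names below are hypothetical)
of the bookkeeping this change introduces on the coalescer side: pending stores
are keyed by their instruction sequence number, and each entry is released only
when the protocol reports the corresponding write complete.

// Minimal illustrative sketch of write-completion tracking, assuming a
// per-store sequence number as the key. Not the gem5 implementation.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

struct WritePacket { uint64_t addr; };

class WriteCompleteTracker {
  public:
    // Called when a coalesced write is issued to the cache system.
    void recordPendingWrite(uint64_t instSeqNum, WritePacket pkt) {
        pending[instSeqNum].push_back(pkt);
    }

    // Called when the protocol signals write completion for one line.
    void writeComplete(uint64_t instSeqNum, uint64_t lineAddr) {
        auto it = pending.find(instSeqNum);
        if (it == pending.end())
            return;
        auto &pkts = it->second;
        for (auto p = pkts.begin(); p != pkts.end();) {
            if (p->addr == lineAddr) {
                std::cout << "write complete: seq " << instSeqNum
                          << " addr 0x" << std::hex << p->addr
                          << std::dec << "\n";
                p = pkts.erase(p);   // respond to the requestor, drop entry
            } else {
                ++p;
            }
        }
        if (pkts.empty())
            pending.erase(it);
    }

  private:
    // instSeqNum -> pending write packets awaiting completion
    std::unordered_map<uint64_t, std::vector<WritePacket>> pending;
};

int main() {
    WriteCompleteTracker t;
    t.recordPendingWrite(42, {0x1000});
    t.writeComplete(42, 0x1000);
    return 0;
}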

Change-Id: Iccfa3d749a0301172a1cc567c59609bb548dace6
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29913
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Bradford Beckmann <brad.beckmann@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Bradford Beckmann <brad.beckmann@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Author: Tuan Ta
Date: 2018-05-04 12:14:13 -04:00
Committed-by: Anthony Gutierrez
Parent: 3ca404da17
Commit: 18ebe62598
12 changed files with 316 additions and 475 deletions


@@ -392,14 +392,15 @@ machine(MachineType:TCC, "TCC Cache")
action(w_sendResponseWBAck, "w", desc="send WB Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysWBAck;
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.WTRequestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
}
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysWBAck;
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.WTRequestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
@@ -412,6 +413,7 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Destination.add(in_msg.Requestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
@@ -486,6 +488,7 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Dirty := true;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}


@@ -56,9 +56,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
I, AccessPermission:Invalid, desc="Invalid";
V, AccessPermission:Read_Only, desc="Valid";
W, AccessPermission:Read_Write, desc="Written";
M, AccessPermission:Read_Write, desc="Written and Valid";
L, AccessPermission:Read_Write, desc="Local access is modifable";
A, AccessPermission:Invalid, desc="Waiting on Atomic";
}
@@ -67,7 +64,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
Load, desc="Load";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2(L1 is clean)";
StoreLocal, desc="Store to L1 but L1 is clean";
Atomic, desc="Atomic";
Flush, desc="Flush if dirty(wbL1 for Store Release)";
Evict, desc="Evict if clean(invL1 for Load Acquire)";
@@ -264,7 +260,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
// disable L1 cache
if (disableL1) {
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
@@ -291,18 +287,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ATOMIC) {
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
if(disableL1) {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
}
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
@@ -314,16 +308,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
}
} else {
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
}
}
}
}
@@ -415,6 +399,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
// forward inst sequence number to lower TCC
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
@@ -475,6 +464,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
}
}
action(ad_atomicDone, "ad", desc="atomic done") {
assert(is_valid(cache_entry));
coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
action(s_storeDone, "s", desc="local store done") {
assert(is_valid(cache_entry));
@@ -491,37 +485,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
assert(false);
} else {
coalescer.invCallback(address);
}
}
action(wb_wbDone, "wb", desc="local wb done") {
if (inFlush == true) {
Fcnt := Fcnt + 1;
if (Fcnt > WTcnt) {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n");
assert(false);
} else {
coalescer.wbCallback(address);
}
Fcnt := Fcnt - 1;
}
if (WTcnt == 0 && Fcnt == 0) {
inFlush := false;
APPEND_TRANSITION_COMMENT(" inFlush is false");
}
coalescer.invTCPCallback(address);
}
}
action(wd_wtDone, "wd", desc="writethrough done") {
WTcnt := WTcnt - 1;
if (inFlush == true) {
Fcnt := Fcnt -1;
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
assert(false);
} else {
peek(responseToTCP_in, ResponseMsg) {
coalescer.writeCompleteCallback(address, in_msg.instSeqNum);
}
}
assert(WTcnt >= 0);
APPEND_TRANSITION_COMMENT("write-- = ");
APPEND_TRANSITION_COMMENT(WTcnt);
}
action(dw_dirtyWrite, "dw", desc="update write mask"){
@@ -562,21 +538,21 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
// Stalling transitions do NOT check the tag array...and if they do,
// they can cause a resource stall deadlock!
transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} {
transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} {
z_stall;
}
transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} {
l_loadDone;
mru_updateMRU;
p_popMandatoryQueue;
}
transition(I, Load) {TagArrayRead} {
n_issueRdBlk;
p_popMandatoryQueue;
}
transition(V, Load) {TagArrayRead, DataArrayRead} {
l_loadDone;
mru_updateMRU;
p_popMandatoryQueue;
}
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
mru_updateMRU;
@@ -584,55 +560,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
p_popMandatoryQueue;
}
transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} {
wt_writeThrough;
t_allocateTBE;
at_atomicThrough;
ic_invCache;
}
transition(W, Load, I) {TagArrayRead, DataArrayRead} {
wt_writeThrough;
norl_issueRdBlkOrloadDone;
p_popMandatoryQueue;
}
transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
s_storeDone;
p_popMandatoryQueue;
}
transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
mru_updateMRU;
s_storeDone;
p_popMandatoryQueue;
}
transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
s_storeDone;
p_popMandatoryQueue;
}
transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
mru_updateMRU;
s_storeDone;
p_popMandatoryQueue;
}
transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
mru_updateMRU;
s_storeDone;
p_popMandatoryQueue;
}
//M,W should not see storeThrough
transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
@@ -642,7 +569,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
p_popMandatoryQueue;
}
transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
s_storeDone;
wt_writeThrough;
@@ -672,7 +599,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
d_deallocateTBE;
a_allocate;
w_writeCache;
s_storeDone;
ad_atomicDone;
pr_popResponseQueue;
ic_invCache;
}
@@ -683,12 +610,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
pr_popResponseQueue;
}
transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
w_writeCache;
l_loadDone;
pr_popResponseQueue;
}
transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
@@ -697,26 +618,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
ic_invCache;
}
transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
wt_writeThrough;
ic_invCache;
}
transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
wt_writeThrough;
ic_invCache;
}
transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
transition({V, I, A},Flush) {TagArrayFlash} {
sf_setFlush;
wt_writeThrough;
ic_invCache;
p_popMandatoryQueue;
}
transition({V, I, A, L},Flush) {TagArrayFlash} {
sf_setFlush;
wb_wbDone;
p_popMandatoryQueue;
}
@@ -726,20 +629,14 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
ic_invCache;
}
transition({W, M}, Evict, W) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
}
transition({A, L}, Evict) {TagArrayFlash} {
transition(A, Evict) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
}
// TCC_AckWB only snoops TBE
transition({V, I, A, M, W, L}, TCC_AckWB) {
transition({V, I, A}, TCC_AckWB) {
wd_wtDone;
wb_wbDone;
pr_popResponseQueue;
}
}


@@ -62,7 +62,8 @@ structure (VIPERCoalescer, external = "yes") {
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void atomicCallback(Addr, MachineType, DataBlock);
void invTCPCallback(Addr);
void writeCompleteCallback(Addr, uint64_t);
void evictionCallback(Addr);
}


@@ -514,6 +514,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}


@@ -136,6 +136,7 @@ structure(CPURequestMsg, desc="...", interface="Message") {
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
int wfid, default="0", desc="wavefront id";
uint64_t instSeqNum, desc="instruction sequence number";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
@@ -188,6 +189,7 @@ structure(TDProbeRequestMsg, desc="...", interface="Message") {
MessageSizeType MessageSize, desc="size category of the message";
int Phase, desc="Synchronization Phase";
int wfid, desc="wavefront id for Release";
uint64_t instSeqNum, desc="instruction sequence number";
MachineID Requestor, desc="Node who initiated the request";
bool functionalRead(Packet *pkt) {
@@ -242,6 +244,7 @@ structure(ResponseMsg, desc="...", interface="Message") {
bool NoAckNeeded, default="false", desc="For short circuting acks";
bool isValid, default="false", desc="Is acked block valid";
int wfid, default="0", desc="wavefront id";
uint64_t instSeqNum, desc="instruction sequence number";
int Phase, desc="Synchronization Phase";
int ProgramCounter, desc="PC that issues this request";
@@ -343,6 +346,7 @@ structure(FifoMsg, desc="...", interface="Message") {
Addr addr, desc="Address";
FifoType Type, desc="WriteThrough/WriteFlush";
int wfid, default="0",desc="wavefront id";
uint64_t instSeqNum, desc="instruction sequence number";
MachineID Requestor, desc="Flush Requestor";
MachineID oRequestor, desc="original Flush Requestor";


@@ -150,6 +150,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
uint64_t instSeqNum, desc="Instruction sequence number";
PacketPtr pkt, desc="Packet associated with this request";
}


@@ -56,6 +56,7 @@ class RubyRequest : public Message
WriteMask m_writeMask;
DataBlock m_WTData;
int m_wfid;
uint64_t m_instSeqNum;
RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
@@ -80,7 +81,8 @@ class RubyRequest : public Message
RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb,
unsigned _proc_id, unsigned _core_id,
int _wm_size, std::vector<bool> & _wm_mask,
DataBlock & _Data)
DataBlock & _Data,
uint64_t _instSeqNum = 0)
: Message(curTime),
m_PhysicalAddress(_paddr),
m_Type(_type),
@@ -93,7 +95,8 @@ class RubyRequest : public Message
m_contextId(_core_id),
m_writeMask(_wm_size,_wm_mask),
m_WTData(_Data),
m_wfid(_proc_id)
m_wfid(_proc_id),
m_instSeqNum(_instSeqNum)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
}
@@ -104,7 +107,8 @@ class RubyRequest : public Message
unsigned _proc_id, unsigned _core_id,
int _wm_size, std::vector<bool> & _wm_mask,
DataBlock & _Data,
std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps)
std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps,
uint64_t _instSeqNum = 0)
: Message(curTime),
m_PhysicalAddress(_paddr),
m_Type(_type),
@@ -117,7 +121,8 @@ class RubyRequest : public Message
m_contextId(_core_id),
m_writeMask(_wm_size,_wm_mask,_atomicOps),
m_WTData(_Data),
m_wfid(_proc_id)
m_wfid(_proc_id),
m_instSeqNum(_instSeqNum)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
}


@@ -506,8 +506,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
}
}
m_outstanding_count--;
assert(m_outstanding_count >= 0);
@@ -555,25 +553,24 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
// let the child coalescer handle MemSyncReq because this is
// cache coherence protocol specific
return RequestStatus_Issued;
}
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
// we always return RequestStatus_Issued in this coalescer
@@ -582,107 +579,6 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
return RequestStatus_Issued;
}
/**
* TODO: Figure out what do with this code. This code may go away
* and/or be merged into the VIPER coalescer once the VIPER
* protocol is re-integrated with GCN3 codes.
*/
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
PacketPtr pkt = crequest->getFirstPkt();
int proc_id = -1;
if (pkt != NULL && pkt->req->hasContextId()) {
proc_id = pkt->req->contextId();
}
// If valid, copy the pc to the ruby request
Addr pc = 0;
if (pkt->req->hasPC()) {
pc = pkt->req->getPC();
}
// At the moment setting scopes only counts
// for GPU spill space accesses
// which is pkt->req->isStack()
// this scope is REPLACE since it
// does not need to be flushed at the end
// of a kernel Private and local may need
// to be visible at the end of the kernel
HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
HSAScope accessScope = reqScopeToHSAScope(pkt->req);
Addr line_addr = makeLineAddress(pkt->getAddr());
// Creating WriteMask that records written bytes
// and atomic operations. This enables partial writes
// and partial reads of those writes
DataBlock dataBlock;
dataBlock.clear();
uint32_t blockSize = RubySystem::getBlockSizeBytes();
std::vector<bool> accessMask(blockSize,false);
std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
uint32_t tableSize = crequest->getPackets().size();
for (int i = 0; i < tableSize; i++) {
PacketPtr tmpPkt = crequest->getPackets()[i];
uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
uint32_t tmpSize = tmpPkt->getSize();
if (tmpPkt->isAtomicOp()) {
std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
tmpPkt->getAtomicOp());
atomicOps.push_back(tmpAtomicOp);
} else if (tmpPkt->isWrite()) {
dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
tmpOffset, tmpSize);
}
for (int j = 0; j < tmpSize; j++) {
accessMask[tmpOffset + j] = true;
}
}
std::shared_ptr<RubyRequest> msg;
if (pkt->isAtomicOp()) {
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, crequest->getRubyType(),
RubyAccessMode_Supervisor, pkt,
PrefetchBit_No, proc_id, 100,
blockSize, accessMask,
dataBlock, atomicOps,
accessScope, accessSegment);
} else {
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, crequest->getRubyType(),
RubyAccessMode_Supervisor, pkt,
PrefetchBit_No, proc_id, 100,
blockSize, accessMask,
dataBlock,
accessScope, accessSegment);
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
curTick(), m_version, "Coal", "Begin", "", "",
printAddress(msg->getPhysicalAddress()),
RubyRequestType_to_string(crequest->getRubyType()));
fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
"there should not be any I-Fetch requests in the GPU Coalescer");
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(crequest->getRubyType()));
assert(latency > 0);
if (!deadlockCheckEvent.scheduled()) {
schedule(deadlockCheckEvent,
m_deadlock_threshold * clockPeriod() +
curTick());
}
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
@@ -890,7 +786,13 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
assert(port != NULL);
pkt->senderState = ss->predecessor;
delete ss;
if (pkt->cmd != MemCmd::WriteReq) {
// for WriteReq, we keep the original senderState until
// writeCompleteCallback
delete ss;
}
port->hitCallback(pkt);
trySendRetries();
}


@@ -294,9 +294,11 @@ class GPUCoalescer : public RubyPort
Cycles firstResponseTime,
bool isRegion);
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
/* atomics need their own callback because the data
might be const coming from SLICC */
virtual void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
@@ -365,7 +367,7 @@ class GPUCoalescer : public RubyPort
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
// virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
void kernelCallback(int wavefront_id);


@@ -272,6 +272,10 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
RubySystem::getBlockSizeBytes());
}
// Save the port in the sender state object to be used later to
// route the response
pkt->pushSenderState(new SenderState(this));
// Submit the ruby request
RequestStatus requestStatus = ruby_port->makeRequest(pkt);
@@ -279,16 +283,16 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
// Otherwise, we need to tell the port to retry at a later point
// and return false.
if (requestStatus == RequestStatus_Issued) {
// Save the port in the sender state object to be used later to
// route the response
pkt->pushSenderState(new SenderState(this));
DPRINTF(RubyPort, "Request %s address %#x issued\n", pkt->cmdString(),
DPRINTF(RubyPort, "Request %s 0x%x issued\n", pkt->cmdString(),
pkt->getAddr());
return true;
}
if (pkt->cmd != MemCmd::MemFenceReq) {
// pop off sender state as this request failed to issue
SenderState *ss = safe_cast<SenderState *>(pkt->popSenderState());
delete ss;
if (pkt->cmd != MemCmd::MemSyncReq) {
DPRINTF(RubyPort,
"Request %s for address %#x did not issue because %s\n",
pkt->cmdString(), pkt->getAddr(),
@@ -558,7 +562,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt)
}
// turn packet around to go back to requester if response expected
if (needsResponse) {
if (needsResponse || pkt->isResponse()) {
DPRINTF(RubyPort, "Sending packet back over port\n");
// Send a response in the same cycle. There is no need to delay the
// response because the response latency is already incurred in the


@@ -44,6 +44,7 @@
#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
@@ -64,148 +65,228 @@ VIPERCoalescerParams::create()
}
VIPERCoalescer::VIPERCoalescer(const Params *p)
: GPUCoalescer(p)
: GPUCoalescer(p),
m_cache_inv_pkt(nullptr),
m_num_pending_invs(0)
{
m_max_wb_per_cycle=p->max_wb_per_cycle;
m_max_inv_per_cycle=p->max_inv_per_cycle;
m_outstanding_inv = 0;
m_outstanding_wb = 0;
}
VIPERCoalescer::~VIPERCoalescer()
{
}
void
VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
{
}
void
VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt)
{
}
// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
VIPERCoalescer::makeRequest(PacketPtr pkt)
{
if (m_outstanding_wb | m_outstanding_inv) {
DPRINTF(GPUCoalescer,
"There are %d Writebacks and %d Invalidatons\n",
m_outstanding_wb, m_outstanding_inv);
}
// Are we in the middle of a release
if ((m_outstanding_wb) > 0) {
if (pkt->req->isKernel()) {
// Everything is fine
// Barriers and Kernel End can coalesce
// If it is a Kernel Begin flush the cache
if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) {
invL1();
}
// VIPER only supports following memory request types
// MemSyncReq & Acquire: TCP cache invalidation
// ReadReq : cache read
// WriteReq : cache write
// AtomicOp : cache atomic
//
// VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
// does not specify an equivalent type of memory request.
// TODO: future patches should rename Acquire and Release
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->isAtomicOp());
if (pkt->req->isRelease()) {
insertKernel(pkt->req->contextId(), pkt);
}
return RequestStatus_Issued;
}
} else if (pkt->req->isKernel() && pkt->req->isRelease()) {
// Flush Dirty Data on Kernel End
// isKernel + isRelease
insertKernel(pkt->req->contextId(), pkt);
wbL1();
if (m_outstanding_wb == 0) {
for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
newKernelEnds.push_back(it->first);
}
completeIssue();
}
return RequestStatus_Issued;
if (pkt->req->isAcquire() && m_cache_inv_pkt) {
// In VIPER protocol, the coalescer is not able to handle two or
// more cache invalidation requests at a time. Cache invalidation
// requests must be serialized to ensure that all stale data in
// TCP are invalidated correctly. If there's already a pending
// cache invalidation request, we must retry this request later
return RequestStatus_Aliased;
}
GPUCoalescer::makeRequest(pkt);
if (pkt->req->isKernel() && pkt->req->isAcquire()) {
// Invalidate clean Data on Kernel Begin
// isKernel + isAcquire
invL1();
} else if (pkt->req->isAcquire() && pkt->req->isRelease()) {
// Deschedule the AtomicAcqRel and
// Flush and Invalidate the L1 cache
invwbL1();
if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
deschedule(issueEvent);
}
} else if (pkt->req->isRelease()) {
// Deschedule the StoreRel and
// Flush the L1 cache
wbL1();
if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
deschedule(issueEvent);
}
} else if (pkt->req->isAcquire()) {
// LoadAcq or AtomicAcq
// Invalidate the L1 cache
invL1();
}
// Request was successful
if (m_outstanding_wb == 0) {
if (!issueEvent.scheduled()) {
DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n");
schedule(issueEvent, curTick());
}
if (pkt->req->isAcquire()) {
// In VIPER protocol, a compute unit sends a MemSyncReq with Acquire
// flag to invalidate TCP. Upon receiving a request of this type,
// VIPERCoalescer starts a cache walk to invalidate all valid entries
// in TCP. The request is completed once all entries are invalidated.
assert(!m_cache_inv_pkt);
m_cache_inv_pkt = pkt;
invTCP();
}
return RequestStatus_Issued;
}
void
VIPERCoalescer::wbCallback(Addr addr)
VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
{
m_outstanding_wb--;
// if L1 Flush Complete
// attempt to schedule issueEvent
assert(((int) m_outstanding_wb) >= 0);
if (m_outstanding_wb == 0) {
for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
newKernelEnds.push_back(it->first);
}
completeIssue();
PacketPtr pkt = crequest->getFirstPkt();
int proc_id = -1;
if (pkt != NULL && pkt->req->hasContextId()) {
proc_id = pkt->req->contextId();
}
trySendRetries();
// If valid, copy the pc to the ruby request
Addr pc = 0;
if (pkt->req->hasPC()) {
pc = pkt->req->getPC();
}
Addr line_addr = makeLineAddress(pkt->getAddr());
// Creating WriteMask that records written bytes
// and atomic operations. This enables partial writes
// and partial reads of those writes
DataBlock dataBlock;
dataBlock.clear();
uint32_t blockSize = RubySystem::getBlockSizeBytes();
std::vector<bool> accessMask(blockSize,false);
std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
uint32_t tableSize = crequest->getPackets().size();
for (int i = 0; i < tableSize; i++) {
PacketPtr tmpPkt = crequest->getPackets()[i];
uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
uint32_t tmpSize = tmpPkt->getSize();
if (tmpPkt->isAtomicOp()) {
std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
tmpPkt->getAtomicOp());
atomicOps.push_back(tmpAtomicOp);
} else if (tmpPkt->isWrite()) {
dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
tmpOffset, tmpSize);
}
for (int j = 0; j < tmpSize; j++) {
accessMask[tmpOffset + j] = true;
}
}
std::shared_ptr<RubyRequest> msg;
if (pkt->isAtomicOp()) {
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, crequest->getRubyType(),
RubyAccessMode_Supervisor, pkt,
PrefetchBit_No, proc_id, 100,
blockSize, accessMask,
dataBlock, atomicOps, crequest->getSeqNum());
} else {
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, crequest->getRubyType(),
RubyAccessMode_Supervisor, pkt,
PrefetchBit_No, proc_id, 100,
blockSize, accessMask,
dataBlock, crequest->getSeqNum());
}
if (pkt->cmd == MemCmd::WriteReq) {
makeWriteCompletePkts(crequest);
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
curTick(), m_version, "Coal", "Begin", "", "",
printAddress(msg->getPhysicalAddress()),
RubyRequestType_to_string(crequest->getRubyType()));
fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
"there should not be any I-Fetch requests in the GPU Coalescer");
if (!deadlockCheckEvent.scheduled()) {
schedule(deadlockCheckEvent,
m_deadlock_threshold * clockPeriod() +
curTick());
}
assert(m_mandatory_q_ptr);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(crequest->getRubyType()));
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
void
VIPERCoalescer::invCallback(Addr addr)
VIPERCoalescer::makeWriteCompletePkts(CoalescedRequest* crequest)
{
m_outstanding_inv--;
// if L1 Flush Complete
// attempt to schedule issueEvent
// This probably won't happen, since
// we don't wait on cache invalidations
if (m_outstanding_wb == 0) {
for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
newKernelEnds.push_back(it->first);
}
completeIssue();
// In VIPER protocol, for each write request, down-stream caches
// return two responses: writeCallback and writeCompleteCallback.
// We need to prepare a writeCompletePkt for each write request so
// that when writeCompleteCallback is called, we can respond to the
// requesting wavefront right away.
// writeCompletePkt inherits request and senderState of the original
// write request packet so that we can find the original requestor
// later. This assumes that request and senderState are not deleted
// before writeCompleteCallback is called.
auto key = crequest->getSeqNum();
std::vector<PacketPtr>& req_pkts = crequest->getPackets();
for (auto pkt : req_pkts) {
DPRINTF(GPUCoalescer, "makeWriteCompletePkts: instSeqNum %d\n",
key);
assert(pkt->cmd == MemCmd::WriteReq);
PacketPtr writeCompletePkt = new Packet(pkt->req,
MemCmd::WriteCompleteResp);
writeCompletePkt->setAddr(pkt->getAddr());
writeCompletePkt->senderState = pkt->senderState;
m_writeCompletePktMap[key].push_back(writeCompletePkt);
}
}
void
VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
{
DPRINTF(GPUCoalescer, "writeCompleteCallback: instSeqNum %d addr 0x%x\n",
instSeqNum, addr);
auto key = instSeqNum;
assert(m_writeCompletePktMap.count(key) == 1 &&
!m_writeCompletePktMap[key].empty());
for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
RubyPort::SenderState *ss =
safe_cast<RubyPort::SenderState *>
(writeCompletePkt->senderState);
MemSlavePort *port = ss->port;
assert(port != NULL);
writeCompletePkt->senderState = ss->predecessor;
delete ss;
port->hitCallback(writeCompletePkt);
}
}
trySendRetries();
if (m_writeCompletePktMap[key].empty())
m_writeCompletePktMap.erase(key);
}
void
VIPERCoalescer::invTCPCallback(Addr addr)
{
assert(m_cache_inv_pkt && m_num_pending_invs > 0);
m_num_pending_invs--;
if (m_num_pending_invs == 0) {
std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
completeHitCallback(pkt_list);
m_cache_inv_pkt = nullptr;
}
}
/**
* Invalidate L1 cache (Acquire)
* Invalidate TCP (Acquire)
*/
void
VIPERCoalescer::invL1()
VIPERCoalescer::invTCP()
{
int size = m_dataCache_ptr->getNumBlocks();
DPRINTF(GPUCoalescer,
"There are %d Invalidations outstanding before Cache Walk\n",
m_outstanding_inv);
m_num_pending_invs);
// Walk the cache
for (int i = 0; i < size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
@@ -215,86 +296,14 @@ VIPERCoalescer::invL1()
clockEdge(), addr, (uint8_t*) 0, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(request_type));
assert(latency > 0);
m_controller->mandatoryQueueLatency(request_type));
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
m_outstanding_inv++;
m_num_pending_invs++;
}
DPRINTF(GPUCoalescer,
"There are %d Invalidatons outstanding after Cache Walk\n",
m_outstanding_inv);
}
/**
* Writeback L1 cache (Release)
*/
void
VIPERCoalescer::wbL1()
{
int size = m_dataCache_ptr->getNumBlocks();
DPRINTF(GPUCoalescer,
"There are %d Writebacks outstanding before Cache Walk\n",
m_outstanding_wb);
// Walk the cache
for (int i = 0; i < size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
// Write dirty data back
RubyRequestType request_type = RubyRequestType_FLUSH;
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
clockEdge(), addr, (uint8_t*) 0, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(request_type));
assert(latency > 0);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
m_outstanding_wb++;
}
DPRINTF(GPUCoalescer,
"There are %d Writebacks outstanding after Cache Walk\n",
m_outstanding_wb);
}
/**
* Invalidate and Writeback L1 cache (Acquire&Release)
*/
void
VIPERCoalescer::invwbL1()
{
int size = m_dataCache_ptr->getNumBlocks();
// Walk the cache
for (int i = 0; i < size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
// Evict Read-only data
RubyRequestType request_type = RubyRequestType_REPLACEMENT;
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
clockEdge(), addr, (uint8_t*) 0, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(request_type));
assert(latency > 0);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
m_outstanding_inv++;
}
// Walk the cache
for (int i = 0; i< size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
// Write dirty data back
RubyRequestType request_type = RubyRequestType_FLUSH;
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
clockEdge(), addr, (uint8_t*) 0, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(request_type));
assert(latency > 0);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
m_outstanding_wb++;
}
m_num_pending_invs);
}


@@ -57,19 +57,31 @@ class VIPERCoalescer : public GPUCoalescer
typedef VIPERCoalescerParams Params;
VIPERCoalescer(const Params *);
~VIPERCoalescer();
void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);
void writeCompleteCallback(Addr address, uint64_t instSeqNum);
void invTCPCallback(Addr address);
RequestStatus makeRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
private:
void invL1();
void wbL1();
void invwbL1();
uint64_t m_outstanding_inv;
uint64_t m_outstanding_wb;
uint64_t m_max_inv_per_cycle;
uint64_t m_max_wb_per_cycle;
void invTCP();
// make write-complete response packets from original write request packets
void makeWriteCompletePkts(CoalescedRequest* crequest);
// current cache invalidation packet
// nullptr if there is no active cache invalidation request
PacketPtr m_cache_inv_pkt;
// number of remaining cache lines to be invalidated in TCP
int m_num_pending_invs;
// a map of instruction sequence number and corresponding pending
// write-complete response packets. Each write-complete response
// corresponds to a pending store request that is waiting for
// writeCompleteCallback. We may have multiple pending store requests per
// wavefront at a time. Each time writeCompleteCallback is called, an entry
// with a corresponding seqNum is popped off from map and returned to
// compute unit.
std::unordered_map<uint64_t, std::vector<PacketPtr>> m_writeCompletePktMap;
};
#endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__