mem-ruby: GCN3 and VIPER integration
This patch modifies the Coalescer and the VIPER protocol to support memory synchronization requests and write-completion responses, both of which are required by the upcoming GCN3 implementation. The VIPER protocol is simplified to be solely a write-through protocol.

Change-Id: Iccfa3d749a0301172a1cc567c59609bb548dace6
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29913
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Bradford Beckmann <brad.beckmann@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Bradford Beckmann <brad.beckmann@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Anthony Gutierrez
parent
3ca404da17
commit
18ebe62598
@@ -392,14 +392,15 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
|
||||
action(w_sendResponseWBAck, "w", desc="send WB Ack") {
|
||||
peek(responseFromNB_in, ResponseMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysWBAck;
|
||||
out_msg.Destination.clear();
|
||||
out_msg.Destination.add(in_msg.WTRequestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
|
||||
}
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysWBAck;
|
||||
out_msg.Destination.clear();
|
||||
out_msg.Destination.add(in_msg.WTRequestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
|
||||
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -412,6 +413,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
out_msg.Destination.add(in_msg.Requestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
|
||||
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -486,6 +488,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
out_msg.Dirty := true;
|
||||
out_msg.DataBlk := in_msg.DataBlk;
|
||||
out_msg.writeMask.orMask(in_msg.writeMask);
|
||||
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,9 +56,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
|
||||
I, AccessPermission:Invalid, desc="Invalid";
|
||||
V, AccessPermission:Read_Only, desc="Valid";
|
||||
W, AccessPermission:Read_Write, desc="Written";
|
||||
M, AccessPermission:Read_Write, desc="Written and Valid";
|
||||
L, AccessPermission:Read_Write, desc="Local access is modifable";
|
||||
A, AccessPermission:Invalid, desc="Waiting on Atomic";
|
||||
}
|
||||
|
||||
@@ -67,7 +64,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
Load, desc="Load";
|
||||
Store, desc="Store to L1 (L1 is dirty)";
|
||||
StoreThrough, desc="Store directly to L2(L1 is clean)";
|
||||
StoreLocal, desc="Store to L1 but L1 is clean";
|
||||
Atomic, desc="Atomic";
|
||||
Flush, desc="Flush if dirty(wbL1 for Store Release)";
|
||||
Evict, desc="Evict if clean(invL1 for Load Acquire)";
|
||||
@@ -264,7 +260,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
|
||||
// disable L1 cache
|
||||
if (disableL1) {
|
||||
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
|
||||
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
|
||||
trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
|
||||
@@ -291,18 +287,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
DPRINTF(RubySlicc, "%s\n", in_msg);
|
||||
if (in_msg.Type == RubyRequestType:LD) {
|
||||
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else if (in_msg.Type == RubyRequestType:ATOMIC) {
|
||||
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
|
||||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
|
||||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
|
||||
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else if (in_msg.Type == RubyRequestType:ST) {
|
||||
if(disableL1) {
|
||||
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
|
||||
if (WB) {
|
||||
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
|
||||
}
|
||||
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
|
||||
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
|
||||
@@ -314,16 +308,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
error("Unexpected Request Message from VIC");
|
||||
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
|
||||
if (WB) {
|
||||
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
|
||||
}
|
||||
} else {
|
||||
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
|
||||
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -415,6 +399,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
out_msg.Type := CoherenceRequestType:WriteThrough;
|
||||
out_msg.InitialRequestTime := curCycle();
|
||||
out_msg.Shared := false;
|
||||
|
||||
// forward inst sequence number to lower TCC
|
||||
peek(mandatoryQueue_in, RubyRequest) {
|
||||
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -475,6 +464,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
}
|
||||
}
|
||||
|
||||
action(ad_atomicDone, "ad", desc="atomic done") {
|
||||
assert(is_valid(cache_entry));
|
||||
coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
|
||||
}
|
||||
|
||||
action(s_storeDone, "s", desc="local store done") {
|
||||
assert(is_valid(cache_entry));
|
||||
|
||||
@@ -491,37 +485,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
|
||||
assert(false);
|
||||
} else {
|
||||
coalescer.invCallback(address);
|
||||
}
|
||||
}
|
||||
|
||||
action(wb_wbDone, "wb", desc="local wb done") {
|
||||
if (inFlush == true) {
|
||||
Fcnt := Fcnt + 1;
|
||||
if (Fcnt > WTcnt) {
|
||||
if (use_seq_not_coal) {
|
||||
DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n");
|
||||
assert(false);
|
||||
} else {
|
||||
coalescer.wbCallback(address);
|
||||
}
|
||||
Fcnt := Fcnt - 1;
|
||||
}
|
||||
if (WTcnt == 0 && Fcnt == 0) {
|
||||
inFlush := false;
|
||||
APPEND_TRANSITION_COMMENT(" inFlush is false");
|
||||
}
|
||||
coalescer.invTCPCallback(address);
|
||||
}
|
||||
}
|
||||
|
||||
action(wd_wtDone, "wd", desc="writethrough done") {
|
||||
WTcnt := WTcnt - 1;
|
||||
if (inFlush == true) {
|
||||
Fcnt := Fcnt -1;
|
||||
if (use_seq_not_coal) {
|
||||
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
|
||||
assert(false);
|
||||
} else {
|
||||
peek(responseToTCP_in, ResponseMsg) {
|
||||
coalescer.writeCompleteCallback(address, in_msg.instSeqNum);
|
||||
}
|
||||
}
|
||||
assert(WTcnt >= 0);
|
||||
APPEND_TRANSITION_COMMENT("write-- = ");
|
||||
APPEND_TRANSITION_COMMENT(WTcnt);
|
||||
}
|
||||
|
||||
action(dw_dirtyWrite, "dw", desc="update write mask"){
|
||||
@@ -562,21 +538,21 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
// Stalling transitions do NOT check the tag array...and if they do,
|
||||
// they can cause a resource stall deadlock!
|
||||
|
||||
transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} {
|
||||
transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} {
|
||||
z_stall;
|
||||
}
|
||||
|
||||
transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} {
|
||||
l_loadDone;
|
||||
mru_updateMRU;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(I, Load) {TagArrayRead} {
|
||||
n_issueRdBlk;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(V, Load) {TagArrayRead, DataArrayRead} {
|
||||
l_loadDone;
|
||||
mru_updateMRU;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
|
||||
t_allocateTBE;
|
||||
mru_updateMRU;
|
||||
@@ -584,55 +560,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} {
|
||||
wt_writeThrough;
|
||||
t_allocateTBE;
|
||||
at_atomicThrough;
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition(W, Load, I) {TagArrayRead, DataArrayRead} {
|
||||
wt_writeThrough;
|
||||
norl_issueRdBlkOrloadDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
a_allocate;
|
||||
dw_dirtyWrite;
|
||||
s_storeDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
dw_dirtyWrite;
|
||||
mru_updateMRU;
|
||||
s_storeDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
a_allocate;
|
||||
dw_dirtyWrite;
|
||||
s_storeDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
dw_dirtyWrite;
|
||||
mru_updateMRU;
|
||||
s_storeDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
dw_dirtyWrite;
|
||||
mru_updateMRU;
|
||||
s_storeDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
//M,W should not see storeThrough
|
||||
transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
a_allocate;
|
||||
dw_dirtyWrite;
|
||||
@@ -642,7 +569,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
dw_dirtyWrite;
|
||||
s_storeDone;
|
||||
wt_writeThrough;
|
||||
@@ -672,7 +599,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
d_deallocateTBE;
|
||||
a_allocate;
|
||||
w_writeCache;
|
||||
s_storeDone;
|
||||
ad_atomicDone;
|
||||
pr_popResponseQueue;
|
||||
ic_invCache;
|
||||
}
|
||||
@@ -683,12 +610,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
|
||||
w_writeCache;
|
||||
l_loadDone;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
|
||||
ic_invCache;
|
||||
}
|
||||
@@ -697,26 +618,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
|
||||
wt_writeThrough;
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
|
||||
wt_writeThrough;
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
|
||||
transition({V, I, A},Flush) {TagArrayFlash} {
|
||||
sf_setFlush;
|
||||
wt_writeThrough;
|
||||
ic_invCache;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({V, I, A, L},Flush) {TagArrayFlash} {
|
||||
sf_setFlush;
|
||||
wb_wbDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
@@ -726,20 +629,14 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition({W, M}, Evict, W) {TagArrayFlash} {
|
||||
inv_invDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition({A, L}, Evict) {TagArrayFlash} {
|
||||
transition(A, Evict) {TagArrayFlash} {
|
||||
inv_invDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
// TCC_AckWB only snoops TBE
|
||||
transition({V, I, A, M, W, L}, TCC_AckWB) {
|
||||
transition({V, I, A}, TCC_AckWB) {
|
||||
wd_wtDone;
|
||||
wb_wbDone;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,7 +62,8 @@ structure (VIPERCoalescer, external = "yes") {
|
||||
Cycles, Cycles, Cycles);
|
||||
void writeCallback(Addr, MachineType, DataBlock,
|
||||
Cycles, Cycles, Cycles, bool);
|
||||
void invCallback(Addr);
|
||||
void wbCallback(Addr);
|
||||
void atomicCallback(Addr, MachineType, DataBlock);
|
||||
void invTCPCallback(Addr);
|
||||
void writeCompleteCallback(Addr, uint64_t);
|
||||
void evictionCallback(Addr);
|
||||
}
|
||||
|
||||
@@ -514,6 +514,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
|
||||
out_msg.ForwardRequestTime := curCycle();
|
||||
out_msg.ProbeRequestStartTime := curCycle();
|
||||
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,6 +136,7 @@ structure(CPURequestMsg, desc="...", interface="Message") {
|
||||
WriteMask writeMask, desc="Write Through Data";
|
||||
MachineID WTRequestor, desc="Node who initiated the write through";
|
||||
int wfid, default="0", desc="wavefront id";
|
||||
uint64_t instSeqNum, desc="instruction sequence number";
|
||||
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
|
||||
int ProgramCounter, desc="PC that accesses to this block";
|
||||
|
||||
@@ -188,6 +189,7 @@ structure(TDProbeRequestMsg, desc="...", interface="Message") {
|
||||
MessageSizeType MessageSize, desc="size category of the message";
|
||||
int Phase, desc="Synchronization Phase";
|
||||
int wfid, desc="wavefront id for Release";
|
||||
uint64_t instSeqNum, desc="instruction sequence number";
|
||||
MachineID Requestor, desc="Node who initiated the request";
|
||||
|
||||
bool functionalRead(Packet *pkt) {
|
||||
@@ -242,6 +244,7 @@ structure(ResponseMsg, desc="...", interface="Message") {
|
||||
bool NoAckNeeded, default="false", desc="For short circuting acks";
|
||||
bool isValid, default="false", desc="Is acked block valid";
|
||||
int wfid, default="0", desc="wavefront id";
|
||||
uint64_t instSeqNum, desc="instruction sequence number";
|
||||
int Phase, desc="Synchronization Phase";
|
||||
|
||||
int ProgramCounter, desc="PC that issues this request";
|
||||
@@ -343,6 +346,7 @@ structure(FifoMsg, desc="...", interface="Message") {
|
||||
Addr addr, desc="Address";
|
||||
FifoType Type, desc="WriteThrough/WriteFlush";
|
||||
int wfid, default="0",desc="wavefront id";
|
||||
uint64_t instSeqNum, desc="instruction sequence number";
|
||||
MachineID Requestor, desc="Flush Requestor";
|
||||
MachineID oRequestor, desc="original Flush Requestor";
|
||||
|
||||
|
||||
@@ -150,6 +150,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
|
||||
WriteMask writeMask, desc="Writethrough mask";
|
||||
DataBlock WTData, desc="Writethrough data block";
|
||||
int wfid, desc="Writethrough wavefront";
|
||||
uint64_t instSeqNum, desc="Instruction sequence number";
|
||||
PacketPtr pkt, desc="Packet associated with this request";
|
||||
}
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ class RubyRequest : public Message
|
||||
WriteMask m_writeMask;
|
||||
DataBlock m_WTData;
|
||||
int m_wfid;
|
||||
uint64_t m_instSeqNum;
|
||||
|
||||
RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
|
||||
uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
|
||||
@@ -80,7 +81,8 @@ class RubyRequest : public Message
|
||||
RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb,
|
||||
unsigned _proc_id, unsigned _core_id,
|
||||
int _wm_size, std::vector<bool> & _wm_mask,
|
||||
DataBlock & _Data)
|
||||
DataBlock & _Data,
|
||||
uint64_t _instSeqNum = 0)
|
||||
: Message(curTime),
|
||||
m_PhysicalAddress(_paddr),
|
||||
m_Type(_type),
|
||||
@@ -93,7 +95,8 @@ class RubyRequest : public Message
|
||||
m_contextId(_core_id),
|
||||
m_writeMask(_wm_size,_wm_mask),
|
||||
m_WTData(_Data),
|
||||
m_wfid(_proc_id)
|
||||
m_wfid(_proc_id),
|
||||
m_instSeqNum(_instSeqNum)
|
||||
{
|
||||
m_LineAddress = makeLineAddress(m_PhysicalAddress);
|
||||
}
|
||||
@@ -104,7 +107,8 @@ class RubyRequest : public Message
|
||||
unsigned _proc_id, unsigned _core_id,
|
||||
int _wm_size, std::vector<bool> & _wm_mask,
|
||||
DataBlock & _Data,
|
||||
std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps)
|
||||
std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps,
|
||||
uint64_t _instSeqNum = 0)
|
||||
: Message(curTime),
|
||||
m_PhysicalAddress(_paddr),
|
||||
m_Type(_type),
|
||||
@@ -117,7 +121,8 @@ class RubyRequest : public Message
|
||||
m_contextId(_core_id),
|
||||
m_writeMask(_wm_size,_wm_mask,_atomicOps),
|
||||
m_WTData(_Data),
|
||||
m_wfid(_proc_id)
|
||||
m_wfid(_proc_id),
|
||||
m_instSeqNum(_instSeqNum)
|
||||
{
|
||||
m_LineAddress = makeLineAddress(m_PhysicalAddress);
|
||||
}
|
||||
|
||||
@@ -506,8 +506,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
m_outstanding_count--;
|
||||
assert(m_outstanding_count >= 0);
|
||||
|
||||
@@ -555,25 +553,24 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
assert(pkt->req->hasInstSeqNum());
|
||||
|
||||
if (pkt->cmd == MemCmd::MemSyncReq) {
|
||||
// issue mem_sync requests immediately to the cache system without
|
||||
// going through uncoalescedTable like normal LD/ST/Atomic requests
|
||||
issueMemSyncRequest(pkt);
|
||||
} else {
|
||||
// otherwise, this must be either read or write command
|
||||
assert(pkt->isRead() || pkt->isWrite());
|
||||
// let the child coalescer handle MemSyncReq because this is
|
||||
// cache coherence protocol specific
|
||||
return RequestStatus_Issued;
|
||||
}
|
||||
// otherwise, this must be either read or write command
|
||||
assert(pkt->isRead() || pkt->isWrite());
|
||||
|
||||
// the pkt is temporarily stored in the uncoalesced table until
|
||||
// it's picked for coalescing process later in this cycle or in a
|
||||
// future cycle
|
||||
uncoalescedTable.insertPacket(pkt);
|
||||
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
|
||||
pkt->getAddr());
|
||||
// the pkt is temporarily stored in the uncoalesced table until
|
||||
// it's picked for coalescing process later in this cycle or in a
|
||||
// future cycle
|
||||
uncoalescedTable.insertPacket(pkt);
|
||||
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
|
||||
pkt->getAddr());
|
||||
|
||||
// we schedule an issue event here to process the uncoalesced table
|
||||
// and try to issue Ruby request to cache system
|
||||
if (!issueEvent.scheduled()) {
|
||||
schedule(issueEvent, curTick());
|
||||
}
|
||||
// we schedule an issue event here to process the uncoalesced table
|
||||
// and try to issue Ruby request to cache system
|
||||
if (!issueEvent.scheduled()) {
|
||||
schedule(issueEvent, curTick());
|
||||
}
|
||||
|
||||
// we always return RequestStatus_Issued in this coalescer
|
||||
@@ -582,107 +579,6 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
return RequestStatus_Issued;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO: Figure out what do with this code. This code may go away
|
||||
* and/or be merged into the VIPER coalescer once the VIPER
|
||||
* protocol is re-integrated with GCN3 codes.
|
||||
*/
|
||||
/*
|
||||
void
|
||||
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
|
||||
{
|
||||
PacketPtr pkt = crequest->getFirstPkt();
|
||||
|
||||
int proc_id = -1;
|
||||
if (pkt != NULL && pkt->req->hasContextId()) {
|
||||
proc_id = pkt->req->contextId();
|
||||
}
|
||||
|
||||
// If valid, copy the pc to the ruby request
|
||||
Addr pc = 0;
|
||||
if (pkt->req->hasPC()) {
|
||||
pc = pkt->req->getPC();
|
||||
}
|
||||
|
||||
// At the moment setting scopes only counts
|
||||
// for GPU spill space accesses
|
||||
// which is pkt->req->isStack()
|
||||
// this scope is REPLACE since it
|
||||
// does not need to be flushed at the end
|
||||
// of a kernel Private and local may need
|
||||
// to be visible at the end of the kernel
|
||||
HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
|
||||
HSAScope accessScope = reqScopeToHSAScope(pkt->req);
|
||||
|
||||
Addr line_addr = makeLineAddress(pkt->getAddr());
|
||||
|
||||
// Creating WriteMask that records written bytes
|
||||
// and atomic operations. This enables partial writes
|
||||
// and partial reads of those writes
|
||||
DataBlock dataBlock;
|
||||
dataBlock.clear();
|
||||
uint32_t blockSize = RubySystem::getBlockSizeBytes();
|
||||
std::vector<bool> accessMask(blockSize,false);
|
||||
std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
|
||||
uint32_t tableSize = crequest->getPackets().size();
|
||||
for (int i = 0; i < tableSize; i++) {
|
||||
PacketPtr tmpPkt = crequest->getPackets()[i];
|
||||
uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
|
||||
uint32_t tmpSize = tmpPkt->getSize();
|
||||
if (tmpPkt->isAtomicOp()) {
|
||||
std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
|
||||
tmpPkt->getAtomicOp());
|
||||
atomicOps.push_back(tmpAtomicOp);
|
||||
} else if (tmpPkt->isWrite()) {
|
||||
dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
|
||||
tmpOffset, tmpSize);
|
||||
}
|
||||
for (int j = 0; j < tmpSize; j++) {
|
||||
accessMask[tmpOffset + j] = true;
|
||||
}
|
||||
}
|
||||
std::shared_ptr<RubyRequest> msg;
|
||||
if (pkt->isAtomicOp()) {
|
||||
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
|
||||
pkt->getPtr<uint8_t>(),
|
||||
pkt->getSize(), pc, crequest->getRubyType(),
|
||||
RubyAccessMode_Supervisor, pkt,
|
||||
PrefetchBit_No, proc_id, 100,
|
||||
blockSize, accessMask,
|
||||
dataBlock, atomicOps,
|
||||
accessScope, accessSegment);
|
||||
} else {
|
||||
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
|
||||
pkt->getPtr<uint8_t>(),
|
||||
pkt->getSize(), pc, crequest->getRubyType(),
|
||||
RubyAccessMode_Supervisor, pkt,
|
||||
PrefetchBit_No, proc_id, 100,
|
||||
blockSize, accessMask,
|
||||
dataBlock,
|
||||
accessScope, accessSegment);
|
||||
}
|
||||
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
|
||||
curTick(), m_version, "Coal", "Begin", "", "",
|
||||
printAddress(msg->getPhysicalAddress()),
|
||||
RubyRequestType_to_string(crequest->getRubyType()));
|
||||
|
||||
fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
|
||||
"there should not be any I-Fetch requests in the GPU Coalescer");
|
||||
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(crequest->getRubyType()));
|
||||
assert(latency > 0);
|
||||
|
||||
if (!deadlockCheckEvent.scheduled()) {
|
||||
schedule(deadlockCheckEvent,
|
||||
m_deadlock_threshold * clockPeriod() +
|
||||
curTick());
|
||||
}
|
||||
|
||||
assert(m_mandatory_q_ptr);
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
|
||||
}*/
|
||||
|
||||
template <class KEY, class VALUE>
|
||||
std::ostream &
|
||||
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
|
||||
@@ -890,7 +786,13 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
|
||||
assert(port != NULL);
|
||||
|
||||
pkt->senderState = ss->predecessor;
|
||||
delete ss;
|
||||
|
||||
if (pkt->cmd != MemCmd::WriteReq) {
|
||||
// for WriteReq, we keep the original senderState until
|
||||
// writeCompleteCallback
|
||||
delete ss;
|
||||
}
|
||||
|
||||
port->hitCallback(pkt);
|
||||
trySendRetries();
|
||||
}
|
||||
|
||||
@@ -294,9 +294,11 @@ class GPUCoalescer : public RubyPort
|
||||
Cycles firstResponseTime,
|
||||
bool isRegion);
|
||||
|
||||
void atomicCallback(Addr address,
|
||||
MachineType mach,
|
||||
const DataBlock& data);
|
||||
/* atomics need their own callback because the data
|
||||
might be const coming from SLICC */
|
||||
virtual void atomicCallback(Addr address,
|
||||
MachineType mach,
|
||||
const DataBlock& data);
|
||||
|
||||
RequestStatus makeRequest(PacketPtr pkt) override;
|
||||
int outstandingCount() const override { return m_outstanding_count; }
|
||||
@@ -365,7 +367,7 @@ class GPUCoalescer : public RubyPort
|
||||
// since the two following issue functions are protocol-specific,
|
||||
// they must be implemented in a derived coalescer
|
||||
virtual void issueRequest(CoalescedRequest* crequest) = 0;
|
||||
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
|
||||
// virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
|
||||
|
||||
void kernelCallback(int wavefront_id);
|
||||
|
||||
|
||||
@@ -272,6 +272,10 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
|
||||
RubySystem::getBlockSizeBytes());
|
||||
}
|
||||
|
||||
// Save the port in the sender state object to be used later to
|
||||
// route the response
|
||||
pkt->pushSenderState(new SenderState(this));
|
||||
|
||||
// Submit the ruby request
|
||||
RequestStatus requestStatus = ruby_port->makeRequest(pkt);
|
||||
|
||||
@@ -279,16 +283,16 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
|
||||
// Otherwise, we need to tell the port to retry at a later point
|
||||
// and return false.
|
||||
if (requestStatus == RequestStatus_Issued) {
|
||||
// Save the port in the sender state object to be used later to
|
||||
// route the response
|
||||
pkt->pushSenderState(new SenderState(this));
|
||||
|
||||
DPRINTF(RubyPort, "Request %s address %#x issued\n", pkt->cmdString(),
|
||||
DPRINTF(RubyPort, "Request %s 0x%x issued\n", pkt->cmdString(),
|
||||
pkt->getAddr());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (pkt->cmd != MemCmd::MemFenceReq) {
|
||||
// pop off sender state as this request failed to issue
|
||||
SenderState *ss = safe_cast<SenderState *>(pkt->popSenderState());
|
||||
delete ss;
|
||||
|
||||
if (pkt->cmd != MemCmd::MemSyncReq) {
|
||||
DPRINTF(RubyPort,
|
||||
"Request %s for address %#x did not issue because %s\n",
|
||||
pkt->cmdString(), pkt->getAddr(),
|
||||
@@ -558,7 +562,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt)
|
||||
}
|
||||
|
||||
// turn packet around to go back to requester if response expected
|
||||
if (needsResponse) {
|
||||
if (needsResponse || pkt->isResponse()) {
|
||||
DPRINTF(RubyPort, "Sending packet back over port\n");
|
||||
// Send a response in the same cycle. There is no need to delay the
|
||||
// response because the response latency is already incurred in the
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#include "cpu/testers/rubytest/RubyTester.hh"
|
||||
#include "debug/GPUCoalescer.hh"
|
||||
#include "debug/MemoryAccess.hh"
|
||||
#include "debug/ProtocolTrace.hh"
|
||||
#include "mem/packet.hh"
|
||||
#include "mem/ruby/common/SubBlock.hh"
|
||||
#include "mem/ruby/network/MessageBuffer.hh"
|
||||
@@ -64,148 +65,228 @@ VIPERCoalescerParams::create()
|
||||
}
|
||||
|
||||
VIPERCoalescer::VIPERCoalescer(const Params *p)
|
||||
: GPUCoalescer(p)
|
||||
: GPUCoalescer(p),
|
||||
m_cache_inv_pkt(nullptr),
|
||||
m_num_pending_invs(0)
|
||||
{
|
||||
m_max_wb_per_cycle=p->max_wb_per_cycle;
|
||||
m_max_inv_per_cycle=p->max_inv_per_cycle;
|
||||
m_outstanding_inv = 0;
|
||||
m_outstanding_wb = 0;
|
||||
}
|
||||
|
||||
VIPERCoalescer::~VIPERCoalescer()
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt)
|
||||
{
|
||||
}
|
||||
|
||||
// Places an uncoalesced packet in uncoalescedTable. If the packet is a
|
||||
// special type (MemFence, scoping, etc), it is issued immediately.
|
||||
RequestStatus
|
||||
VIPERCoalescer::makeRequest(PacketPtr pkt)
|
||||
{
|
||||
if (m_outstanding_wb | m_outstanding_inv) {
|
||||
DPRINTF(GPUCoalescer,
|
||||
"There are %d Writebacks and %d Invalidatons\n",
|
||||
m_outstanding_wb, m_outstanding_inv);
|
||||
}
|
||||
// Are we in the middle of a release
|
||||
if ((m_outstanding_wb) > 0) {
|
||||
if (pkt->req->isKernel()) {
|
||||
// Everything is fine
|
||||
// Barriers and Kernel End can coalesce
|
||||
// If it is a Kernel Begin, flush the cache
|
||||
if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) {
|
||||
invL1();
|
||||
}
|
||||
// VIPER only supports following memory request types
|
||||
// MemSyncReq & Acquire: TCP cache invalidation
|
||||
// ReadReq : cache read
|
||||
// WriteReq : cache write
|
||||
// AtomicOp : cache atomic
|
||||
//
|
||||
// VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
|
||||
// does not specify an equivalent type of memory request.
|
||||
// TODO: future patches should rename Acquire and Release
|
||||
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
|
||||
pkt->cmd == MemCmd::ReadReq ||
|
||||
pkt->cmd == MemCmd::WriteReq ||
|
||||
pkt->isAtomicOp());
|
||||
|
||||
if (pkt->req->isRelease()) {
|
||||
insertKernel(pkt->req->contextId(), pkt);
|
||||
}
|
||||
|
||||
return RequestStatus_Issued;
|
||||
}
|
||||
} else if (pkt->req->isKernel() && pkt->req->isRelease()) {
|
||||
// Flush Dirty Data on Kernel End
|
||||
// isKernel + isRelease
|
||||
insertKernel(pkt->req->contextId(), pkt);
|
||||
wbL1();
|
||||
if (m_outstanding_wb == 0) {
|
||||
for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
|
||||
newKernelEnds.push_back(it->first);
|
||||
}
|
||||
completeIssue();
|
||||
}
|
||||
return RequestStatus_Issued;
|
||||
if (pkt->req->isAcquire() && m_cache_inv_pkt) {
|
||||
// In VIPER protocol, the coalescer is not able to handle two or
|
||||
// more cache invalidation requests at a time. Cache invalidation
|
||||
// requests must be serialized to ensure that all stale data in
|
||||
// TCP are invalidated correctly. If there's already a pending
|
||||
// cache invalidation request, we must retry this request later
|
||||
return RequestStatus_Aliased;
|
||||
}
|
||||
|
||||
GPUCoalescer::makeRequest(pkt);
|
||||
|
||||
if (pkt->req->isKernel() && pkt->req->isAcquire()) {
|
||||
// Invalidate clean Data on Kernel Begin
|
||||
// isKernel + isAcquire
|
||||
invL1();
|
||||
} else if (pkt->req->isAcquire() && pkt->req->isRelease()) {
|
||||
// Deschedule the AtomicAcqRel and
|
||||
// Flush and Invalidate the L1 cache
|
||||
invwbL1();
|
||||
if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
|
||||
DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
|
||||
deschedule(issueEvent);
|
||||
}
|
||||
} else if (pkt->req->isRelease()) {
|
||||
// Deschedule the StoreRel and
|
||||
// Flush the L1 cache
|
||||
wbL1();
|
||||
if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
|
||||
DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
|
||||
deschedule(issueEvent);
|
||||
}
|
||||
} else if (pkt->req->isAcquire()) {
|
||||
// LoadAcq or AtomicAcq
|
||||
// Invalidate the L1 cache
|
||||
invL1();
|
||||
}
|
||||
// Request was successful
|
||||
if (m_outstanding_wb == 0) {
|
||||
if (!issueEvent.scheduled()) {
|
||||
DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n");
|
||||
schedule(issueEvent, curTick());
|
||||
}
|
||||
if (pkt->req->isAcquire()) {
|
||||
// In VIPER protocol, a compute unit sends a MemSyncReq with Acquire
|
||||
// flag to invalidate TCP. Upon receiving a request of this type,
|
||||
// VIPERCoalescer starts a cache walk to invalidate all valid entries
|
||||
// in TCP. The request is completed once all entries are invalidated.
|
||||
assert(!m_cache_inv_pkt);
|
||||
m_cache_inv_pkt = pkt;
|
||||
invTCP();
|
||||
}
|
||||
|
||||
return RequestStatus_Issued;
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::wbCallback(Addr addr)
|
||||
VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
|
||||
{
|
||||
m_outstanding_wb--;
|
||||
// if L1 Flush Complete
|
||||
// attempt to schedule issueEvent
|
||||
assert(((int) m_outstanding_wb) >= 0);
|
||||
if (m_outstanding_wb == 0) {
|
||||
for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
|
||||
newKernelEnds.push_back(it->first);
|
||||
}
|
||||
completeIssue();
|
||||
PacketPtr pkt = crequest->getFirstPkt();
|
||||
|
||||
int proc_id = -1;
|
||||
if (pkt != NULL && pkt->req->hasContextId()) {
|
||||
proc_id = pkt->req->contextId();
|
||||
}
|
||||
trySendRetries();
|
||||
|
||||
// If valid, copy the pc to the ruby request
|
||||
Addr pc = 0;
|
||||
if (pkt->req->hasPC()) {
|
||||
pc = pkt->req->getPC();
|
||||
}
|
||||
|
||||
Addr line_addr = makeLineAddress(pkt->getAddr());
|
||||
|
||||
// Creating WriteMask that records written bytes
|
||||
// and atomic operations. This enables partial writes
|
||||
// and partial reads of those writes
|
||||
DataBlock dataBlock;
|
||||
dataBlock.clear();
|
||||
uint32_t blockSize = RubySystem::getBlockSizeBytes();
|
||||
std::vector<bool> accessMask(blockSize,false);
|
||||
std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
|
||||
uint32_t tableSize = crequest->getPackets().size();
|
||||
for (int i = 0; i < tableSize; i++) {
|
||||
PacketPtr tmpPkt = crequest->getPackets()[i];
|
||||
uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
|
||||
uint32_t tmpSize = tmpPkt->getSize();
|
||||
if (tmpPkt->isAtomicOp()) {
|
||||
std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
|
||||
tmpPkt->getAtomicOp());
|
||||
atomicOps.push_back(tmpAtomicOp);
|
||||
} else if (tmpPkt->isWrite()) {
|
||||
dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
|
||||
tmpOffset, tmpSize);
|
||||
}
|
||||
for (int j = 0; j < tmpSize; j++) {
|
||||
accessMask[tmpOffset + j] = true;
|
||||
}
|
||||
}
|
||||
std::shared_ptr<RubyRequest> msg;
|
||||
if (pkt->isAtomicOp()) {
|
||||
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
|
||||
pkt->getPtr<uint8_t>(),
|
||||
pkt->getSize(), pc, crequest->getRubyType(),
|
||||
RubyAccessMode_Supervisor, pkt,
|
||||
PrefetchBit_No, proc_id, 100,
|
||||
blockSize, accessMask,
|
||||
dataBlock, atomicOps, crequest->getSeqNum());
|
||||
} else {
|
||||
msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
|
||||
pkt->getPtr<uint8_t>(),
|
||||
pkt->getSize(), pc, crequest->getRubyType(),
|
||||
RubyAccessMode_Supervisor, pkt,
|
||||
PrefetchBit_No, proc_id, 100,
|
||||
blockSize, accessMask,
|
||||
dataBlock, crequest->getSeqNum());
|
||||
}
|
||||
|
||||
if (pkt->cmd == MemCmd::WriteReq) {
|
||||
makeWriteCompletePkts(crequest);
|
||||
}
|
||||
|
||||
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
|
||||
curTick(), m_version, "Coal", "Begin", "", "",
|
||||
printAddress(msg->getPhysicalAddress()),
|
||||
RubyRequestType_to_string(crequest->getRubyType()));
|
||||
|
||||
fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
|
||||
"there should not be any I-Fetch requests in the GPU Coalescer");
|
||||
|
||||
if (!deadlockCheckEvent.scheduled()) {
|
||||
schedule(deadlockCheckEvent,
|
||||
m_deadlock_threshold * clockPeriod() +
|
||||
curTick());
|
||||
}
|
||||
|
||||
assert(m_mandatory_q_ptr);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(crequest->getRubyType()));
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::invCallback(Addr addr)
|
||||
VIPERCoalescer::makeWriteCompletePkts(CoalescedRequest* crequest)
|
||||
{
|
||||
m_outstanding_inv--;
|
||||
// if L1 Flush Complete
|
||||
// attempt to schedule issueEvent
|
||||
// This probably won't happen, since
|
||||
// we dont wait on cache invalidations
|
||||
if (m_outstanding_wb == 0) {
|
||||
for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
|
||||
newKernelEnds.push_back(it->first);
|
||||
}
|
||||
completeIssue();
|
||||
// In VIPER protocol, for each write request, down-stream caches
|
||||
// return two responses: writeCallback and writeCompleteCallback.
|
||||
// We need to prepare a writeCompletePkt for each write request so
|
||||
// that when writeCompleteCallback is called, we can respond
|
||||
// requesting wavefront right away.
|
||||
// writeCompletePkt inherits request and senderState of the original
|
||||
// write request packet so that we can find the original requestor
|
||||
// later. This assumes that request and senderState are not deleted
|
||||
// before writeCompleteCallback is called.
|
||||
|
||||
auto key = crequest->getSeqNum();
|
||||
std::vector<PacketPtr>& req_pkts = crequest->getPackets();
|
||||
|
||||
for (auto pkt : req_pkts) {
|
||||
DPRINTF(GPUCoalescer, "makeWriteCompletePkts: instSeqNum %d\n",
|
||||
key);
|
||||
assert(pkt->cmd == MemCmd::WriteReq);
|
||||
|
||||
PacketPtr writeCompletePkt = new Packet(pkt->req,
|
||||
MemCmd::WriteCompleteResp);
|
||||
writeCompletePkt->setAddr(pkt->getAddr());
|
||||
writeCompletePkt->senderState = pkt->senderState;
|
||||
m_writeCompletePktMap[key].push_back(writeCompletePkt);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
|
||||
{
|
||||
DPRINTF(GPUCoalescer, "writeCompleteCallback: instSeqNum %d addr 0x%x\n",
|
||||
instSeqNum, addr);
|
||||
|
||||
auto key = instSeqNum;
|
||||
assert(m_writeCompletePktMap.count(key) == 1 &&
|
||||
!m_writeCompletePktMap[key].empty());
|
||||
|
||||
for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
|
||||
if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
|
||||
RubyPort::SenderState *ss =
|
||||
safe_cast<RubyPort::SenderState *>
|
||||
(writeCompletePkt->senderState);
|
||||
MemSlavePort *port = ss->port;
|
||||
assert(port != NULL);
|
||||
|
||||
writeCompletePkt->senderState = ss->predecessor;
|
||||
delete ss;
|
||||
port->hitCallback(writeCompletePkt);
|
||||
}
|
||||
}
|
||||
|
||||
trySendRetries();
|
||||
|
||||
if (m_writeCompletePktMap[key].empty())
|
||||
m_writeCompletePktMap.erase(key);
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::invTCPCallback(Addr addr)
|
||||
{
|
||||
assert(m_cache_inv_pkt && m_num_pending_invs > 0);
|
||||
|
||||
m_num_pending_invs--;
|
||||
|
||||
if (m_num_pending_invs == 0) {
|
||||
std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
|
||||
completeHitCallback(pkt_list);
|
||||
m_cache_inv_pkt = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Invalidate L1 cache (Acquire)
|
||||
* Invalidate TCP (Acquire)
|
||||
*/
|
||||
void
|
||||
VIPERCoalescer::invL1()
|
||||
VIPERCoalescer::invTCP()
|
||||
{
|
||||
int size = m_dataCache_ptr->getNumBlocks();
|
||||
DPRINTF(GPUCoalescer,
|
||||
"There are %d Invalidations outstanding before Cache Walk\n",
|
||||
m_outstanding_inv);
|
||||
m_num_pending_invs);
|
||||
// Walk the cache
|
||||
for (int i = 0; i < size; i++) {
|
||||
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
|
||||
@@ -215,86 +296,14 @@ VIPERCoalescer::invL1()
|
||||
clockEdge(), addr, (uint8_t*) 0, 0, 0,
|
||||
request_type, RubyAccessMode_Supervisor,
|
||||
nullptr);
|
||||
DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr);
|
||||
assert(m_mandatory_q_ptr != NULL);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
assert(latency > 0);
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
m_outstanding_inv++;
|
||||
m_num_pending_invs++;
|
||||
}
|
||||
DPRINTF(GPUCoalescer,
|
||||
"There are %d Invalidatons outstanding after Cache Walk\n",
|
||||
m_outstanding_inv);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writeback L1 cache (Release)
|
||||
*/
|
||||
void
|
||||
VIPERCoalescer::wbL1()
|
||||
{
|
||||
int size = m_dataCache_ptr->getNumBlocks();
|
||||
DPRINTF(GPUCoalescer,
|
||||
"There are %d Writebacks outstanding before Cache Walk\n",
|
||||
m_outstanding_wb);
|
||||
// Walk the cache
|
||||
for (int i = 0; i < size; i++) {
|
||||
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
|
||||
// Write dirty data back
|
||||
RubyRequestType request_type = RubyRequestType_FLUSH;
|
||||
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
|
||||
clockEdge(), addr, (uint8_t*) 0, 0, 0,
|
||||
request_type, RubyAccessMode_Supervisor,
|
||||
nullptr);
|
||||
assert(m_mandatory_q_ptr != NULL);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
assert(latency > 0);
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
m_outstanding_wb++;
|
||||
}
|
||||
DPRINTF(GPUCoalescer,
|
||||
"There are %d Writebacks outstanding after Cache Walk\n",
|
||||
m_outstanding_wb);
|
||||
}
|
||||
|
||||
/**
|
||||
* Invalidate and Writeback L1 cache (Acquire&Release)
|
||||
*/
|
||||
void
|
||||
VIPERCoalescer::invwbL1()
|
||||
{
|
||||
int size = m_dataCache_ptr->getNumBlocks();
|
||||
// Walk the cache
|
||||
for (int i = 0; i < size; i++) {
|
||||
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
|
||||
// Evict Read-only data
|
||||
RubyRequestType request_type = RubyRequestType_REPLACEMENT;
|
||||
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
|
||||
clockEdge(), addr, (uint8_t*) 0, 0, 0,
|
||||
request_type, RubyAccessMode_Supervisor,
|
||||
nullptr);
|
||||
assert(m_mandatory_q_ptr != NULL);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
assert(latency > 0);
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
m_outstanding_inv++;
|
||||
}
|
||||
// Walk the cache
|
||||
for (int i = 0; i< size; i++) {
|
||||
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
|
||||
// Write dirty data back
|
||||
RubyRequestType request_type = RubyRequestType_FLUSH;
|
||||
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
|
||||
clockEdge(), addr, (uint8_t*) 0, 0, 0,
|
||||
request_type, RubyAccessMode_Supervisor,
|
||||
nullptr);
|
||||
assert(m_mandatory_q_ptr != NULL);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
assert(latency > 0);
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
m_outstanding_wb++;
|
||||
}
|
||||
m_num_pending_invs);
|
||||
}
|
||||
|
||||
@@ -57,19 +57,31 @@ class VIPERCoalescer : public GPUCoalescer
|
||||
typedef VIPERCoalescerParams Params;
|
||||
VIPERCoalescer(const Params *);
|
||||
~VIPERCoalescer();
|
||||
|
||||
void issueMemSyncRequest(PacketPtr pkt) override;
|
||||
void issueRequest(CoalescedRequest* crequest) override;
|
||||
void wbCallback(Addr address);
|
||||
void invCallback(Addr address);
|
||||
void writeCompleteCallback(Addr address, uint64_t instSeqNum);
|
||||
void invTCPCallback(Addr address);
|
||||
RequestStatus makeRequest(PacketPtr pkt) override;
|
||||
void issueRequest(CoalescedRequest* crequest) override;
|
||||
|
||||
private:
|
||||
void invL1();
|
||||
void wbL1();
|
||||
void invwbL1();
|
||||
uint64_t m_outstanding_inv;
|
||||
uint64_t m_outstanding_wb;
|
||||
uint64_t m_max_inv_per_cycle;
|
||||
uint64_t m_max_wb_per_cycle;
|
||||
void invTCP();
|
||||
|
||||
// make write-complete response packets from original write request packets
|
||||
void makeWriteCompletePkts(CoalescedRequest* crequest);
|
||||
|
||||
// current cache invalidation packet
|
||||
// nullptr if there is no active cache invalidation request
|
||||
PacketPtr m_cache_inv_pkt;
|
||||
|
||||
// number of remaining cache lines to be invalidated in TCP
|
||||
int m_num_pending_invs;
|
||||
|
||||
// a map of instruction sequence number and corresponding pending
|
||||
// write-complete response packets. Each write-complete response
|
||||
// corresponds to a pending store request that is waiting for
|
||||
// writeCompleteCallback. We may have multiple pending store requests per
|
||||
// wavefront at a time. Each time writeCompleteCallback is called, an entry
|
||||
// with a corresponding seqNum is popped off from map and returned to
|
||||
// compute unit.
|
||||
std::unordered_map<uint64_t, std::vector<PacketPtr>> m_writeCompletePktMap;
|
||||
};
|
||||
#endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
|
||||
|
||||
Reference in New Issue
Block a user