mem,gpu-compute: Implement GPU TCC directed invalidate (#1011)

The GPU device currently supports large BAR which means that the driver
can write directly to GPU memory over the PCI bus without using SDMA or
PM4 packets. The gem5 PCI interface only provides an atomic interface
for BAR reads/writes, which means the values cannot go through timing
mode Ruby caches. This causes bugs as the TCC cache is allowed to keep
clean data between kernels for performance reasons. If there is a BAR
write directly to memory bypassing the cache, the value in the cache is
stale and must be invalidated.

In this commit a TCC invalidate is generated for all writes over PCI
that go directly to GPU memory. This will also invalidate TCP along the
way if necessary. This currently relies on the driver synchronization
which only allows BAR writes in between kernels. Therefore, the cache
should only be in I or V state.

To handle a race condition between invalidates and launching the next
kernel, the invalidates return a response and the GPU command processor
will wait for all TCC invalidates to be complete before launching the
next kernel.

This fixes issues with stale data in nanoGPT and possibly PENNANT.
This commit is contained in:
Matthew Poremba
2024-04-15 13:18:01 -07:00
committed by GitHub
15 changed files with 252 additions and 15 deletions

View File

@@ -1096,6 +1096,7 @@ class Request : public Extensible<Request>
* setting extraFlags should be done via setCacheCoherenceFlags().
*/
bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); }
bool
isGL2CacheFlush() const

View File

@@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache")
L2_Repl, desc="L2 Replacement";
// Probes
PrbInv, desc="Invalidating probe";
InvCache, desc="Invalidating probe from TCP";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
@@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache")
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:InvCache) {
trigger(Event:InvCache, in_msg.addr, cache_entry, tbe);
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache")
unset_cache_entry();
}
// Ack a directed L2 invalidate (InvCache) back to the requesting TCP.
// The requestor is taken from the CPURequestMsg currently at the head
// of the core request network.
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
// InvL2Resp tells the TCP that the TCC invalidate has completed.
out_msg.Type := CoherenceResponseType:InvL2Resp;
out_msg.Sender := machineID;
out_msg.Destination.add(in_msg.Requestor);
// Control-only message; carries no data payload.
out_msg.MessageSize := MessageSizeType:Response_Control;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
action(sd_sendData, "sd", desc="send Shared response") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
@@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache")
i_invL2;
}
// Directed invalidate from a TCP (generated for BAR writes straight to
// GPU memory): drop the line if present, ack the requestor, and consume
// the request. Only I and V are legal here because the driver permits
// BAR writes only between kernels (see commit description above).
transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} {
i_invL2;
ir_invL2Resp;
p_popRequestQueue;
}
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;

View File

@@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
Evict, desc="Evict if clean(invL1 for Load Acquire)";
// Mem sys initiated
Repl, desc="Replacing block from cache";
InvL2, desc="Invalidate to L2";
InvL2Resp, desc="Invalidate L2 completed";
// TCC initiated
TCC_Ack, desc="TCC Ack to Core Request";
@@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else {
error("Unexpected Response Message to Core");
}
} else if (in_msg.Type == CoherenceResponseType:InvL2Resp) {
DPRINTF(RubySlicc, "Issuing InvL2Resp\n");
trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe);
} else {
error("Unexpected Response Message to Core");
}
}
}
}
@@ -333,6 +338,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:REPLACEMENT){
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:InvL2){
trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
}
@@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
}
}
// Forward an invalidate for this address to the TCC (L2).
action(il2_invL2, "il2", desc="Invalidate address in L2") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:InvCache;
out_msg.Requestor := machineID;
// Route to the TCC slice that owns this address range.
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
// Propagate the GLC/SLC cache-bypass bits from the originating
// mandatory-queue request.
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
// Notify the request issuer that the TCC invalidate has completed.
// Only the VIPER coalescer implements invTCCCallback; the Sequencer
// path is unsupported and fails the assert deliberately.
action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n");
assert(false);
} else {
coalescer.invTCCCallback(address);
}
}
action(wd_wtDone, "wd", desc="writethrough done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
@@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
p_popMandatoryQueue;
}
// Line not present in TCP: just forward the invalidate to the TCC and
// consume the mandatory-queue request.
transition(I, InvL2) {
    il2_invL2;
    p_popMandatoryQueue;
}

// Line valid in TCP: drop the local copy first, then forward the
// invalidate to the TCC.
transition(V, InvL2, I) {
    ic_invCache;  // fixed: action invocation was missing its ';'
    il2_invL2;
    p_popMandatoryQueue;
}

// TCC acknowledged the invalidate: invoke the coalescer callback for
// the completed TCC invalidate and pop the response queue.
transition(I, InvL2Resp) {
    i2r_invL2Resp;
    pr_popResponseQueue;
}
// if a line is in IV and a TCC_AckWB comes back, we must have had a WT
// store followed by a load. Thus, complete the store without affecting
// TBE or line state.

View File

@@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") {
Cycles, Cycles, Cycles, bool);
void atomicCallback(Addr, MachineType, DataBlock);
void invTCPCallback(Addr);
void invTCCCallback(Addr);
void writeCompleteCallback(Addr, uint64_t);
void evictionCallback(Addr);
}

View File

@@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") {
WriteThroughFifo, desc="WriteThrough with no data";
WriteThroughDummy, desc="WriteThrough with no data for atomic operation";
WriteFlush, desc="Release Flush";
InvCache, desc="Invalidate Cache";
WrCancel, desc="want to cancel WB to Memory"; // should this be here?
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") {
StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
CPUCancelWB, desc="want to cancel WB to Memory";
MemData, desc="Data from Memory";
InvL2Resp, desc="Invalidate L2 response";
// for regions
PrivateAck, desc="Ack that r-buf received private notify";

View File

@@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
COMMIT, desc="Commit version";
NULL, desc="Invalid request type";
FLUSH, desc="Flush request type";
InvL2, desc="Invalidate L2";
Release, desc="Release operation";
Acquire, desc="Acquire opertion";
AcquireRelease, desc="Acquire and Release opertion";

View File

@@ -669,14 +669,14 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());

View File

@@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
// VIPER does not expect MemSyncReq & Release since compute unit
// does not specify an equivalent type of memory request.
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
(pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->cmd == MemCmd::FlushReq ||
@@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
invTCP();
}
if (pkt->req->isInvL2()) {
invTCC(pkt);
}
return RequestStatus_Issued;
}
@@ -306,5 +311,51 @@ VIPERCoalescer::invTCP()
m_num_pending_invs);
}
/**
 * Callback invoked when a directed TCC (L2) invalidate completes.
 * Converts every packet that was waiting on the invalidate of this
 * address into a response, hands it back to its originating port, and
 * removes the bookkeeping entry.
 *
 * @param addr Line address whose TCC invalidate has completed.
 */
void
VIPERCoalescer::invTCCCallback(Addr addr)
{
    // Use find() rather than operator[] so a callback for an address
    // with no pending invalidates does not default-construct (and then
    // erase) an empty vector in the map.
    auto entry = m_pending_invl2s.find(addr);
    if (entry == m_pending_invl2s.end()) {
        return;
    }

    for (auto& pkt : entry->second) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemResponsePort *port = ss->port;
        assert(port != nullptr);

        // Now convert to MemSyncResp
        pkt->makeResponse();
        pkt->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(pkt);
    }
    m_pending_invl2s.erase(entry);
}
/*
 * Send an invalidate to a specific address in the TCC.
 */
void
VIPERCoalescer::invTCC(PacketPtr pkt)
{
    assert(pkt);
    assert(pkt->req);

    const Addr line_addr = pkt->req->getPaddr();
    const RubyRequestType req_type = RubyRequestType_InvL2;

    // Build the Ruby request for the directed L2 invalidate. Size and
    // PC are zero; there is no data payload.
    auto msg = std::make_shared<RubyRequest>(clockEdge(), line_addr, 0, 0,
                                             req_type,
                                             RubyAccessMode_Supervisor,
                                             nullptr);

    DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", line_addr);

    assert(m_mandatory_q_ptr);
    const Tick latency =
        cyclesToTicks(m_controller->mandatoryQueueLatency(req_type));
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);

    // Remember the packet so invTCCCallback can respond once the TCC
    // acknowledges the invalidate.
    m_pending_invl2s[line_addr].push_back(pkt);
}
} // namespace ruby
} // namespace gem5

View File

@@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer
~VIPERCoalescer();
void writeCompleteCallback(Addr address, uint64_t instSeqNum);
void invTCPCallback(Addr address);
void invTCCCallback(Addr address);
RequestStatus makeRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
private:
void invTCP();
void invTCC(PacketPtr pkt);
// make write-complete response packets from original write request packets
void makeWriteCompletePkts(CoalescedRequest* crequest);
@@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer
// number of remaining cache lines to be invalidated in TCP
int m_num_pending_invs;
// outstanding L2 invalidate packets
std::unordered_map<Addr, std::vector<PacketPtr>> m_pending_invl2s;
// a map of instruction sequence number and corresponding pending
// write-complete response packets. Each write-complete response
// corresponds to a pending store request that is waiting for