mem,gpu-compute: Implement GPU TCC directed invalidate

The GPU device currently supports large BAR which means that the driver
can write directly to GPU memory over the PCI bus without using SDMA or
PM4 packets. The gem5 PCI interface only provides an atomic interface
for BAR reads/writes, which means the values cannot go through timing
mode Ruby caches. This causes bugs as the TCC cache is allowed to keep
clean data between kernels for performance reasons. If there is a BAR
write directly to memory bypassing the cache, the value in the cache is
stale and must be invalidated.

In this commit a TCC invalidate is generated for all writes over PCI
that go directly to GPU memory. This will also invalidate TCP along the
way if necessary. This currently relies on the driver synchronization
which only allows BAR writes in between kernels. Therefore, the cache
should only be in I or V state.

To handle a race condition between invalidates and launching the next
kernel, the invalidates return a response and the GPU command processor
will wait for all TCC invalidates to be complete before launching the
next kernel.

This fixes issues with stale data in nanoGPT and possibly PENNANT.

Change-Id: I8e1290f842122682c271e5508a48037055bfbcdf
This commit is contained in:
Matthew Poremba
2024-03-15 17:40:42 -05:00
parent 833392e7b2
commit 1d64669473
14 changed files with 236 additions and 3 deletions

View File

@@ -420,6 +420,12 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset)
{
DPRINTF(AMDGPUDevice, "Wrote framebuffer address %#lx\n", offset);
for (auto& cu: CP()->shader()->cuList) {
auto system = CP()->shader()->gpuCmdProc.system();
Addr aligned_addr = offset & ~(system->cacheLineSize() - 1);
cu->sendInvL2(aligned_addr);
}
Addr aperture = gpuvm.getFrameAperture(offset);
Addr aperture_offset = offset - aperture;

View File

@@ -864,6 +864,25 @@ ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
// - kernel end
// - non-kernel mem sync
// Non-kernel mem sync not from an instruction
if (!gpuDynInst) {
// If there is no dynamic instruction, a CU must be present.
ComputeUnit *cu = sender_state->computeUnit;
assert(cu != nullptr);
if (pkt->req->isInvL2()) {
cu->shader->decNumOutstandingInvL2s();
assert(cu->shader->getNumOutstandingInvL2s() >= 0);
} else {
panic("Unknown MemSyncResp not from an instruction");
}
// Cleanup and return, no other response events needed.
delete pkt->senderState;
delete pkt;
return true;
}
// Kernel Launch
// wavefront was nullptr when launching kernel, so it is meaningless
// here (simdId=-1, wfSlotId=-1)
@@ -1403,6 +1422,23 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
}
}
void
ComputeUnit::sendInvL2(Addr paddr)
{
    // Build a cache-line sized MemSyncReq flagged as a GL2 (TCC) invalidate
    // for the given physical address.
    auto inv_req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());
    inv_req->setCacheCoherenceFlags(Request::GL2_CACHE_INV);

    auto inv_pkt = new Packet(inv_req, MemCmd::MemSyncReq);
    // There is no dynamic instruction behind this sync, so attach the CU in
    // the sender state; the response path uses it to reach the shader.
    inv_pkt->pushSenderState(
        new ComputeUnit::DataPort::SenderState(this, 0, nullptr));

    // Record the outstanding invalidate before issuing it so a kernel launch
    // cannot race past it.
    shader->incNumOutstandingInvL2s();

    EventFunctionWrapper *send_event = memPort[0].createMemReqEvent(inv_pkt);
    schedule(send_event, curTick() + req_tick_latency);
}
void
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
{

View File

@@ -474,6 +474,8 @@ class ComputeUnit : public ClockedObject
void handleSQCReturn(PacketPtr pkt);
void sendInvL2(Addr paddr);
protected:
RequestorID _requestorId;
@@ -527,6 +529,7 @@ class ComputeUnit : public ClockedObject
struct SenderState : public Packet::SenderState
{
ComputeUnit *computeUnit = nullptr;
GPUDynInstPtr _gpuDynInst;
PortID port_index;
Packet::SenderState *saved;
@@ -536,6 +539,12 @@ class ComputeUnit : public ClockedObject
: _gpuDynInst(gpuDynInst),
port_index(_port_index),
saved(sender_state) { }
SenderState(ComputeUnit *cu, PortID _port_index,
Packet::SenderState *sender_state=nullptr)
: computeUnit(cu),
port_index(_port_index),
saved(sender_state) { }
};
class SystemHubEvent : public Event

View File

@@ -41,6 +41,7 @@
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/shader.hh"
#include "mem/abstract_mem.hh"
#include "mem/packet_access.hh"
#include "mem/se_translating_port_proxy.hh"
@@ -126,6 +127,21 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
unsigned akc_alignment_granularity = 64;
assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));
/**
* Make sure there is not a race condition with invalidates in the L2
* cache. The full system driver may write directly to memory using
* large BAR while the L2 cache is allowed to keep data in the valid
* state between kernel launches. This is a rare event but is required
* for correctness.
*/
if (shader()->getNumOutstandingInvL2s() > 0) {
DPRINTF(GPUCommandProc,
"Deferring kernel launch due to outstanding L2 invalidates\n");
shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr);
return;
}
/**
* Need to use a raw pointer for DmaVirtDevice API. This is deleted
* in the dispatchKernelObject method.

View File

@@ -552,6 +552,29 @@ Shader::notifyCuSleep() {
}
}
void
Shader::decNumOutstandingInvL2s()
{
num_outstanding_invl2s--;
if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) {
for (auto &dispatch : deferred_dispatches) {
gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
std::get<1>(dispatch),
std::get<2>(dispatch));
}
deferred_dispatches.clear();
}
}
void
Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
                            Addr host_pkt_addr)
{
    // Queue the dispatch packet so it can be re-submitted once all
    // outstanding TCC invalidates have completed.
    deferred_dispatches.emplace_back(raw_pkt, queue_id, host_pkt_addr);
}
/**
* Forward the VRAM requestor ID needed for device memory from CP.
*/

View File

@@ -104,6 +104,11 @@ class Shader : public ClockedObject
// Set to true by the dispatcher if the current kernel is a blit kernel
bool blitKernel = false;
// Number of pending non-instruction invalidates outstanding. The shader
// should wait for these to be done to ensure correctness.
int num_outstanding_invl2s = 0;
std::vector<std::tuple<void *, uint32_t, Addr>> deferred_dispatches;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -330,6 +335,13 @@ class Shader : public ClockedObject
blitKernel = is_blit_kernel;
}
// Bookkeeping for directed TCC (L2) invalidates generated by large-BAR
// writes. The GPU command processor defers kernel launches while any of
// these are outstanding.
void decNumOutstandingInvL2s();
void incNumOutstandingInvL2s() { num_outstanding_invl2s++; }
int getNumOutstandingInvL2s() const { return num_outstanding_invl2s; }

// Queue a dispatch packet to be re-submitted once all outstanding TCC
// invalidates have drained (see decNumOutstandingInvL2s()).
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
                         Addr host_pkt_addr);
protected:
struct ShaderStats : public statistics::Group
{

View File

@@ -1096,6 +1096,7 @@ class Request : public Extensible<Request>
* setting extraFlags should be done via setCacheCoherenceFlags().
*/
bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
// True when this request carries a directed GL2/TCC invalidate, i.e. it was
// flagged with setCacheCoherenceFlags(Request::GL2_CACHE_INV).
bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); }
bool
isGL2CacheFlush() const

View File

@@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache")
L2_Repl, desc="L2 Replacement";
// Probes
PrbInv, desc="Invalidating probe";
InvCache, desc="Invalidating probe from TCP";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
@@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache")
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:InvCache) {
trigger(Event:InvCache, in_msg.addr, cache_entry, tbe);
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache")
unset_cache_entry();
}
// Acknowledge a directed invalidate (InvCache) back to the requesting TCP so
// the coalescer can complete the originating MemSyncReq; the CP waits on
// these acks before launching the next kernel.
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
        enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
            out_msg.addr := address;
            out_msg.Type := CoherenceResponseType:InvL2Resp;
            out_msg.Sender := machineID;
            // Route the ack back to whichever TCP issued the InvCache.
            out_msg.Destination.add(in_msg.Requestor);
            // Control-only message: no data payload.
            out_msg.MessageSize := MessageSizeType:Response_Control;
            DPRINTF(RubySlicc, "%s\n", out_msg);
        }
    }
}
action(sd_sendData, "sd", desc="send Shared response") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
@@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache")
i_invL2;
}
// Directed invalidate from a TCP, originating from a large-BAR write that
// bypassed the cache hierarchy. Driver synchronization means the line should
// only be clean-valid (V) or absent (I); invalidate and ack the requestor.
transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} {
    i_invL2;
    ir_invL2Resp;
    p_popRequestQueue;
}
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;

View File

@@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
Evict, desc="Evict if clean(invL1 for Load Acquire)";
// Mem sys initiated
Repl, desc="Replacing block from cache";
InvL2, desc="Invalidate to L2";
InvL2Resp, desc="Invalidate L2 completed";
// TCC initiated
TCC_Ack, desc="TCC Ack to Core Request";
@@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else {
error("Unexpected Response Message to Core");
}
} else if (in_msg.Type == CoherenceResponseType:InvL2Resp) {
DPRINTF(RubySlicc, "Issuing InvL2Resp\n");
trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe);
} else {
error("Unexpected Response Message to Core");
}
}
}
}
@@ -333,6 +338,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:REPLACEMENT){
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:InvL2){
trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
}
@@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
}
}
// Forward a directed invalidate for this line address to the TCC bank that
// owns it. Triggered by a RubyRequestType:InvL2 on the mandatory queue.
action(il2_invL2, "il2", desc="Invalidate address in L2") {
    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
        out_msg.addr := address;
        out_msg.Type := CoherenceRequestType:InvCache;
        out_msg.Requestor := machineID;
        // Interleaved TCC banks: pick the bank responsible for this address.
        out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
                                TCC_select_low_bit, TCC_select_num_bits));
        out_msg.MessageSize := MessageSizeType:Request_Control;
        out_msg.InitialRequestTime := curCycle();
        // Propagate the GLC/SLC scope bits from the triggering request.
        peek(mandatoryQueue_in, RubyRequest) {
            out_msg.isGLCSet := in_msg.isGLCSet;
            out_msg.isSLCSet := in_msg.isSLCSet;
        }
    }
}
// Hand the TCC's invalidate ack back to the coalescer, which completes the
// original MemSyncReq. Only the VIPERCoalescer implements invTCCCallback;
// a plain Sequencer (use_seq_not_coal) cannot receive these acks.
action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") {
    if (use_seq_not_coal) {
        DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n");
        assert(false);
    } else {
        coalescer.invTCCCallback(address);
    }
}
action(wd_wtDone, "wd", desc="writethrough done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
@@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
p_popMandatoryQueue;
}
// Line not present in this TCP: nothing to invalidate locally, just forward
// the directed invalidate on to the TCC.
transition(I, InvL2) {
    il2_invL2;
    p_popMandatoryQueue;
}
// Line valid in this TCP: invalidate the local copy before forwarding the
// directed invalidate to the TCC so stale data survives at neither level.
transition(V, InvL2, I) {
    // Missing ';' after ic_invCache in the original — required statement
    // separator between actions.
    ic_invCache;
    il2_invL2;
    p_popMandatoryQueue;
}
// TCC acked the directed invalidate; the line stays in I here. Notify the
// coalescer so it can complete the originating MemSyncReq.
transition(I, InvL2Resp) {
    i2r_invL2Resp;
    pr_popResponseQueue;
}
// if a line is in IV and a TCC_AckWB comes back, we must have had a WT
// store followed by a load. Thus, complete the store without affecting
// TBE or line state.

View File

@@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") {
Cycles, Cycles, Cycles, bool);
void atomicCallback(Addr, MachineType, DataBlock);
void invTCPCallback(Addr);
void invTCCCallback(Addr);
void writeCompleteCallback(Addr, uint64_t);
void evictionCallback(Addr);
}

View File

@@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") {
WriteThroughFifo, desc="WriteThrough with no data";
WriteThroughDummy, desc="WriteThrough with no data for atomic operation";
WriteFlush, desc="Release Flush";
InvCache, desc="Invalidate Cache";
WrCancel, desc="want to cancel WB to Memory"; // should this be here?
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") {
StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
CPUCancelWB, desc="want to cancel WB to Memory";
MemData, desc="Data from Memory";
InvL2Resp, desc="Invalidate L2 response";
// for regions
PrivateAck, desc="Ack that r-buf received private notify";

View File

@@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
COMMIT, desc="Commit version";
NULL, desc="Invalid request type";
FLUSH, desc="Flush request type";
InvL2, desc="Invalidate L2";
Release, desc="Release operation";
Acquire, desc="Acquire opertion";
AcquireRelease, desc="Acquire and Release opertion";

View File

@@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
// VIPER does not expect MemSyncReq & Release since compute unit
// does not specify an equivalent type of memory request.
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
(pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->cmd == MemCmd::FlushReq ||
@@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
invTCP();
}
if (pkt->req->isInvL2()) {
invTCC(pkt);
}
return RequestStatus_Issued;
}
@@ -306,5 +311,51 @@ VIPERCoalescer::invTCP()
m_num_pending_invs);
}
void
VIPERCoalescer::invTCCCallback(Addr addr)
{
    // Complete every pending directed-L2-invalidate request for this line by
    // converting the original MemSyncReq into a response and returning it to
    // the CU port it arrived on.
    auto iter = m_pending_invl2s.find(addr);
    // The protocol should never ack an invalidate we did not issue. Using
    // find() (not operator[]) also avoids default-inserting an empty vector
    // on a spurious callback.
    assert(iter != m_pending_invl2s.end());

    for (auto& pkt : iter->second) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemResponsePort *port = ss->port;
        assert(port != nullptr);

        // Now convert to MemSyncResp
        pkt->makeResponse();
        pkt->senderState = ss->predecessor;
        delete ss;

        port->hitCallback(pkt);
    }
    m_pending_invl2s.erase(iter);
}
/*
 * Send an invalidate to a specific address in the TCC.
 *
 * The packet is stashed in m_pending_invl2s keyed by physical address;
 * invTCCCallback() converts it to a response once the TCC acks. The issuer
 * (ComputeUnit::sendInvL2) sends cache-line aligned addresses, so the paddr
 * is used as the map key directly.
 */
void
VIPERCoalescer::invTCC(PacketPtr pkt)
{
    assert(pkt);
    assert(pkt->req);

    Addr addr = pkt->req->getPaddr();
    RubyRequestType request_type = RubyRequestType_InvL2;
    // Control-only Ruby request: no data payload (nullptr) and zero-valued
    // scalar arguments.
    std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
        clockEdge(), addr, 0, 0,
        request_type, RubyAccessMode_Supervisor,
        nullptr);
    DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", addr);
    assert(m_mandatory_q_ptr);
    Tick latency = cyclesToTicks(
        m_controller->mandatoryQueueLatency(request_type));
    // Enqueue toward the TCP controller, which forwards InvCache to the TCC.
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
    // Track the packet until the TCC's InvL2Resp arrives.
    m_pending_invl2s[addr].push_back(pkt);
}
} // namespace ruby
} // namespace gem5

View File

@@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer
~VIPERCoalescer();
void writeCompleteCallback(Addr address, uint64_t instSeqNum);
void invTCPCallback(Addr address);
void invTCCCallback(Addr address);
RequestStatus makeRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
private:
void invTCP();
void invTCC(PacketPtr pkt);
// make write-complete response packets from original write request packets
void makeWriteCompletePkts(CoalescedRequest* crequest);
@@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer
// number of remaining cache lines to be invalidated in TCP
int m_num_pending_invs;
// outstanding L2 invalidate packets
std::unordered_map<Addr, std::vector<PacketPtr>> m_pending_invl2s;
// a map of instruction sequence number and corresponding pending
// write-complete response packets. Each write-complete response
// corresponds to a pending store request that is waiting for