dev-amdgpu, gpu-compute, mem-ruby: Add support for writeback L2 in GPU (#1692)

Previously, GPU L2 caches could be configured in either writeback or
writethrough mode when used in an APU. However, in a CPU+dGPU system,
only writethrough worked. This is mainly because, in a CPU+dGPU system,
the CPU sends either PCI or SDMA requests to transfer data from the GPU
memory to the CPU. When the L2 cache is configured to be writeback, the
dirty data resides in the L2 while the CPU transfers data from GPU
memory. This leads to the wrong version being transferred. A similar
issue also crops up when the GPU command processor reads kernel
information before kernel dispatch, only to read incorrect data. This
PR contains a set of commits that fix both these issues.
This commit is contained in:
Vishnu Ramadas
2024-11-05 12:45:46 -06:00
committed by GitHub
parent 940f49b63b
commit d463868f28
11 changed files with 286 additions and 23 deletions

View File

@@ -364,14 +364,27 @@ AMDGPUDevice::readFrame(PacketPtr pkt, Addr offset)
* because this method is called by the PCIDevice::read method which
* is a non-timing read.
*/
RequestPtr req = std::make_shared<Request>(offset, pkt->getSize(), 0,
vramRequestorId());
PacketPtr readPkt = Packet::createRead(req);
RequestPtr req = std::make_shared<Request>(
offset, pkt->getSize(), 0, vramRequestorId());
PacketPtr readPkt = new Packet(req, MemCmd::ReadReq);
uint8_t *dataPtr = new uint8_t[pkt->getSize()];
readPkt->dataDynamic(dataPtr);
readPkt->req->setGPUFuncAccess(true);
readPkt->setSuppressFuncError();
cp->shader()->cuList[0]->memPort[0].sendFunctional(readPkt);
if (readPkt->cmd == MemCmd::FunctionalReadError) {
delete readPkt;
delete[] dataPtr;
RequestPtr req = std::make_shared<Request>(offset, pkt->getSize(), 0,
vramRequestorId());
PacketPtr readPkt = Packet::createRead(req);
uint8_t *dataPtr = new uint8_t[pkt->getSize()];
readPkt->dataDynamic(dataPtr);
auto system = cp->shader()->gpuCmdProc.system();
system->getDeviceMemory(readPkt)->access(readPkt);
auto system = cp->shader()->gpuCmdProc.system();
system->getDeviceMemory(readPkt)->access(readPkt);
}
pkt->setUintX(readPkt->getUintX(ByteOrder::little), ByteOrder::little);
delete readPkt;

View File

@@ -1062,7 +1062,18 @@ ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
* and doesn't have a wavefront or instruction associated with it.
*/
if (sender_state->wavefront != nullptr) {
computeUnit->handleSQCReturn(pkt);
RequestPtr req = pkt->req;
// If the sender state's isKernDispath is set, then the request came
// from the gpu command processor. The request fetches information
// that will be used in the kernel dispatch process. It should be
// handled in the gpu command processor. If the flag isn't set,
// then the request is an instruction fetch and can be handled in
// the compute unit
if (sender_state->isKernDispatch) {
computeUnit->shader->gpuCmdProc.completeTimingRead();
} else {
computeUnit->handleSQCReturn(pkt);
}
} else {
delete pkt->senderState;
delete pkt;

View File

@@ -685,11 +685,17 @@ class ComputeUnit : public ClockedObject
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
bool isKernDispatch;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
kernId(_kernId), isKernDispatch(false){ }
SenderState(Wavefront *_wavefront, bool _isKernDispatch,
Packet::SenderState *sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId), isKernDispatch(_isKernDispatch){ }
};
class MemReqEvent : public Event

View File

@@ -40,6 +40,7 @@
#include "debug/GPUInitAbi.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/shader.hh"
#include "mem/abstract_mem.hh"
@@ -100,6 +101,37 @@ GPUCommandProcessor::translate(Addr vaddr, Addr size)
1 /* vmid */, vaddr, size));
}
void
GPUCommandProcessor::performTimingRead(PacketPtr pkt)
{
// Use the shader to access the CUs and call the read request from
// the SQC port. Call submit kernel dispatch in the timing response
// function in receive timing response of SQC port. Schedule this
// timing read when...just currTick
ComputeUnit *cu = shader()->cuList[0];
pkt->senderState = new ComputeUnit::SQCPort::SenderState(
cu->wfList[0][0], true);
ComputeUnit::SQCPort::SenderState *sender_state =
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
ComputeUnit::SQCPort sqc_port = cu->sqcPort;
if (!sqc_port.sendTimingReq(pkt)) {
sqc_port.retries.push_back(
std::pair<PacketPtr, Wavefront*>(pkt,
sender_state->wavefront));
}
}
void
GPUCommandProcessor::completeTimingRead()
{
struct KernelDispatchData dispatchData = kernelDispatchList.front();
kernelDispatchList.pop_front();
delete dispatchData.readPkt;
if (kernelDispatchList.size() == 0)
dispatchKernelObject(dispatchData.akc, dispatchData.raw_pkt,
dispatchData.queue_id, dispatchData.host_pkt_addr);
}
/**
* submitDispatchPkt() is the entry point into the CP from the HSAPP
* and is only meant to be used with AQL kernel dispatch packets.
@@ -236,16 +268,20 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
RequestPtr request = std::make_shared<Request>(chunk_addr,
akc_alignment_granularity, flags,
walker->getDevRequestor());
Packet *readPkt = new Packet(request, MemCmd::ReadReq);
PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
readPkt->dataStatic((uint8_t *)akc + gen.complete());
// If the request spans two device memories, the device memory
// returned will be null.
assert(system()->getDeviceMemory(readPkt) != nullptr);
system()->getDeviceMemory(readPkt)->access(readPkt);
delete readPkt;
struct KernelDispatchData dispatchData;
dispatchData.akc = akc;
dispatchData.raw_pkt = raw_pkt;
dispatchData.queue_id = queue_id;
dispatchData.host_pkt_addr = host_pkt_addr;
dispatchData.readPkt = readPkt;
kernelDispatchList.push_back(dispatchData);
performTimingRead(readPkt);
}
dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
}
}
}

View File

@@ -85,12 +85,27 @@ class GPUCommandProcessor : public DmaVirtDevice
Shader* shader();
GPUComputeDriver* driver();
struct KernelDispatchData
{
AMDKernelCode *akc;
void *raw_pkt;
uint32_t queue_id;
Addr host_pkt_addr;
PacketPtr readPkt;
};
std::list<struct KernelDispatchData> kernelDispatchList;
enum AgentCmd
{
Nop = 0,
Steal = 1
};
void performTimingRead(PacketPtr pkt);
void completeTimingRead();
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
Addr host_pkt_addr);
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,

View File

@@ -473,6 +473,8 @@ class Request : public Extensible<Request>
/** The cause for HTM transaction abort */
HtmFailureFaultCause _htmAbortCause = HtmFailureFaultCause::INVALID;
bool _isGPUFuncAccess;
public:
/**
@@ -493,6 +495,7 @@ class Request : public Extensible<Request>
_flags.set(flags);
privateFlags.set(VALID_PADDR|VALID_SIZE);
_byteEnable = std::vector<bool>(size, true);
_isGPUFuncAccess = false;
}
Request(Addr vaddr, unsigned size, Flags flags,
@@ -502,6 +505,7 @@ class Request : public Extensible<Request>
setVirt(vaddr, size, flags, id, pc, std::move(atomic_op));
setContext(cid);
_byteEnable = std::vector<bool>(size, true);
_isGPUFuncAccess = false;
}
Request(const Request& other)
@@ -1124,6 +1128,17 @@ class Request : public Extensible<Request>
bool isCacheInvalidate() const { return _flags.isSet(INVALIDATE); }
bool isCacheMaintenance() const { return _flags.isSet(CLEAN|INVALIDATE); }
/** @} */
void
setGPUFuncAccess(bool flag) {
_isGPUFuncAccess = flag;
}
bool
getGPUFuncAccess()
{
return _isGPUFuncAccess;
}
};
} // namespace gem5

View File

@@ -73,6 +73,7 @@ machine(MachineType:TCC, "TCC Cache")
// Probes
PrbInv, desc="Invalidating probe";
InvCache, desc="Invalidating probe from TCP";
PrbDowngrade, desc="Downgrading probe";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
@@ -180,8 +181,11 @@ machine(MachineType:TCC, "TCC Cache")
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
Entry cache_entry := getCacheEntry(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else if (is_valid(cache_entry)) {
testAndRead(addr, cache_entry.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}
@@ -345,7 +349,14 @@ machine(MachineType:TCC, "TCC Cache")
DPRINTF(RubySlicc, "%s\n", in_msg);
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
if (in_msg.Type == ProbeRequestType:PrbInv) {
// Invalidate data and send it downstream
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
} else {
// If data present in cache, then downgrade it and send it
// downstream
trigger(Event:PrbDowngrade, in_msg.addr, cache_entry, tbe);
}
}
}
}
@@ -815,6 +826,28 @@ machine(MachineType:TCC, "TCC Cache")
}
}
action(pd_sendProbeResponseDowngrade, "pd", desc="send probe downgrade") {
enqueue(responseToNB_out, ResponseMsg, 1) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
out_msg.Sender := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
if (getState(tbe, cache_entry, address) == State:V || getState(tbe, cache_entry, address) == State:M || getState(tbe, cache_entry, address) == State:W) {
out_msg.Hit := true;
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.MessageSize := MessageSizeType:Response_Data;
} else {
out_msg.Hit := false;
out_msg.Dirty := false;
out_msg.MessageSize := MessageSizeType:Response_Control;
}
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
}
}
action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
enqueue(responseToNB_out, ResponseMsg, 1) {
out_msg.addr := address;
@@ -1212,6 +1245,25 @@ machine(MachineType:TCC, "TCC Cache")
p_popRequestQueue;
}
transition(I, PrbDowngrade) {TagArrayRead} {
pd_sendProbeResponseDowngrade;
pp_popProbeQueue;
}
transition(V, PrbDowngrade) {TagArrayRead} {
pd_sendProbeResponseDowngrade;
pp_popProbeQueue;
}
transition({M, W}, PrbDowngrade, V) {TagArrayRead, TagArrayWrite} {
pd_sendProbeResponseDowngrade;
pp_popProbeQueue;
}
transition({A, IV, WI, WIB}, PrbDowngrade) {TagArrayRead, TagArrayWrite} {
st_stallAndWaitRequest;
}
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;

View File

@@ -173,8 +173,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
Entry cache_entry := getCacheEntry(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else if (is_valid(cache_entry)) {
testAndRead(addr, cache_entry.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}

View File

@@ -84,7 +84,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
BL2, AccessPermission:Backing_Store, desc="Blocked checking for data in L2";
BL2_Pm, AccessPermission:Backing_Store, desc="Blocked waiting for probes, already got memory";
BL2_M, AccessPermission:Backing_Store, desc="Blocked waiting for memory";
F, AccessPermission:Busy, desc="sent Flus, blocked till ack";
}
@@ -105,6 +107,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// probe responses
CPUPrbResp, desc="Probe Response Msg";
CPUPrbRespWB, desc="Probe Response Msg and Data";
ProbeAcksComplete, desc="Probe Acks Complete";
@@ -121,6 +124,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// DMA
DmaRead, desc="DMA read";
DmaReadWB, desc="DMA read write back";
DmaWrite, desc="DMA write";
// Flush
@@ -300,7 +304,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
TBE tbe := TBEs.lookup(in_msg.LineAddress);
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.LineAddress));
if (in_msg.Type == DMARequestType:READ) {
trigger(Event:DmaRead, in_msg.LineAddress, entry, tbe);
if (L2isWB) {
trigger(Event:DmaReadWB, in_msg.LineAddress, entry, tbe);
} else {
trigger(Event:DmaRead, in_msg.LineAddress, entry, tbe);
}
} else if (in_msg.Type == DMARequestType:WRITE) {
trigger(Event:DmaWrite, in_msg.LineAddress, entry, tbe);
} else {
@@ -359,7 +367,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
TBE tbe := TBEs.lookup(in_msg.addr);
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
if (in_msg.Hit == true && L2isWB) {
trigger(Event:CPUPrbRespWB, in_msg.addr, entry, tbe);
} else {
trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
}
} else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
trigger(Event:StaleWB, in_msg.addr, entry, tbe);
} else {
@@ -825,6 +837,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
TCC_select_low_bit,
TCC_select_num_bits));
}
if (GPUonly && L2isWB) {
probe_dests.add(mapAddressToRange(address, MachineType:TCC,
TCC_select_low_bit,
TCC_select_num_bits));
}
}
probe_dests.remove(in_msg.Requestor);
@@ -1100,6 +1118,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
peek(memQueue_in, MemoryMsg) {
DPRINTF(RubySlicc, "%s\n", in_msg);
if (tbe.wtData == true) {
// Keep the write-through data based on mask, but use the memory block
// for the masked-off data. If we received a probe with data, the mask
@@ -1115,6 +1134,20 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
action(yw_writeProbeDataToTBEWB, "yw", desc="write Probe Data to TBE") {
peek(responseNetwork_in, ResponseMsg) {
DPRINTF(RubySlicc, "%s\n", in_msg);
if (tbe.Dirty == false) {
tbe.DataBlk := in_msg.DataBlk;
tbe.Dirty := in_msg.Dirty;
tbe.LastSender := in_msg.Sender;
tbe.Cached := true;
tbe.MemData := true;
}
}
}
action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
peek(responseNetwork_in, ResponseMsg) {
if (in_msg.Dirty) {
@@ -1315,22 +1348,29 @@ machine(MachineType:Directory, "AMD Baseline protocol")
*/
// TRANSITIONS
transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
transition({BL, BL2, BL2_Pm, BL2_M, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
st_stallAndWaitRequest;
}
// It may be possible to save multiple invalidations here!
transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) {
// It may be possible to save multiple invalidations here!
transition({BL, BL2, BL2_Pm, BL2_M, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) {
st_stallAndWaitRequest;
}
// The exit state is always going to be U, so wakeUpDependents logic should be covered in all the
// transitions which are flowing into U.
transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){
transition({BL, BL2, BL2_Pm, BL2_M, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead, DmaReadWB, DmaWrite}){
sd_stallAndWaitRequest;
}
// transitions from U
transition(U, DmaReadWB, BL2) {
atd_allocateTBEforDMA; // Allocate a TBE
qdr_queueDmaRdReq;
pr_profileL3HitMiss;
scd_probeShrCoreDataForDma; // Send probes to the Ruby Network
}
transition(U, DmaRead, BDR_PM) {L3TagArrayRead} {
atd_allocateTBEforDMA;
qdr_queueDmaRdReq;
@@ -1567,13 +1607,75 @@ machine(MachineType:Directory, "AMD Baseline protocol")
ptl_popTriggerQueue;
}
transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) {
transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP, BL2}, CPUPrbResp) {
y_writeProbeDataToTBE;
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbRespWB) {
y_writeProbeDataToTBE;
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition(BL2, L3Hit, BL2_Pm) {
ptl_popTriggerQueue;
}
transition({BL2, BL2_Pm}, CPUPrbRespWB, BL2_Pm) {
// Blocked on L2 and waiting for probes
yw_writeProbeDataToTBEWB;
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition(BL2_Pm, CPUPrbResp) {
// Blocked on L2 probes, got the memory
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition(BL2, ProbeAcksComplete, BL2_M) {
// We probed all the TCC dirs but didn't find the memory
// Send out memory request
// Transition to waiting on memory
pt_popTriggerQueue;
}
transition(BL2_Pm, ProbeAcksComplete, U) {
// We were waiting for all probes to come back now that they have we can unblock
// Send WBAck back to TCC
dd_sendResponseDmaData;
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
pd_popDmaRequestQueue;
pt_popTriggerQueue;
}
transition(BL2, MemData, BL2_Pm) {
mt_writeMemDataToTBE;
pm_popMemQueue;
}
transition({BL2_Pm, U}, MemData) {
pm_popMemQueue;
}
transition(BL2_M, MemData, U) {
// Got the memory we were waiting for. We can unblock now.
mt_writeMemDataToTBE;
dd_sendResponseDmaData;
wada_wakeUpAllDependentsAddr;
pd_popDmaRequestQueue;
dt_deallocateTBE;
pm_popMemQueue;
}
transition(BDR_PM, ProbeAcksComplete, BDR_M) {
pt_popTriggerQueue;
}

View File

@@ -393,6 +393,11 @@ RubyPort::MemResponsePort::recvFunctional(PacketPtr pkt)
{
DPRINTF(RubyPort, "Functional access for address: %#x\n", pkt->getAddr());
// In a CPU+dGPU system, GPU functional packets are injected into
// the CPU network. This happens because the requestorId is automatically
// set to that of the CPU network for these packets. Here, we set it
// to that of the GPU RubyPort so that it uses the right network to
// access GPU caches
RubySystem *rs = owner.m_ruby_system;
// Check for pio requests and directly send them to the dedicated
@@ -407,6 +412,10 @@ RubyPort::MemResponsePort::recvFunctional(PacketPtr pkt)
assert(pkt->getAddr() + pkt->getSize() <=
owner.makeLineAddress(pkt->getAddr()) + rs->getBlockSizeBytes());
if (pkt->req->getGPUFuncAccess()) {
pkt->req->requestorId(owner.m_controller->getRequestorId());
}
if (access_backing_store) {
// The attached physmem contains the official version of data.
// The following command performs the real functional access.

View File

@@ -560,7 +560,8 @@ RubySystem::functionalRead(PacketPtr pkt)
// it only if it's not in the cache hierarchy at all.
int num_controllers = netCntrls[request_net_id].size();
if (num_invalid == (num_controllers - 1) && num_backing_store == 1) {
DPRINTF(RubySystem, "only copy in Backing_Store memory, read from it\n");
DPRINTF(RubySystem,
"only copy in Backing_Store memory, read from it\n");
ctrl_backing_store->functionalRead(line_address, pkt);
return true;
} else if (num_ro > 0 || num_rw >= 1) {