gpu-compute: Support Scalar and Vector access to system pages

The amdgpu driver supports reading and writing scalar and vector memory
addresses that reside in system memory. This is commonly used for things
like blit kernels that perform host-to-device or device-to-host copies
using GPU load/store instructions.

This is done by utilizing the system hub device added in a prior
changeset. Memory packets translated by the Scalar or VMEM TLBs will
have the corresponding system request field set from the PTE in the TLB
which can be used in the compute unit to determine if a request is for
system memory or not.

Another important change is to return global memory tokens for system
requests. Since these do not flow through the GPU coalescer where the
token is returned, the token can be returned once the request is known
to be a system request.

Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/57711
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2022-03-15 13:34:58 -05:00
parent 347364ab0f
commit f375e79bcf
4 changed files with 125 additions and 9 deletions

View File

@@ -112,6 +112,12 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
scheduleToExecute(p),
stats(this, p.n_wf)
{
// This is not currently supported and would require adding more handling
// for system vs. device memory requests on the functional paths, so we
// fatal immediately in the constructor if this configuration is seen.
fatal_if(functionalTLB && FullSystem,
"Functional TLB not supported in full-system GPU simulation");
/**
* This check is necessary because std::bitset only provides conversion
* to unsigned long or unsigned long long via to_ulong() or to_ullong().
@@ -800,6 +806,12 @@ ComputeUnit::init()
// Timing-response entry point for the vector data port. All of the work is
// delegated to handleResponse() so that the same response handling can also
// be invoked directly with a packet (e.g. from an event) without going
// through this callback.
bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
return handleResponse(pkt);
}
bool
ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
{
// Ruby has completed the memory op. Schedule the mem_resp_event at the
// appropriate cycle to process the timing memory response
@@ -901,6 +913,12 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
// Timing-response entry point for the scalar data port. All of the work is
// delegated to handleResponse() so that the same response handling can also
// be invoked directly with a packet (e.g. from an event) without going
// through this callback.
bool
ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
{
return handleResponse(pkt);
}
bool
ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
{
assert(!pkt->req->isKernel());
@@ -1241,9 +1259,13 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
assert(gpuDynInst->isGlobalSeg() ||
gpuDynInst->executedAs() == enums::SC_GLOBAL);
// Fences will never be issued to system memory, so we can mark the
// requestor as a device memory ID here.
if (!req) {
req = std::make_shared<Request>(
0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
} else {
req->requestorId(vramRequestorId());
}
// all mem sync requests have Paddr == 0
@@ -1544,6 +1566,24 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
nullptr);
// Set VRAM ID for device requests
// For now, system vmem requests use functional reads. This is not that
// critical to model as the region of interest should always be accessing
// device memory. System vmem requests are used by blit kernels to do
// memcpys and load code objects into device memory.
if (new_pkt->req->systemReq()) {
// There will be multiple packets returned for the same gpuDynInst,
// so first check if systemReq is not already set and if so, return
// the token acquired when the dispatch list is filled as system
// requests do not require a GPU coalescer token.
if (!gpuDynInst->isSystemReq()) {
computeUnit->getTokenManager()->recvTokens(1);
gpuDynInst->setSystemReq();
}
} else {
new_pkt->req->requestorId(computeUnit->vramRequestorId());
}
// translation is done. Schedule the mem_req_event at the appropriate
// cycle to send the timing memory request to ruby
EventFunctionWrapper *mem_req_event =
@@ -1582,7 +1622,11 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
[[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
if (!(sendTimingReq(pkt))) {
if (pkt->req->systemReq()) {
assert(compute_unit->shader->systemHub);
SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
} else if (!(sendTimingReq(pkt))) {
retries.push_back(std::make_pair(pkt, gpuDynInst));
DPRINTF(GPUPort,
@@ -1611,7 +1655,11 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process()
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
[[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
if (!(scalarDataPort.sendTimingReq(pkt))) {
if (pkt->req->systemReq()) {
assert(compute_unit->shader->systemHub);
SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
} else if (!(scalarDataPort.sendTimingReq(pkt))) {
scalarDataPort.retries.push_back(pkt);
DPRINTF(GPUPort,
@@ -1712,15 +1760,26 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
req_pkt->senderState =
new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
computeUnit->scalarDataPort.retries.push_back(req_pkt);
DPRINTF(GPUMem, "send scalar req failed for: %s\n",
gpuDynInst->disassemble());
// For a system request we want to mark the GPU instruction as a system
// load/store so that after the request is issued to system memory we can
// return any token acquired for the request. Since tokens are returned
// by the coalescer and system requests do not take that path, this needs
// to be tracked.
//
// Device requests change the requestor ID to something in the device
// memory Ruby network.
if (req_pkt->req->systemReq()) {
gpuDynInst->setSystemReq();
} else {
DPRINTF(GPUMem, "send scalar req for: %s\n",
gpuDynInst->disassemble());
req_pkt->req->requestorId(computeUnit->vramRequestorId());
}
ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
= new ComputeUnit::ScalarDataPort::MemReqEvent
(computeUnit->scalarDataPort, req_pkt);
computeUnit->schedule(scalar_mem_req_event, curTick() +
computeUnit->req_tick_latency);
return true;
}

View File

@@ -529,6 +529,28 @@ class ComputeUnit : public ClockedObject
saved(sender_state) { }
};
/**
 * Event that is handed to the system hub together with a vector memory
 * request targeting system memory. When the event is processed, the
 * original request packet is turned into a response and fed back into
 * this data port's normal response handling.
 *
 * NOTE(review): presumably the system hub schedules this event once the
 * request has completed -- confirm against the system hub implementation.
 */
class SystemHubEvent : public Event
{
// Port whose handleResponse() receives the completed packet.
DataPort *dataPort;
// The request packet that was sent to the system hub.
PacketPtr reqPkt;
public:
/**
 * @param pkt the request packet being sent to the system hub
 * @param _dataPort the data port that should receive the response
 */
SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
: dataPort(_dataPort), reqPkt(pkt)
{
// The event is flagged AutoDelete; presumably this lets the event
// framework free it after processing, as creators do not keep a
// reference to it -- TODO confirm.
setFlags(Event::AutoDelete);
}
/** Convert the stored request into a response and deliver it. */
void
process()
{
// DMAs do not operate on packets and therefore do not
// convert to a response. Do that here instead.
reqPkt->makeResponse();
dataPort->handleResponse(reqPkt);
}
};
void processMemReqEvent(PacketPtr pkt);
EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
@@ -537,6 +559,8 @@ class ComputeUnit : public ClockedObject
std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
bool handleResponse(PacketPtr pkt);
protected:
ComputeUnit *computeUnit;
@@ -596,6 +620,30 @@ class ComputeUnit : public ClockedObject
const char *description() const;
};
/**
 * Event that is handed to the system hub together with a scalar memory
 * request targeting system memory. When the event is processed, the
 * original request packet is turned into a response and fed back into
 * this scalar data port's normal response handling.
 *
 * NOTE(review): presumably the system hub schedules this event once the
 * request has completed -- confirm against the system hub implementation.
 */
class SystemHubEvent : public Event
{
// Port whose handleResponse() receives the completed packet.
ScalarDataPort *dataPort;
// The request packet that was sent to the system hub.
PacketPtr reqPkt;
public:
/**
 * @param pkt the request packet being sent to the system hub
 * @param _dataPort the scalar data port that should receive the response
 */
SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
: dataPort(_dataPort), reqPkt(pkt)
{
// The event is flagged AutoDelete; presumably this lets the event
// framework free it after processing, as creators do not keep a
// reference to it -- TODO confirm.
setFlags(Event::AutoDelete);
}
/** Convert the stored request into a response and deliver it. */
void
process()
{
// DMAs do not operate on packets and therefore do not
// convert to a response. Do that here instead.
reqPkt->makeResponse();
dataPort->handleResponse(reqPkt);
}
};
bool handleResponse(PacketPtr pkt);
std::deque<PacketPtr> retries;
private:

View File

@@ -62,6 +62,10 @@ GlobalMemPipeline::init()
bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
// System requests do not need GPU coalescer tokens. Make sure nothing
// has bypassed the operand gather check stage.
assert(!mp->isSystemReq());
// We require one token from the coalescer's uncoalesced table to
// proceed
int token_count = 1;

View File

@@ -476,11 +476,16 @@ class GPUDynInst : public GPUExecContext
// inst used to save/restore a wavefront context
bool isSaveRestore;
// Returns true once setSystemReq() has been called on this instruction,
// i.e. the instruction has been marked as accessing system memory.
bool isSystemReq() { return systemReq; }
// Marks this instruction as a system memory request. The flag is only ever
// set here; there is no corresponding way to clear it.
void setSystemReq() { systemReq = true; }
private:
GPUStaticInst *_staticInst;
const InstSeqNum _seqNum;
int maxSrcVecRegOpSize;
int maxSrcScalarRegOpSize;
bool systemReq = false;
// the time the request was started
Tick accessTime = -1;