dev-amdgpu: Handle GPU atomics on host memory addresses

It is possible to execute a GPU atomic instruction using a memory
address that is in the host memory space (e.g., HMM, __managed__,
hipHostMalloc'd address). Since these are in host memory they are passed
to the SystemHub DmaDevice. However, this currently executes as a write
packet without modifying data. This leads to hangs in applications that
use atomics for forward progress (e.g., HeteroSync).

It is not clear where these are handled on a real GPU, but they are
certainly handled by neither the software stack nor the driver, so they
must be handled in hardware and therefore implemented in gem5. Handling
for atomics in the SystemHub makes the most sense.

To make atomics work a few extra changes need to be made to the
SystemHub. (1) The atomic is implemented as a host memory read, followed
by calling the AtomicOpFunctor, followed by a write. This requires a
second event to handle read response, performing atomic, and issuing a
write. (2) Atomics must be serialized; otherwise two atomics might return
the same value, which is incorrect. This patch adds serialization logic
for all request types to the same address to handle this. (3) With the
added complexity of the SystemHub, a new debug flag explicitly for
SystemHub is added.

Testing done: The heterosync application with input "sleepMutex 10 16 4"
hung before this patch. It passes with the patch applied.
This application tests both (1) and (2) above, as it allocates locks
with hipHostMalloc and has multiple workgroups sending an atomic request
in the same Tick, verifying the serialization mechanism.

Change-Id: Ife84b30037d1447dd384340cfeb06fdfd472fff9
This commit is contained in:
Matthew Poremba
2023-09-19 13:22:35 -05:00
parent 3bdcfd6f7a
commit 63cabf2848
3 changed files with 162 additions and 8 deletions

View File

@@ -50,6 +50,7 @@ Source('system_hub.cc', tags='x86 isa')
DebugFlag('AMDGPUDevice', tags='x86 isa')
DebugFlag('AMDGPUMem', tags='x86 isa')
DebugFlag('AMDGPUSystemHub', tags='x86 isa')
DebugFlag('PM4PacketProcessor', tags='x86 isa')
DebugFlag('SDMAEngine', tags='x86 isa')
DebugFlag('SDMAData', tags='x86 isa')

View File

@@ -31,6 +31,8 @@
#include "dev/amdgpu/system_hub.hh"
#include "debug/AMDGPUSystemHub.hh"
#include "mem/packet_access.hh"
#include "mem/port.hh"
namespace gem5
@@ -39,16 +41,92 @@ namespace gem5
/**
 * Queue a host-memory request and issue it right away when no other
 * request to the same address is outstanding.
 *
 * Every request is appended to the per-address list in outstandingReqs.
 * Only the request at the head of a list is issued here; later ones wait
 * in the list.
 *
 * @param pkt Packet describing the access. Its address selects the list.
 * @param callback Event stored alongside the packet for the requester.
 */
void
AMDGPUSystemHub::sendRequest(PacketPtr pkt, Event *callback)
{
    // Some requests, in particular atomics, need to be sent in order
    // to receive the correct values. If there is an atomic in progress
    // we must block it until that request is complete. This is overly
    // conservative and blocks reads/writes but this situation is rare
    // so it should not impact simulated performance.
    DeferredReq this_req(pkt, callback);

    // Look the list up once; operator[] creates it on first use.
    DeferredReqList &pending = outstandingReqs[pkt->getAddr()];
    pending.push_back(this_req);

    if (pending.size() > 1) {
        // There is another request in progress, Delay this one.
        DPRINTF(AMDGPUSystemHub, "SystemHub deferring request for %#lx\n",
                pkt->getAddr());
    } else {
        // No other requests, we can send immediately.
        sendDeferredRequest(this_req);
    }
}
/**
 * Issue the DMA for a request that is at the head of its address list.
 *
 * Atomics are tested first and start with a DMA read whose completion is
 * handled by an AtomicResponseEvent. Writes and reads issue a single DMA
 * whose completion is handled by a ResponseEvent.
 *
 * @param deferredReq Pair of (packet, requester callback) to issue.
 */
void
AMDGPUSystemHub::sendDeferredRequest(DeferredReq& deferredReq)
{
    PacketPtr pkt = deferredReq.first;
    Event *callback = deferredReq.second;
    Tick delay = 0;
    const char *req_type = nullptr;

    if (pkt->isAtomicOp()) {
        AtomicResponseEvent *atomicRespEvent =
            new AtomicResponseEvent(*this, callback, pkt);

        // First read the value. The response event will do the atomic/write
        // This places the current value in the packet, which is correct since
        // atomics return the value prior to performing the atomic.
        dmaRead(pkt->getAddr(), pkt->getSize(), atomicRespEvent,
                pkt->getPtr<uint8_t>(), 0, 0, delay);

        req_type = "Atomic";
    } else if (pkt->isWrite()) {
        ResponseEvent *dmaRespEvent =
            new ResponseEvent(*this, callback, pkt);

        dmaWrite(pkt->getAddr(), pkt->getSize(), dmaRespEvent,
                 pkt->getPtr<uint8_t>(), 0, 0, delay);

        req_type = "Write";
    } else {
        ResponseEvent *dmaRespEvent =
            new ResponseEvent(*this, callback, pkt);

        // Anything that is neither an atomic nor a write must be a read.
        assert(pkt->isRead());
        dmaRead(pkt->getAddr(), pkt->getSize(), dmaRespEvent,
                pkt->getPtr<uint8_t>(), 0, 0, delay);

        req_type = "Read";
    }

    DPRINTF(AMDGPUSystemHub, "SystemHub %s request for %#lx size %d\n",
            req_type, pkt->getAddr(), pkt->getSize());
}
/**
 * Retire the request at the head of an address list and, if another
 * request is waiting behind it, issue that one.
 *
 * @param addr Address whose list is advanced.
 * @param donePkt Packet that just completed; asserted to be the list head.
 */
void
AMDGPUSystemHub::sendNextRequest(Addr addr, const PacketPtr donePkt)
{
    // The completed request must still be recorded at the head of its list.
    assert(outstandingReqs.count(addr));
    DeferredReqList &pending = outstandingReqs[addr];

    [[maybe_unused]] const DeferredReq &finished = pending.front();
    assert(finished.first == donePkt);
    pending.pop_front();

    if (!pending.empty()) {
        // Someone was waiting on this address: hand the head of the list
        // over to be issued.
        DeferredReq &upcoming = pending.front();

        DPRINTF(AMDGPUSystemHub, "SystemHub sending deferred request for addr"
                " %#lx size %d\n", upcoming.first->getAddr(),
                upcoming.first->getSize());

        sendDeferredRequest(upcoming);
        return;
    }

    // Nothing left for this address, so drop its entry from the map.
    DPRINTF(AMDGPUSystemHub, "SystemHub done with packets for addr %#lx\n",
            donePkt->getAddr());

    outstandingReqs.erase(addr);
}
@@ -57,8 +135,9 @@ AMDGPUSystemHub::dmaResponse(PacketPtr pkt)
{
}
AMDGPUSystemHub::ResponseEvent::ResponseEvent(Event *_callback)
: callback(_callback)
AMDGPUSystemHub::ResponseEvent::ResponseEvent(
AMDGPUSystemHub& _hub, Event *_callback, PacketPtr _pkt)
: systemHub(_hub), callback(_callback), pkt(_pkt)
{
// Delete this event after process is called
setFlags(Event::AutoDelete);
@@ -67,9 +146,62 @@ AMDGPUSystemHub::ResponseEvent::ResponseEvent(Event *_callback)
void
AMDGPUSystemHub::ResponseEvent::process()
{
DPRINTF(AMDGPUSystemHub, "SystemHub response for addr %#lx size %d\n",
pkt->getAddr(), pkt->getSize());
systemHub.sendNextRequest(pkt->getAddr(), pkt);
callback->process();
}
/**
 * Build the event that handles the read response of an atomic.
 *
 * @param _hub System hub used to issue the follow-up write.
 * @param _callback Requester's callback, passed on to the write response.
 * @param _pkt Original atomic packet.
 */
AMDGPUSystemHub::AtomicResponseEvent::AtomicResponseEvent(
    AMDGPUSystemHub& _hub, Event *_callback, PacketPtr _pkt)
    : systemHub(_hub),
      callback(_callback),
      pkt(_pkt)
{
    // The event is one-shot: have it deleted once process() has run.
    setFlags(Event::AutoDelete);
}
/**
 * Second half of an atomic: runs once the read of the old value has
 * completed.
 *
 * Copies the value that was read into a fresh write packet, applies the
 * original packet's atomic functor to that copy, and issues a DMA write
 * of the result. The requester's callback is attached to the write's
 * ResponseEvent, so it runs only after the write completes.
 */
void
AMDGPUSystemHub::AtomicResponseEvent::process()
{
    // Make a second response with the original sender's callback
    ResponseEvent *dmaRespEvent = new ResponseEvent(systemHub, callback, pkt);
    Tick delay = 0;

    // Create a new write packet which will be modified then written
    RequestPtr write_req =
        std::make_shared<Request>(pkt->getAddr(), pkt->getSize(), 0,
                                  pkt->requestorId());

    // NOTE(review): write_pkt is never deleted in this function. dmaWrite
    // is handed a pointer into its buffer, so presumably it has to stay
    // alive until the DMA finishes -- confirm where it is released.
    PacketPtr write_pkt = Packet::createWrite(write_req);
    uint8_t *write_data = new uint8_t[pkt->getSize()];
    std::memcpy(write_data, pkt->getPtr<uint8_t>(), pkt->getSize());
    write_pkt->dataDynamic(write_data);

    // Perform the atomic on the write packet data. The atomic op is not
    // copied from the original packet, so use the original packet.
    assert(pkt->isAtomicOp());
    (*pkt->getAtomicOp())(write_pkt->getPtr<uint8_t>());

    // Write back the new value. The atomic is not considered done until
    // this packet's response event is triggered.
    systemHub.dmaWrite(write_pkt->getAddr(), write_pkt->getSize(),
        dmaRespEvent, write_pkt->getPtr<uint8_t>(), 0, 0, delay);

    // Atomics from the GPU are at most 64-bit and usually 32-bit.
    // We can take a peek at the data for debugging purposes. Both size
    // checks look at the write packet, which is the one being decoded;
    // any other size leaves the placeholder value in place.
    [[maybe_unused]] uint64_t req_data = 0x12345678;
    const auto write_size = write_pkt->getSize();
    if (write_size == 8) {
        req_data = write_pkt->getLE<uint64_t>();
    } else if (write_size == 4) {
        req_data = write_pkt->getLE<uint32_t>();
    }

    DPRINTF(AMDGPUSystemHub, "SystemHub atomic %#lx writing %lx size %d\n",
            write_pkt->getAddr(), req_data, write_pkt->getSize());
}
AddrRangeList
AMDGPUSystemHub::getAddrRanges() const
{

View File

@@ -63,16 +63,37 @@ class AMDGPUSystemHub : public DmaDevice
AddrRangeList getAddrRanges() const override;
private:
// A queued request: the packet plus the requester's completion callback.
typedef std::pair<PacketPtr, Event*> DeferredReq;
// Requests for one address, oldest first (appended at the back, retired
// from the front).
typedef std::list<DeferredReq> DeferredReqList;
// Per-address lists of requests that have been accepted but not retired.
std::unordered_map<Addr, DeferredReqList> outstandingReqs;
// Retire donePkt from the list for addr and issue the next request, if any.
void sendNextRequest(Addr addr, const PacketPtr donePkt);
// Issue the DMA for the given queued request.
void sendDeferredRequest(DeferredReq& deferredReq);
class ResponseEvent : public Event
{
Event *callback;
AMDGPUSystemHub &systemHub;
Event *callback;
PacketPtr pkt;
public:
ResponseEvent(Event *_callback);
public:
ResponseEvent(AMDGPUSystemHub& _hub,
Event *_callback, PacketPtr _pkt);
void process();
};
class AtomicResponseEvent : public Event
{
AMDGPUSystemHub &systemHub;
Event *callback;
PacketPtr pkt;
public:
AtomicResponseEvent(AMDGPUSystemHub& _hub,
Event *_callback, PacketPtr _pkt);
void process();
};
};