dev-hsa,gpu-compute: Agent Packet handler implemented.
The HSA packet processor now accepts and processes agent dispatch packets.
The type field of the packet selects the command.
For now:
AgentCmd::Nop = 0
AgentCmd::Steal = 1
The Steal command steals the completion signal of a running kernel.
This enables a benchmark to use HSA primitives to send an agent
packet that steals the signal, and then to wait on that signal.
A minimal working example will be added to gem5-resources.
Change-Id: I37f8a4b7ea1780b471559aecbf4af1050353b0b1
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/37015
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
@@ -56,6 +56,18 @@ class HSADevice : public DmaDevice
|
||||
|
||||
HSAPacketProcessor& hsaPacketProc();
|
||||
|
||||
/**
|
||||
* submitAgentDispatchPkt() accepts AQL dispatch packets from the HSA
|
||||
* packet processor. Not all devices will accept AQL dispatch packets,
|
||||
* so the default implementation will fatal.
|
||||
* Implementation added to steal kernel signals.
|
||||
*/
|
||||
virtual void
|
||||
submitAgentDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
|
||||
{
|
||||
fatal("%s does not accept dispatch packets\n", name());
|
||||
}
|
||||
|
||||
/**
|
||||
* submitDispatchPkt() accepts AQL dispatch packets from the HSA packet
|
||||
* processor. Not all devices will accept AQL dispatch packets, so the
|
||||
|
||||
@@ -432,6 +432,14 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
|
||||
fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
|
||||
} else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
|
||||
fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
|
||||
} else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
|
||||
DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
|
||||
" active list ID = %d\n", __FUNCTION__, rl_idx);
|
||||
// Submit packet to HSA device (dispatcher)
|
||||
hsa_device->submitAgentDispatchPkt(
|
||||
(void *)disp_pkt, rl_idx, host_pkt_addr);
|
||||
is_submitted = UNBLOCKED;
|
||||
sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
|
||||
} else {
|
||||
fatal("Unsupported packet type %d\n", pkt_type);
|
||||
}
|
||||
@@ -700,3 +708,56 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
|
||||
// multi-process support
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
HSAPacketProcessor::sendAgentDispatchCompletionSignal(
|
||||
void *pkt, hsa_signal_value_t signal)
|
||||
{
|
||||
auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
|
||||
uint64_t signal_addr =
|
||||
(uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
|
||||
DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
|
||||
" completion signal: %x!\n", signal_addr);
|
||||
/**
|
||||
* HACK: The semantics of the HSA signal is to
|
||||
* decrement the current signal value.
|
||||
* I'm going to cheat here and read out
|
||||
* the value from main memory using functional
|
||||
* access, and then just DMA the decremented value.
|
||||
* The reason for this is that the DMASequencer does
|
||||
* not support atomic operations.
|
||||
*/
|
||||
VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
|
||||
|
||||
DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
|
||||
(uint64_t)sys->threads[0]->cpuId());
|
||||
|
||||
|
||||
hsa_signal_value_t *new_signal = new hsa_signal_value_t;
|
||||
*new_signal = (hsa_signal_value_t) *prev_signal - 1;
|
||||
|
||||
dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
|
||||
}
|
||||
|
||||
void
|
||||
HSAPacketProcessor::sendCompletionSignal(hsa_signal_value_t signal)
|
||||
{
|
||||
uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
|
||||
DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
|
||||
signal_addr);
|
||||
/**
|
||||
* HACK: The semantics of the HSA signal is to
|
||||
* decrement the current signal value.
|
||||
* I'm going to cheat here and read out
|
||||
* the value from main memory using functional
|
||||
* access, and then just DMA the decremented value.
|
||||
* The reason for this is that the DMASequencer does
|
||||
* not support atomic operations.
|
||||
*/
|
||||
VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
|
||||
|
||||
hsa_signal_value_t *new_signal = new hsa_signal_value_t;
|
||||
*new_signal = (hsa_signal_value_t) *prev_signal - 1;
|
||||
|
||||
dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
|
||||
}
|
||||
|
||||
@@ -329,6 +329,10 @@ class HSAPacketProcessor: public DmaDevice
|
||||
void schedAQLProcessing(uint32_t rl_idx);
|
||||
void schedAQLProcessing(uint32_t rl_idx, Tick delay);
|
||||
|
||||
void sendAgentDispatchCompletionSignal(void *pkt,
|
||||
hsa_signal_value_t signal);
|
||||
void sendCompletionSignal(hsa_signal_value_t signal);
|
||||
|
||||
class DepSignalsReadDmaEvent : public Event
|
||||
{
|
||||
protected:
|
||||
|
||||
@@ -71,6 +71,7 @@ Source('tlb_coalescer.cc')
|
||||
Source('vector_register_file.cc')
|
||||
Source('wavefront.cc')
|
||||
|
||||
DebugFlag('GPUAgentDisp')
|
||||
DebugFlag('GPUCoalescer')
|
||||
DebugFlag('GPUCommandProc')
|
||||
DebugFlag('GPUDriver')
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
|
||||
#include "debug/GPUAgentDisp.hh"
|
||||
#include "debug/GPUDisp.hh"
|
||||
#include "debug/GPUKernelInfo.hh"
|
||||
#include "debug/GPUWgLatency.hh"
|
||||
@@ -130,6 +131,8 @@ GPUDispatcher::dispatch(HSAQueueEntry *task)
|
||||
|
||||
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
|
||||
task->kernelName(), task->dispatchId());
|
||||
DPRINTF(GPUAgentDisp, "launching kernel: %s, dispatch ID: %d\n",
|
||||
task->kernelName(), task->dispatchId());
|
||||
|
||||
execIds.push(task->dispatchId());
|
||||
dispatchActive = true;
|
||||
@@ -144,6 +147,7 @@ void
|
||||
GPUDispatcher::exec()
|
||||
{
|
||||
int fail_count(0);
|
||||
int disp_count(0);
|
||||
|
||||
/**
|
||||
* There are potentially multiple outstanding kernel launches.
|
||||
@@ -151,6 +155,7 @@ GPUDispatcher::exec()
|
||||
* can fit on the GPU even if another kernel's workgroups cannot
|
||||
*/
|
||||
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
|
||||
DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
|
||||
|
||||
if (execIds.size() > 0) {
|
||||
++cyclesWaitingForDispatch;
|
||||
@@ -204,7 +209,7 @@ GPUDispatcher::exec()
|
||||
/**
|
||||
* if we failed try the next kernel,
|
||||
* it may have smaller workgroups.
|
||||
* put it on the queue to rety latter
|
||||
* put it on the queue to retry later
|
||||
*/
|
||||
DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
|
||||
execIds.push(exec_id);
|
||||
@@ -212,6 +217,7 @@ GPUDispatcher::exec()
|
||||
break;
|
||||
} else if (!launched) {
|
||||
launched = true;
|
||||
disp_count++;
|
||||
DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
|
||||
}
|
||||
}
|
||||
@@ -221,6 +227,8 @@ GPUDispatcher::exec()
|
||||
}
|
||||
|
||||
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
|
||||
DPRINTF(GPUWgLatency, "Kernel Wgs dispatched: %d | %d failures\n",
|
||||
disp_count, fail_count);
|
||||
|
||||
while (doneIds.size()) {
|
||||
DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
|
||||
|
||||
@@ -93,6 +93,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
|
||||
"kernel object\n", akc.kernel_code_entry_byte_offset);
|
||||
|
||||
DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
|
||||
(uint64_t)tc->cpuId());
|
||||
|
||||
|
||||
Addr machine_code_addr = (Addr)disp_pkt->kernel_object
|
||||
+ akc.kernel_code_entry_byte_offset;
|
||||
|
||||
@@ -166,6 +170,54 @@ GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
|
||||
hsaPP->finishPkt(raw_pkt, queue_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* submitAgentDispatchPkt() is for accepting agent dispatch packets.
|
||||
* These packets will control the dispatch of Wg on the device, and inform
|
||||
* the host when a specified number of Wg have been executed on the device.
|
||||
*
|
||||
* For now it simply finishes the pkt.
|
||||
*/
|
||||
void
|
||||
GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr)
|
||||
{
|
||||
//Parse the Packet, see what it wants us to do
|
||||
_hsa_agent_dispatch_packet_t * agent_pkt =
|
||||
(_hsa_agent_dispatch_packet_t *)raw_pkt;
|
||||
|
||||
if (agent_pkt->type == AgentCmd::Nop) {
|
||||
DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
|
||||
} else if (agent_pkt->type == AgentCmd::Steal) {
|
||||
//This is where we steal the HSA Task's completion signal
|
||||
int kid = agent_pkt->arg[0];
|
||||
DPRINTF(GPUCommandProc,
|
||||
"Agent Dispatch Packet Stealing signal handle for kernel %d\n",
|
||||
kid);
|
||||
|
||||
HSAQueueEntry *task = dispatcher.hsaTask(kid);
|
||||
uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
|
||||
|
||||
uint64_t return_address = agent_pkt->return_address;
|
||||
DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
|
||||
//*return_address = signal_addr;
|
||||
Addr *new_signal_addr = new Addr;
|
||||
*new_signal_addr = (Addr)signal_addr;
|
||||
dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
|
||||
|
||||
DPRINTF(GPUCommandProc,
|
||||
"Agent Dispatch Packet Stealing signal handle from kid %d :" \
|
||||
"(%x:%x) writing into %x\n",
|
||||
kid,signal_addr,new_signal_addr,return_address);
|
||||
|
||||
} else
|
||||
{
|
||||
panic("The agent dispatch packet provided an unknown argument in" \
|
||||
"arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
|
||||
}
|
||||
|
||||
hsaPP->finishPkt(raw_pkt, queue_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Once the CP has finished extracting all relevant information about
|
||||
* a task and has initialized the ABI state, we send a description of
|
||||
|
||||
@@ -65,6 +65,13 @@ class GPUCommandProcessor : public HSADevice
|
||||
void setShader(Shader *shader);
|
||||
Shader* shader();
|
||||
|
||||
// Commands carried in the type field of an agent dispatch packet.
enum AgentCmd
{
    Nop = 0,    // no operation
    Steal = 1   // hand a kernel's completion signal to the requester
};
|
||||
|
||||
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr) override;
|
||||
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr) override;
|
||||
void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include "arch/x86/isa_traits.hh"
|
||||
#include "arch/x86/linux/linux.hh"
|
||||
#include "base/chunk_generator.hh"
|
||||
#include "debug/GPUAgentDisp.hh"
|
||||
#include "debug/GPUDisp.hh"
|
||||
#include "debug/GPUMem.hh"
|
||||
#include "debug/GPUShader.hh"
|
||||
@@ -231,6 +232,7 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
|
||||
bool scheduledSomething = false;
|
||||
int cuCount = 0;
|
||||
int curCu = nextSchedCu;
|
||||
int disp_count(0);
|
||||
|
||||
while (cuCount < n_cu) {
|
||||
//Every time we try a CU, update nextSchedCu
|
||||
@@ -245,6 +247,8 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
|
||||
scheduledSomething = true;
|
||||
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
|
||||
curCu, task->globalWgId());
|
||||
DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
|
||||
curCu, task->globalWgId());
|
||||
DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
|
||||
curTick(), task->globalWgId(), curCu);
|
||||
|
||||
@@ -259,12 +263,15 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
|
||||
cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
|
||||
|
||||
task->markWgDispatch();
|
||||
++disp_count;
|
||||
}
|
||||
|
||||
++cuCount;
|
||||
curCu = nextSchedCu;
|
||||
}
|
||||
|
||||
DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
|
||||
|
||||
return scheduledSomething;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user