Files
gem5/src/gpu-compute/gpu_command_processor.cc
Matthew Poremba ebd5b3e4ae gpu-compute: Gfx version check for FS and SE mode
There is no GPU device in SE mode to get version from and no GPU driver
in FS mode to get version from, so a conditional needs to be added
depending on the mode to get the gfx version.

Change-Id: I33fdafb60d351ebc5148e2248244537fb5bebd31
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/71078
Tested-by: kokoro <noreply+kokoro@google.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
2023-06-01 00:15:02 +00:00

466 lines
17 KiB
C++

/*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/gpu_command_processor.hh"
#include <cassert>
#include "arch/amdgpu/vega/pagetable_walker.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "gpu-compute/dispatcher.hh"
#include "mem/abstract_mem.hh"
#include "mem/packet_access.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUCommandProcessor.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/syscall_emul_buf.hh"
namespace gem5
{
/**
 * Construct the command processor from its parameters.
 *
 * The HSA packet processor and the dispatcher named in the parameters
 * are each handed a pointer back to this object. No driver is attached
 * at construction time.
 */
GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : DmaVirtDevice(p),
      dispatcher(*p.dispatcher),
      _driver(nullptr),
      walker(p.walker),
      hsaPP(p.hsapp)
{
    // The command processor cannot operate without a packet processor.
    assert(hsaPP != nullptr);

    hsaPP->setDevice(this);
    dispatcher.setCommandProcessor(this);
}
/**
 * Accessor for the HSA packet processor that was supplied at
 * construction.
 */
HSAPacketProcessor &
GPUCommandProcessor::hsaPacketProc()
{
    HSAPacketProcessor &packet_proc = *hsaPP;
    return packet_proc;
}
/**
 * Report the VRAM requestor ID needed for device memory accesses. The
 * value is obtained from the GPU device set through setGPUDevice().
 */
RequestorID
GPUCommandProcessor::vramRequestorId()
{
    const RequestorID vram_id = gpuDevice->vramRequestorId();
    return vram_id;
}
/**
 * Create a translation generator for a virtual address range.
 *
 * @param vaddr first virtual address of the range
 * @param size  size of the range
 * @return a generator backed by the GPU VM page tables in full system
 *         mode, or by the CPU process page table otherwise
 */
TranslationGenPtr
GPUCommandProcessor::translate(Addr vaddr, Addr size)
{
    if (FullSystem) {
        // Full system relies on the page tables set up by the kernel
        // driver instead of the CPU page tables.
        return TranslationGenPtr(
            new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker,
                                             1 /* vmid */, vaddr, size));
    }

    // Syscall emulation: translate through the process attached to
    // thread context zero. With new extensions, arbitrarily picking
    // context zero will likely be wrong.
    auto proc = sys->threads[0]->getProcessPtr();
    return proc->pTable->translateRange(vaddr, size);
}
/**
* submitDispatchPkt() is the entry point into the CP from the HSAPP
* and is only meant to be used with AQL kernel dispatch packets.
* After the HSAPP receives and extracts an AQL packet, it sends
* it to the CP, which is responsible for gathering all relevant
* information about a task, initializing CU state, and sending
* it to the dispatcher for WG creation and dispatch.
*
* First we need capture all information from the the AQL pkt and
* the code object, then store it in an HSAQueueEntry. Once the
* packet and code are extracted, we extract information from the
* queue descriptor that the CP needs to perform state initialization
* on the CU. Finally we call dispatch() to send the task to the
* dispatcher. When the task completely finishes, we call finishPkt()
* on the HSA packet processor in order to remove the packet from the
* queue, and notify the runtime that the task has completed.
*/
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
Addr host_pkt_addr)
{
static int dynamic_task_id = 0;
_hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
/**
* we need to read a pointer in the application's address
* space to pull out the kernel code descriptor.
*/
auto *tc = sys->threads[0];
TranslatingPortProxy fs_proxy(tc);
SETranslatingPortProxy se_proxy(tc);
PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
/**
* In full system mode, the page table entry may point to a system page
* or a device page. System pages use the proxy as normal, but a device
* page needs to be read from device memory. Check what type it is here.
*/
bool is_system_page = true;
Addr phys_addr = disp_pkt->kernel_object;
if (FullSystem) {
/**
* Full system currently only supports running on single VMID (one
* virtual memory space), i.e., one application running on GPU at a
* time. Because of this, for now we know the VMID is always 1. Later
* the VMID would have to be passed on to the command processor.
*/
int vmid = 1;
unsigned tmp_bytes;
walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
phys_addr, tmp_bytes, BaseMMU::Mode::Read,
is_system_page);
}
DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
is_system_page);
/**
* The kernel_object is a pointer to the machine code, whose entry
* point is an 'amd_kernel_code_t' type, which is included in the
* kernel binary, and describes various aspects of the kernel. The
* desired entry is the 'kernel_code_entry_byte_offset' field,
* which provides the byte offset (positive or negative) from the
* address of the amd_kernel_code_t to the start of the machine
* instructions.
*/
AMDKernelCode akc;
if (is_system_page) {
DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
sizeof(AMDKernelCode));
} else {
assert(FullSystem);
DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
// Read from GPU memory manager one cache line at a time to prevent
// rare cases where the AKC spans two memory pages.
ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
system()->cacheLineSize());
for (; !gen.done(); gen.next()) {
Addr chunk_addr = gen.addr();
int vmid = 1;
unsigned dummy;
walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
chunk_addr, dummy, BaseMMU::Mode::Read,
is_system_page);
Request::Flags flags = Request::PHYSICAL;
RequestPtr request = std::make_shared<Request>(chunk_addr,
system()->cacheLineSize(), flags, walker->getDevRequestor());
Packet *readPkt = new Packet(request, MemCmd::ReadReq);
readPkt->dataStatic((uint8_t *)&akc + gen.complete());
system()->getDeviceMemory(readPkt)->access(readPkt);
delete readPkt;
}
}
DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
"kernel object\n", akc.kernel_code_entry_byte_offset);
DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
(uint64_t)tc->cpuId());
Addr machine_code_addr = (Addr)disp_pkt->kernel_object
+ akc.kernel_code_entry_byte_offset;
DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
machine_code_addr);
std::string kernel_name;
/**
* BLIT kernels don't have symbol names. BLIT kernels are built-in compute
* kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
* hardware engines are unavailable or explicitly disabled. They can also
* be used to do copies that ROCm things would be better performed
* by the shader than the SDMA engines. They are also sometimes used on
* APUs to implement asynchronous memcopy operations from 2 pointers in
* host memory. I have no idea what BLIT stands for.
* */
if (akc.runtime_loader_kernel_symbol) {
kernel_name = "Some kernel";
} else {
kernel_name = "Blit kernel";
}
DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
: driver()->getGfxVersion();
HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
gfxVersion);
DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
"grid size (%dx%dx%d) kernarg addr: %#x, completion "
"signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
disp_pkt->grid_size_x, disp_pkt->grid_size_y,
disp_pkt->grid_size_z, disp_pkt->kernarg_address,
disp_pkt->completion_signal);
DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
"num scalar regs: %d, code addr: %#x, kernarg size: %d, "
"LDS size: %d)\n", kernel_name, task->numVectorRegs(),
task->numScalarRegs(), task->codeAddr(), 0, 0);
initABI(task);
++dynamic_task_id;
}
/**
 * Read the current value of an HSA signal through a virtual pointer
 * bound to thread context zero.
 *
 * @param signal_handle handle of the signal to read
 * @return the value found at the signal's value address
 */
uint64_t
GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
{
    const Addr signal_value_addr = getHsaSignalValueAddr(signal_handle);
    auto thread_ctx = system()->threads[0];

    ConstVPtr<Addr> current_value(signal_value_addr, thread_ctx);
    return *current_value;
}
/**
 * Write a new value to an HSA signal and, for interruptible signals,
 * deliver the associated event notification.
 *
 * @param signal_handle handle of the signal to update
 * @param signal_value  value DMA-written to the signal's value address
 * @param function      callback handed to every DmaVirtCallback created
 *                      here (presumably run when the DMA completes)
 */
void
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                                     HsaSignalCallbackFunction function)
{
    // The signal value is aligned 8 bytes from
    // the actual handle in the runtime
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    Addr event_addr = getHsaSignalEventAddr(signal_handle);
    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

    auto value_cb = new DmaVirtCallback<uint64_t>(function, signal_value);
    dmaWriteVirt(value_addr, sizeof(Addr), value_cb, &value_cb->dmaBuffer, 0);

    auto tc = system()->threads[0];
    ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);

    // Notifying an event with its mailbox pointer is
    // not supported in the current implementation. Just use
    // mailbox pointer to distinguish between interruptible
    // and default signal. Interruptible signal will have
    // a valid mailbox pointer.
    if (*mailbox_ptr == 0) {
        // Default signal: nothing more to do.
        return;
    }

    // This is an interruptible signal. Now, read the
    // event ID and directly communicate with the driver
    // about that event notification.
    ConstVPtr<uint32_t> event_val(event_addr, tc);

    DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
            "signal event value %d\n", *event_val);

    // The mailbox/wakeup signal uses the SE mode proxy port to write
    // the event value. This is not available in full system mode so
    // instead we need to issue a DMA write to the address. The value of
    // *event_val clears the event.
    if (!FullSystem) {
        signalWakeupEvent(*event_val);
        return;
    }

    auto mailbox_cb = new DmaVirtCallback<uint64_t>(function, *event_val);
    dmaWriteVirt(mailbox_addr, sizeof(Addr), mailbox_cb,
                 &mailbox_cb->dmaBuffer, 0);
}
/**
 * Attach the GPU compute driver. Only one driver may ever be attached;
 * a second call is a fatal error.
 *
 * @param gpu_driver driver to attach; must not be null
 */
void
GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
{
    fatal_if(_driver != nullptr, "Should not overwrite driver.");

    // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
    // Should get rid of the base class.
    _driver = gpu_driver;
    assert(_driver != nullptr);
}
/**
 * Accessor for the driver attached through attachDriver(); null until a
 * driver has been attached.
 */
GPUComputeDriver *
GPUCommandProcessor::driver()
{
    GPUComputeDriver *attached_driver = _driver;
    return attached_driver;
}
/**
 * submitVendorPkt() accepts vendor-specific packets from the HSAPP.
 * The runtime may use such packets to send commands to the HSA device
 * that are specific to a particular vendor; the vendor is expected to
 * define them in the runtime.
 *
 * TODO: For now we simply tell the HSAPP to finish the packet,
 * however a future patch will update this method to provide
 * the proper handling of any required vendor-specific packets.
 * In the version of ROCm that is currently supported (1.6)
 * the runtime will send packets that direct the CP to
 * invalidate the GPU's caches. We do this automatically on
 * each kernel launch in the CU, so this is safe for now.
 *
 * @param raw_pkt       the vendor-specific packet
 * @param queue_id      ID of the queue the packet belongs to
 * @param host_pkt_addr unused here
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    // Nothing to process yet; just retire the packet.
    hsaPP->finishPkt(raw_pkt, queue_id);
}
/**
* submitAgentDispatchPkt() is for accepting agent dispatch packets.
* These packets will control the dispatch of Wg on the device, and inform
* the host when a specified number of Wg have been executed on the device.
*
* For now it simply finishes the pkt.
*/
void
GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
Addr host_pkt_addr)
{
//Parse the Packet, see what it wants us to do
_hsa_agent_dispatch_packet_t * agent_pkt =
(_hsa_agent_dispatch_packet_t *)raw_pkt;
if (agent_pkt->type == AgentCmd::Nop) {
DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
} else if (agent_pkt->type == AgentCmd::Steal) {
//This is where we steal the HSA Task's completion signal
int kid = agent_pkt->arg[0];
DPRINTF(GPUCommandProc,
"Agent Dispatch Packet Stealing signal handle for kernel %d\n",
kid);
HSAQueueEntry *task = dispatcher.hsaTask(kid);
uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
uint64_t return_address = agent_pkt->return_address;
DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
//*return_address = signal_addr;
Addr *new_signal_addr = new Addr;
*new_signal_addr = (Addr)signal_addr;
dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
DPRINTF(GPUCommandProc,
"Agent Dispatch Packet Stealing signal handle from kid %d :" \
"(%x:%x) writing into %x\n",
kid,signal_addr,new_signal_addr,return_address);
} else
{
panic("The agent dispatch packet provided an unknown argument in" \
"arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
}
hsaPP->finishPkt(raw_pkt, queue_id);
}
/**
 * Hand a task over to the dispatcher. This is used once the CP has
 * extracted all relevant information about the task and initialized
 * the ABI state; the dispatcher then creates WGs and dispatches them
 * to the CUs.
 *
 * @param task the task to forward to the dispatcher
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    auto &task_dispatcher = dispatcher;
    task_dispatcher.dispatch(task);
}
/**
 * Forward a wakeup event notification to the attached driver.
 *
 * @param event_id ID of the event to signal
 */
void
GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
{
    auto *attached_driver = _driver;
    attached_driver->signalWakeupEvent(event_id);
}
/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialize register file state.
 *
 * This function only starts that process: it issues a 32-bit DMA read
 * and arranges for ReadDispIdOffsetDmaEvent() to be called with the
 * task and the value read.
 *
 * @param task the task whose ABI state is being initialized
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    // Continuation that receives the value fetched by the DMA read.
    auto on_offset_read = [this, task](const uint32_t &readDispIdOffset) {
        ReadDispIdOffsetDmaEvent(task, readDispIdOffset);
    };
    auto cb = new DmaVirtCallback<uint32_t>(on_offset_read, 0);

    Addr hostReadIdxPtr =
        hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    // The word of interest sits sizeof(Addr) bytes past the queue's host
    // read index pointer.
    const Addr read_addr = hostReadIdxPtr + sizeof(hostReadIdxPtr);
    dmaReadVirt(read_addr, sizeof(uint32_t), cb, &cb->dmaBuffer);
}
/**
 * Accessor for the System pointer held in the sys member.
 */
System *
GPUCommandProcessor::system()
{
    System *owning_system = sys;
    return owning_system;
}
/**
 * Report the address ranges of this device. The list returned here is
 * always empty.
 */
AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    return AddrRangeList();
}
/**
 * Record the GPU device and pass its VRAM requestor ID on to the page
 * table walker as the walker's device requestor.
 *
 * @param gpu_device the GPU device; dereferenced, so it must not be null
 */
void
GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;

    const auto vram_requestor = gpuDevice->vramRequestorId();
    walker->setDevRequestor(vram_requestor);
}
/**
 * Record the shader associated with this command processor.
 *
 * @param shader the shader pointer to store
 */
void
GPUCommandProcessor::setShader(Shader *shader)
{
    this->_shader = shader;
}
/**
 * Accessor for the shader stored through setShader().
 */
Shader *
GPUCommandProcessor::shader()
{
    Shader *current_shader = _shader;
    return current_shader;
}
} // namespace gem5