gem5/src/gpu-compute/gpu_command_processor.cc

/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_command_processor.hh"

#include <cassert>

#include "arch/amdgpu/vega/pagetable_walker.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "gpu-compute/dispatcher.hh"
#include "mem/abstract_mem.hh"
#include "mem/packet_access.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUCommandProcessor.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/syscall_emul_buf.hh"

namespace gem5
{

GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
      walker(p.walker), hsaPP(p.hsapp)
{
    assert(hsaPP);
    hsaPP->setDevice(this);
    dispatcher.setCommandProcessor(this);
}

HSAPacketProcessor&
GPUCommandProcessor::hsaPacketProc()
{
    return *hsaPP;
}

/**
 * Forward the VRAM requestor ID needed for device memory from GPU device.
 */
RequestorID
GPUCommandProcessor::vramRequestorId()
{
    return gpuDevice->vramRequestorId();
}

TranslationGenPtr
GPUCommandProcessor::translate(Addr vaddr, Addr size)
{
    if (!FullSystem) {
        // Grab the process and try to translate the virtual address with it;
        // with new extensions, it will likely be wrong to just arbitrarily
        // grab context zero.
        auto process = sys->threads[0]->getProcessPtr();

        return process->pTable->translateRange(vaddr, size);
    }

    // In full system use the page tables setup by the kernel driver rather
    // than the CPU page tables.
    return TranslationGenPtr(
        new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker,
                                         1 /* vmid */, vaddr, size));
}

/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
 * After the HSAPP receives and extracts an AQL packet, it sends
 * it to the CP, which is responsible for gathering all relevant
 * information about a task, initializing CU state, and sending
 * it to the dispatcher for WG creation and dispatch.
 *
 * First we need capture all information from the the AQL pkt and
 * the code object, then store it in an HSAQueueEntry. Once the
 * packet and code are extracted, we extract information from the
 * queue descriptor that the CP needs to perform state initialization
 * on the CU. Finally we call dispatch() to send the task to the
 * dispatcher. When the task completely finishes, we call finishPkt()
 * on the HSA packet processor in order to remove the packet from the
 * queue, and notify the runtime that the task has completed.
 */
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
    assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));

    /**
     * we need to read a pointer in the application's address
     * space to pull out the kernel code descriptor.
     */
    auto *tc = sys->threads[0];

    TranslatingPortProxy fs_proxy(tc);
    SETranslatingPortProxy se_proxy(tc);
    PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;

    /**
     * In full system mode, the page table entry may point to a system page
     * or a device page. System pages use the proxy as normal, but a device
     * page needs to be read from device memory. Check what type it is here.
     */
    bool is_system_page = true;
    Addr phys_addr = disp_pkt->kernel_object;
    if (FullSystem) {
        /**
         * Full system currently only supports running on single VMID (one
         * virtual memory space), i.e., one application running on GPU at a
         * time. Because of this, for now we know the VMID is always 1. Later
         * the VMID would have to be passed on to the command processor.
         */
        int vmid = 1;
        unsigned tmp_bytes;
        walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
                                phys_addr, tmp_bytes, BaseMMU::Mode::Read,
                                is_system_page);
    }

    DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
            disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
            is_system_page);

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary, and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field,
     * which provides the byte offset (positive or negative) from the
     * address of the amd_kernel_code_t to the start of the machine
     * instructions.
     */
    AMDKernelCode akc;
    if (is_system_page) {
        DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
            sizeof(AMDKernelCode));
    } else {
        assert(FullSystem);
        DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");

        // Read from GPU memory manager one cache line at a time to prevent
        // rare cases where the AKC spans two memory pages.
        ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
                           system()->cacheLineSize());
        for (; !gen.done(); gen.next()) {
            Addr chunk_addr = gen.addr();
            int vmid = 1;
            unsigned dummy;
            walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
                                    chunk_addr, dummy, BaseMMU::Mode::Read,
                                    is_system_page);

            Request::Flags flags = Request::PHYSICAL;
            RequestPtr request = std::make_shared<Request>(chunk_addr,
                system()->cacheLineSize(), flags, walker->getDevRequestor());
            Packet *readPkt = new Packet(request, MemCmd::ReadReq);
            readPkt->dataStatic((uint8_t *)&akc + gen.complete());
            system()->getDeviceMemory(readPkt)->access(readPkt);
            delete readPkt;
        }
    }

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
        "kernel object\n", akc.kernel_code_entry_byte_offset);

    DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
        (uint64_t)tc->cpuId());


    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc.kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
        machine_code_addr);

    std::string kernel_name;

    /**
     * BLIT kernels don't have symbol names.  BLIT kernels are built-in compute
     * kernels issued by ROCm to handle DMAs for dGPUs when the SDMA
     * hardware engines are unavailable or explicitly disabled.  They can also
     * be used to do copies that ROCm things would be better performed
     * by the shader than the SDMA engines.  They are also sometimes used on
     * APUs to implement asynchronous memcopy operations from 2 pointers in
     * host memory.  I have no idea what BLIT stands for.
     * */
    if (akc.runtime_loader_kernel_symbol) {
        kernel_name = "Some kernel";
    } else {
        kernel_name = "Blit kernel";
    }

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
                          : driver()->getGfxVersion();
    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
        gfxVersion);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
        disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
        disp_pkt->grid_size_x, disp_pkt->grid_size_y,
        disp_pkt->grid_size_z, disp_pkt->kernarg_address,
        disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
        "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
        "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
        task->numScalarRegs(), task->codeAddr(), 0, 0);

    initABI(task);
    ++dynamic_task_id;
}

uint64_t
GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
{
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    auto tc = system()->threads[0];
    ConstVPtr<Addr> prev_value(value_addr, tc);
    return *prev_value;
}

void
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                                     HsaSignalCallbackFunction function)
{
    // The signal value is aligned 8 bytes from
    // the actual handle in the runtime
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    Addr event_addr = getHsaSignalEventAddr(signal_handle);
    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

    auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);

    dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);

    auto tc = system()->threads[0];
    ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);

    // Notifying an event with its mailbox pointer is
    // not supported in the current implementation. Just use
    // mailbox pointer to distinguish between interruptible
    // and default signal. Interruptible signal will have
    // a valid mailbox pointer.
    if (*mailbox_ptr != 0) {
        // This is an interruptible signal. Now, read the
        // event ID and directly communicate with the driver
        // about that event notification.
        ConstVPtr<uint32_t> event_val(event_addr, tc);

        DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
                "signal event value %d\n", *event_val);

        // The mailbox/wakeup signal uses the SE mode proxy port to write
        // the event value. This is not available in full system mode so
        // instead we need to issue a DMA write to the address. The value of
        // *event_val clears the event.
        if (FullSystem) {
            auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
            dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
        } else {
            signalWakeupEvent(*event_val);
        }
    }
}

void
GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
{
    fatal_if(_driver, "Should not overwrite driver.");
    // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
    // Should get rid of the base class.
    _driver = gpu_driver;
    assert(_driver);
}

GPUComputeDriver*
GPUCommandProcessor::driver()
{
    return _driver;
}

/**
 * submitVendorPkt() is for accepting vendor-specific packets from
 * the HSAPP. Vendor-specific packets may be used by the runtime to
 * send commands to the HSA device that are specific to a particular
 * vendor. The vendor-specific packets should be defined by the vendor
 * in the runtime.
 */

/**
 * TODO: For now we simply tell the HSAPP to finish the packet,
 *       however a future patch will update this method to provide
 *       the proper handling of any required vendor-specific packets.
 *       In the version of ROCm that is currently supported (1.6)
 *       the runtime will send packets that direct the CP to
 *       invalidate the GPUs caches. We do this automatically on
 *       each kernel launch in the CU, so this is safe for now.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
    Addr host_pkt_addr)
{
    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * submitAgentDispatchPkt() is for accepting agent dispatch packets.
 * These packets will control the dispatch of Wg on the device, and inform
 * the host when a specified number of Wg have been executed on the device.
 *
 * For now it simply finishes the pkt.
 */
void
GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
    Addr host_pkt_addr)
{
    //Parse the Packet, see what it wants us to do
    _hsa_agent_dispatch_packet_t * agent_pkt =
        (_hsa_agent_dispatch_packet_t *)raw_pkt;

    if (agent_pkt->type == AgentCmd::Nop) {
        DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
    } else if (agent_pkt->type == AgentCmd::Steal) {
        //This is where we steal the HSA Task's completion signal
        int kid = agent_pkt->arg[0];
        DPRINTF(GPUCommandProc,
            "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
            kid);

        HSAQueueEntry *task = dispatcher.hsaTask(kid);
        uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);

        uint64_t return_address = agent_pkt->return_address;
        DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
        //*return_address = signal_addr;
        Addr *new_signal_addr = new Addr;
        *new_signal_addr  = (Addr)signal_addr;
        dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);

        DPRINTF(GPUCommandProc,
            "Agent Dispatch Packet Stealing signal handle from kid %d :" \
            "(%x:%x) writing into %x\n",
            kid,signal_addr,new_signal_addr,return_address);

    } else
    {
        panic("The agent dispatch packet provided an unknown argument in" \
        "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
    }

    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs to the CUs.
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    dispatcher.dispatch(task);
}

void
GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
{
    _driver->signalWakeupEvent(event_id);
}

/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialze register file state.
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    auto cb = new DmaVirtCallback<uint32_t>(
        [ = ] (const uint32_t &readDispIdOffset)
            { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(uint32_t), cb, &cb->dmaBuffer);
}

System*
GPUCommandProcessor::system()
{
    return sys;
}

AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    AddrRangeList ranges;
    return ranges;
}

void
GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;
    walker->setDevRequestor(gpuDevice->vramRequestorId());
}

void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}

Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}

} // namespace gem5