gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912 Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Anthony Gutierrez
parent
b0eac7857a
commit
b8da9abba7
215
src/gpu-compute/gpu_command_processor.cc
Normal file
215
src/gpu-compute/gpu_command_processor.cc
Normal file
@@ -0,0 +1,215 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Authors: Anthony Gutierrez
|
||||
*/
|
||||
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
|
||||
#include "debug/GPUCommandProc.hh"
|
||||
#include "debug/GPUKernelInfo.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
#include "params/GPUCommandProcessor.hh"
|
||||
|
||||
/**
 * Construct the command processor and wire it to its dispatcher.
 * The dispatcher is handed a back-pointer so the two objects can
 * communicate about task dispatch and completion.
 */
GPUCommandProcessor::GPUCommandProcessor(const Params *params)
    : HSADevice(params), dispatcher(*params->dispatcher)
{
    // Register ourselves with the dispatcher so it can reach the CP.
    dispatcher.setCommandProcessor(this);
}
|
||||
|
||||
/**
|
||||
* submitDispatchPkt() is the entry point into the CP from the HSAPP
|
||||
* and is only meant to be used with AQL kernel dispatch packets.
|
||||
* After the HSAPP receives and extracts an AQL packet, it sends
|
||||
* it to the CP, which is responsible for gathering all relevant
|
||||
* information about a task, initializing CU state, and sending
|
||||
* it to the dispatcher for WG creation and dispatch.
|
||||
*
|
||||
* First we need capture all information from the the AQL pkt and
|
||||
* the code object, then store it in an HSAQueueEntry. Once the
|
||||
* packet and code are extracted, we extract information from the
|
||||
* queue descriptor that the CP needs to perform state initialization
|
||||
* on the CU. Finally we call dispatch() to send the task to the
|
||||
* dispatcher. When the task completely finishes, we call finishPkt()
|
||||
* on the HSA packet processor in order to remove the packet from the
|
||||
* queue, and notify the runtime that the task has completed.
|
||||
*/
|
||||
void
|
||||
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr)
|
||||
{
|
||||
static int dynamic_task_id = 0;
|
||||
_hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
|
||||
|
||||
/**
|
||||
* we need to read a pointer in the application's address
|
||||
* space to pull out the kernel code descriptor.
|
||||
*/
|
||||
auto *tc = sys->threads[0];
|
||||
auto &virt_proxy = tc->getVirtProxy();
|
||||
|
||||
/**
|
||||
* The kernel_object is a pointer to the machine code, whose entry
|
||||
* point is an 'amd_kernel_code_t' type, which is included in the
|
||||
* kernel binary, and describes various aspects of the kernel. The
|
||||
* desired entry is the 'kernel_code_entry_byte_offset' field,
|
||||
* which provides the byte offset (positive or negative) from the
|
||||
* address of the amd_kernel_code_t to the start of the machine
|
||||
* instructions.
|
||||
*/
|
||||
AMDKernelCode akc;
|
||||
virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
|
||||
sizeof(AMDKernelCode));
|
||||
|
||||
DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
|
||||
"kernel object\n", akc.kernel_code_entry_byte_offset);
|
||||
|
||||
Addr machine_code_addr = (Addr)disp_pkt->kernel_object
|
||||
+ akc.kernel_code_entry_byte_offset;
|
||||
|
||||
DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
|
||||
machine_code_addr);
|
||||
|
||||
Addr kern_name_addr(0);
|
||||
virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
|
||||
(uint8_t*)&kern_name_addr, 0x8);
|
||||
|
||||
std::string kernel_name;
|
||||
virt_proxy.readString(kernel_name, kern_name_addr);
|
||||
|
||||
DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
|
||||
|
||||
HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
|
||||
dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
|
||||
|
||||
DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
|
||||
"grid size (%dx%dx%d) kernarg addr: %#x, completion "
|
||||
"signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
|
||||
disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
|
||||
disp_pkt->grid_size_x, disp_pkt->grid_size_y,
|
||||
disp_pkt->grid_size_z, disp_pkt->kernarg_address,
|
||||
disp_pkt->completion_signal);
|
||||
|
||||
DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
|
||||
"num scalar regs: %d, code addr: %#x, kernarg size: %d, "
|
||||
"LDS size: %d)\n", kernel_name, task->numVectorRegs(),
|
||||
task->numScalarRegs(), task->codeAddr(), 0, 0);
|
||||
|
||||
initABI(task);
|
||||
++dynamic_task_id;
|
||||
}
|
||||
|
||||
/**
 * submitVendorPkt() is for accepting vendor-specific packets from
 * the HSAPP. Vendor-specific packets may be used by the runtime to
 * send commands to the HSA device that are specific to a particular
 * vendor. The vendor-specific packets should be defined by the vendor
 * in the runtime.
 *
 * @param raw_pkt raw pointer to the vendor-specific packet
 * @param queue_id ID of the HSA queue the packet was received on
 * @param host_pkt_addr address of the packet in host memory (currently
 *        unused; the packet is completed without being inspected)
 */

/**
 * TODO: For now we simply tell the HSAPP to finish the packet,
 * however a future patch will update this method to provide
 * the proper handling of any required vendor-specific packets.
 * In the version of ROCm that is currently supported (1.6)
 * the runtime will send packets that direct the CP to
 * invalidate the GPUs caches. We do this automatically on
 * each kernel launch in the CU, so this is safe for now.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    // Immediately complete the packet so the runtime is not blocked.
    hsaPP->finishPkt(raw_pkt, queue_id);
}
|
||||
|
||||
/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs to the CUs.
 *
 * @param task fully-initialized queue entry describing the kernel to run
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    // Hand the task off; WG creation/scheduling happens in the dispatcher.
    dispatcher.dispatch(task);
}
|
||||
|
||||
/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialize register file state.
 *
 * @param task the queue entry whose ABI state is being initialized
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    // DMA completion event; when the read below finishes, the event's
    // callback continues ABI setup with the value it captured.
    // NOTE(review): presumably the event manages its own lifetime after
    // the DMA completes -- confirm in ReadDispIdOffsetDmaEvent.
    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);

    // Host-side read index pointer from this task's queue descriptor.
    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    // Read the dispatch-id offset, which lives one pointer-width past
    // the host read index in the queue structure.
    // NOTE(review): the sizeof(hostReadIdxPtr) offset assumes the
    // runtime's queue layout -- verify against the supported ROCm version.
    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
        &readDispIdOffEvent->readDispIdOffset);
}
|
||||
|
||||
/** @return the system object this command processor belongs to. */
System*
GPUCommandProcessor::system()
{
    return sys;
}
|
||||
|
||||
/**
 * The command processor does not claim any memory-mapped address
 * ranges of its own, so an empty list is returned.
 */
AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    return AddrRangeList();
}
|
||||
|
||||
/**
 * Associate this command processor with a shader.
 *
 * @param shader the shader to attach
 */
void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}
|
||||
|
||||
/** @return the shader attached via setShader(), if any. */
Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}
|
||||
|
||||
/**
 * Factory method invoked by the params framework to instantiate
 * a GPUCommandProcessor from its configured parameters.
 */
GPUCommandProcessor*
GPUCommandProcessorParams::create()
{
    return new GPUCommandProcessor(this);
}
|
||||
Reference in New Issue
Block a user