gpu-compute: Topology and driver changes for dGPU

New topology ripped from Fiji to support dGPU. A dGPU flag is added to the config which is propagated to the driver. The emulated driver is now able to properly deal with dGPU ioctls and mmaps. For now, dGPU physical memory is allocated from the host, but this is easy to change once we get a GPU memory controller up and running. Change-Id: I594418482b12ec8fb2e4018d8d0371d56f4f51c8 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42214 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
2018-10-11 16:40:51 -04:00
parent dabb0c8f45
commit a5f55e0be1
6 changed files with 329 additions and 27 deletions
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -182,6 +182,13 @@ parser.add_option("--num-hw-queues", type="int", default=10,
 parser.add_option("--reg-alloc-policy",type="string", default="simple",
                  help="register allocation policy (simple/dynamic)")

+parser.add_option("--dgpu", action="store_true", default=False,
+                  help="Configure the system as a dGPU instead of an APU. "
+                  "The dGPU config has its own local memory pool and is not "
+                  "coherent with the host through hardware.  Data is "
+                  "transfered from host to device memory using runtime calls "
+                  "that copy data over a PCIe-like IO bus.")
+
 Ruby.define_options(parser)

 #add TLB options to the parser
@@ -417,7 +424,7 @@ hsapp_gpu_map_size = 0x1000
 hsapp_gpu_map_paddr = int(Addr(options.mem_size))

 # HSA kernel mode driver
-gpu_driver = GPUComputeDriver(filename="kfd")
+gpu_driver = GPUComputeDriver(filename = "kfd", isdGPU = options.dgpu)

 # Creating the GPU kernel launching components: that is the HSA
 # packet processor (HSAPP), GPU command processor (CP), and the
@@ -470,7 +477,15 @@ else:
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME','/'),
-           "HSA_ENABLE_INTERRUPT=1"]
+           # Disabling the VM fault handler signal creation for dGPUs also
+           # forces the use of DefaultSignals instead of driver-controlled
+           # InterruptSignals throughout the runtime.  DefaultSignals poll
+           # on memory in the runtime, while InterruptSignals call into the
+           # driver.
+           "HSA_ENABLE_INTERRUPT=1",
+           # We don't have an SDMA hardware model, so need to fallback to
+           # vector copy kernels for dGPU memcopies to/from host and device.
+           "HSA_ENABLE_SDMA=0"]

 process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)
@@ -643,7 +658,12 @@ system.redirect_paths = redirect_paths

 root = Root(system=system, full_system=False)

-hsaTopology.createHsaTopology(options)
+# Create the /sys/devices filesystem for the simulator so that the HSA Runtime
+# knows what type of GPU hardware we are simulating
+if options.dgpu:
+    hsaTopology.createFijiTopology(options)
+else:
+    hsaTopology.createCarrizoTopology(options)

 m5.ticks.setGlobalFrequency('1THz')
 if options.abs_max_tick:
--- a/configs/example/hsaTopology.py
+++ b/configs/example/hsaTopology.py
@@ -49,7 +49,177 @@ def remake_dir(path):
        rmtree(path)
    makedirs(path)

-def createHsaTopology(options):
+# This fakes out a dGPU setup so the runtime operates correctly.  The spoofed
+# system has a single dGPU and a single socket CPU.  Note that more complex
+# topologies (multi-GPU, multi-socket CPUs) need to have a different setup
+# here or the runtime won't be able to issue Memcpies from one node to another.
+#
+# TODO: There is way too much hardcoded here.  It doesn't affect anything in
+# our current ROCm stack (1.6), but it is highly possible that it will in the
+# future.  We might need to scrub through this and extract the appropriate
+# fields from the simulator in the future.
+def createFijiTopology(options):
+    topology_dir = joinpath(m5.options.outdir, \
+        'fs/sys/devices/virtual/kfd/kfd/topology')
+    remake_dir(topology_dir)
+
+    amdgpu_dir = joinpath(m5.options.outdir, \
+        'fs/sys/module/amdgpu/parameters')
+    remake_dir(amdgpu_dir)
+
+    # Fiji reported VM size in GB.  Used to reserve an allocation from CPU
+    # to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree)
+    file_append((amdgpu_dir, 'vm_size'), 256)
+
+    # Ripped from real Fiji platform to appease KMT version checks
+    file_append((topology_dir, 'generation_id'), 2)
+
+    # Set up system properties.  Register as ast-rocm server
+    sys_prop = 'platform_oem 35498446626881\n' + \
+               'platform_id 71791775140929\n' + \
+               'platform_rev 2\n'
+    file_append((topology_dir, 'system_properties'), sys_prop)
+
+    # Populate the topology tree
+    # Our dGPU system is two nodes.  Node 0 is a CPU and Node 1 is a dGPU
+    node_dir = joinpath(topology_dir, 'nodes/0')
+    remake_dir(node_dir)
+
+    # Register as a CPU
+    file_append((node_dir, 'gpu_id'), 0)
+    file_append((node_dir, 'name'), '')
+
+    # CPU links.  Only thing that matters is we tell the runtime that GPU is
+    # connected through PCIe to CPU socket 0.
+    io_links = 1
+    io_dir = joinpath(node_dir, 'io_links/0')
+    remake_dir(io_dir)
+    io_prop = 'type 2\n'                                    + \
+              'version_major 0\n'                           + \
+              'version_minor 0\n'                           + \
+              'node_from 0\n'                               + \
+              'node_to 1\n'                                 + \
+              'weight 20\n'                                 + \
+              'min_latency 0\n'                             + \
+              'max_latency 0\n'                             + \
+              'min_bandwidth 0\n'                           + \
+              'max_bandwidth 0\n'                           + \
+              'recommended_transfer_size 0\n'               + \
+              'flags 13\n'
+    file_append((io_dir, 'properties'), io_prop)
+
+    # Populate CPU node properties
+    node_prop = 'cpu_cores_count %s\n' % options.num_cpus   + \
+                'simd_count 0\n'                            + \
+                'mem_banks_count 1\n'                       + \
+                'caches_count 0\n'                          + \
+                'io_links_count %s\n' % io_links            + \
+                'cpu_core_id_base 0\n'                      + \
+                'simd_id_base 0\n'                          + \
+                'max_waves_per_simd 0\n'                    + \
+                'lds_size_in_kb 0\n'                        + \
+                'gds_size_in_kb 0\n'                        + \
+                'wave_front_size 64\n'                      + \
+                'array_count 0\n'                           + \
+                'simd_arrays_per_engine 0\n'                + \
+                'cu_per_simd_array 0\n'                     + \
+                'simd_per_cu 0\n'                           + \
+                'max_slots_scratch_cu 0\n'                  + \
+                'vendor_id 0\n'                             + \
+                'device_id 0\n'                             + \
+                'location_id 0\n'                           + \
+                'drm_render_minor 0\n'                      + \
+                'max_engine_clk_ccompute 3400\n'
+
+    file_append((node_dir, 'properties'), node_prop)
+
+    # CPU memory reporting
+    mem_dir = joinpath(node_dir, 'mem_banks/0')
+    remake_dir(mem_dir)
+    mem_prop = 'heap_type 0\n'                              + \
+               'size_in_bytes 33704329216\n'                + \
+               'flags 0\n'                                  + \
+               'width 72\n'                                 + \
+               'mem_clk_max 2400\n'
+
+    file_append((mem_dir, 'properties'), mem_prop)
+
+    # Build the GPU node
+    node_dir = joinpath(topology_dir, 'nodes/1')
+    remake_dir(node_dir)
+
+    # Register as a Fiji
+    file_append((node_dir, 'gpu_id'), 50156)
+    file_append((node_dir, 'name'), 'Fiji\n')
+
+    # Real Fiji shows 96, but building that topology is complex and doesn't
+    # appear to be required for anything.
+    caches = 0
+
+    # GPU links.  Only thing that matters is we tell the runtime that GPU is
+    # connected through PCIe to CPU socket 0.
+    io_links = 1
+    io_dir = joinpath(node_dir, 'io_links/0')
+    remake_dir(io_dir)
+    io_prop = 'type 2\n'                                    + \
+              'version_major 0\n'                           + \
+              'version_minor 0\n'                           + \
+              'node_from 1\n'                               + \
+              'node_to 0\n'                                 + \
+              'weight 20\n'                                 + \
+              'min_latency 0\n'                             + \
+              'max_latency 0\n'                             + \
+              'min_bandwidth 0\n'                           + \
+              'max_bandwidth 0\n'                           + \
+              'recommended_transfer_size 0\n'               + \
+              'flags 1\n'
+    file_append((io_dir, 'properties'), io_prop)
+
+    # Populate GPU node properties
+    node_prop = 'cpu_cores_count %s\n' % options.num_cpus                   + \
+                'simd_count %s\n'                                             \
+                    % (options.num_compute_units * options.simds_per_cu)    + \
+                'mem_banks_count 1\n'                                       + \
+                'caches_count %s\n' % caches                                + \
+                'io_links_count %s\n' % io_links                            + \
+                'cpu_core_id_base 0\n'                                      + \
+                'simd_id_base 2147487744\n'                                 + \
+                'max_waves_per_simd %s\n' % options.wfs_per_simd            + \
+                'lds_size_in_kb %s\n' % int(options.lds_size / 1024)        + \
+                'gds_size_in_kb 0\n'                                        + \
+                'wave_front_size %s\n' % options.wf_size                    + \
+                'array_count 4\n'                           + \
+                'simd_arrays_per_engine %s\n' % options.sa_per_complex      + \
+                'cu_per_simd_array %s\n' % options.cu_per_sa                + \
+                'simd_per_cu %s\n' % options.simds_per_cu                   + \
+                'max_slots_scratch_cu 32\n'                                 + \
+                'vendor_id 4098\n'                                          + \
+                'device_id 29440\n'                                         + \
+                'location_id 512\n'                                         + \
+                'max_engine_clk_fcompute %s\n'                                \
+                    % int(toFrequency(options.gpu_clock) / 1e6)             + \
+                'local_mem_size 4294967296\n'                               + \
+                'fw_version 730\n'                                          + \
+                'capability 4736\n'                                         + \
+                'max_engine_clk_ccompute %s\n'                                \
+                    % int(toFrequency(options.CPUClock) / 1e6)
+
+    file_append((node_dir, 'properties'), node_prop)
+
+    # Fiji HBM reporting
+    # TODO: Extract size, clk, and width from sim parameters
+    mem_dir = joinpath(node_dir, 'mem_banks/0')
+    remake_dir(mem_dir)
+    mem_prop = 'heap_type 1\n'                              + \
+               'size_in_bytes 4294967296\n'                 + \
+               'flags 0\n'                                  + \
+               'width 4096\n'                               + \
+               'mem_clk_max 500\n'
+
+    file_append((mem_dir, 'properties'), mem_prop)
+
+
+def createCarrizoTopology(options):
    topology_dir = joinpath(m5.options.outdir, \
        'fs/sys/devices/virtual/kfd/kfd/topology')
    remake_dir(topology_dir)
--- a/src/dev/hsa/hsa_driver.cc
+++ b/src/dev/hsa/hsa_driver.cc
@@ -70,25 +70,25 @@ Addr
 HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
                int tgt_flags, int tgt_fd, off_t offset)
 {
-     // Is this a signal event mmap
-     bool is_event_mmap = false;
-     // If addr == 0, then we may need to do mmap.
-     bool should_mmap = (start == 0);
-     auto process = tc->getProcessPtr();
-     auto mem_state = process->memState;
-     // Check if mmap is for signal events first
-     if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
-         KFD_MMAP_TYPE_EVENTS) {
-         is_event_mmap = true;
-         DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
-                 "offset: 0x%x,  )\n", start, length, offset);
-         panic_if(start != 0,
-                  "Start address should be provided by KFD\n");
-         panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
-                  "Requested length %d, expected length %d; length mismatch\n",
-                   length, 8 * KFD_SIGNAL_EVENT_LIMIT);
-         // For signal event, do mmap only is eventPage is uninitialized
-         should_mmap = (!eventPage);
+    // Is this a signal event mmap
+    bool is_event_mmap = false;
+    // If addr == 0, then we may need to do mmap.
+    bool should_mmap = (start == 0);
+    auto process = tc->getProcessPtr();
+    auto mem_state = process->memState;
+    // Check if mmap is for signal events first
+    if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
+        KFD_MMAP_TYPE_EVENTS) {
+        is_event_mmap = true;
+        DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
+                "offset: 0x%x,  )\n", start, length, offset);
+        panic_if(start != 0,
+                 "Start address should be provided by KFD\n");
+        panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
+                 "Requested length %d, expected length %d; length mismatch\n",
+                  length, 8 * KFD_SIGNAL_EVENT_LIMIT);
+        // For signal event, do mmap only if eventPage is uninitialized
+        should_mmap = (!eventPage);
    } else {
        DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x,"
                "offset: 0x%x)\n", start, length, offset);
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -236,6 +236,7 @@ class Shader(ClockedObject):
 class GPUComputeDriver(HSADriver):
    type = 'GPUComputeDriver'
    cxx_header = 'gpu-compute/gpu_compute_driver.hh'
+    isdGPU = Param.Bool(False, 'Driver is for a dGPU')

 class GPUDispatcher(SimObject):
    type = 'GPUDispatcher'
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -40,10 +40,11 @@
 #include "dev/hsa/kfd_event_defines.h"
 #include "dev/hsa/kfd_ioctl.h"
 #include "params/GPUComputeDriver.hh"
+#include "sim/process.hh"
 #include "sim/syscall_emul_buf.hh"

 GPUComputeDriver::GPUComputeDriver(const Params &p)
-    : HSADriver(p)
+    : HSADriver(p), isdGPU(p.isdGPU)
 {
    device->attachDriver(this);
    DPRINTF(GPUDriver, "Constructing KFD: device\n");
@@ -86,6 +87,19 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
          break;
        case AMDKFD_IOC_SET_MEMORY_POLICY:
          {
+            /**
+             * This is where the runtime requests MTYPE from an aperture.
+             * Basically, the global memory aperture is divided up into
+             * a default aperture and an alternate aperture, each of which has
+             * its own MTYPE policies.  This is done to mark a small piece
+             * of the global memory as uncacheable.  Host memory mappings will
+             * be carved out of this uncacheable aperture, which is how they
+             * implement 'coherent' host/device memory on dGPUs.
+             *
+             * TODO: Need to reflect per-aperture MTYPE policies based on this
+             * call.
+             *
+             */
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
          }
          break;
@@ -145,7 +159,10 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
                    gpuVmApeLimit(args->process_apertures[i].gpuvm_base);

                // NOTE: Must match ID populated by hsaTopology.py
-                args->process_apertures[i].gpu_id = 2765;
+                if (isdGPU)
+                    args->process_apertures[i].gpu_id = 50156;
+                else
+                    args->process_apertures[i].gpu_id = 2765;

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
@@ -351,9 +368,91 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
          }
          break;
+        /**
+         * In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU
+         * doorbells into GPUVM space. Essentially, ROCm implements SVM by
+         * carving out a region of free VA space that both the host and GPUVM
+         * can agree upon.  The entire GPU VA space is reserved on the host
+         * using a fixed mmap at a low VA range that is also directly
+         * accessible by the GPU's limited number of VA bits.  When we actually
+         * call memory allocation later in the program, this IOCTL is invoked
+         * to create BOs/VMAs in the driver and bind them to physical
+         * memory/doorbells.
+         *
+         * For gem5, we don't need to carve out any GPUVM space here (we don't
+         * support GPUVM and use host page tables on the GPU directly). We
+         * can just use the existing host SVM region. We comment on each memory
+         * type separately.
+         */
        case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
          {
-            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+            TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args> args(ioc_buf);
+            args.copyIn(virt_proxy);
+
+            assert(isdGPU);
+            assert((args->va_addr % TheISA::PageBytes) == 0);
+            Addr mmap_offset = 0;
+
+            if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
+                args->mmap_offset = args->va_addr;
+                // VRAM allocations are device memory mapped into GPUVM
+                // space.
+                //
+                // We can't rely on the lazy host allocator (fixupFault) to
+                // handle this mapping since it needs to be placed in dGPU
+                // framebuffer memory.  The lazy allocator will try to place
+                // this in host memory.
+                //
+                // TODO: We don't have the appropriate bifurcation of the
+                // physical address space with different memory controllers
+                // yet.  This is where we will explicitly add the PT maps to
+                // dGPU memory in the future.
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
+                mmap_offset = args->mmap_offset;
+                // USERPTR allocations are system memory mapped into GPUVM
+                // space.  The user provides the driver with the pointer.
+                //
+                // No action needs to be taken for this memory type.  We will
+                // lazily map it into host memory on first touch.
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
+                args->mmap_offset = args->va_addr;
+                // GTT allocations are system memory mapped into GPUVM space.
+                // It's different than a USERPTR allocation since the driver
+                // itself allocates the physical memory on the host.
+                //
+                // No action needs to be taken for this memory type.  We will
+                // lazily map it into host memory on first touch.  The
+                // fixupFault will find the original SVM aperture mapped to the
+                // host.
+                //
+                // Note that for GTT the thunk layer needs to call mmap on the
+                // driver FD later if it wants the host to have access to this
+                // memory (which it probably does).
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
+                // DOORBELL allocations are the queue doorbells that are
+                // memory mapped into GPUVM space.
+                //
+                // Explicitly map this virtual address to our PIO doorbell
+                // interface in the page tables (non-cacheable)
+                tc->getProcessPtr()->pTable->map(args->va_addr,
+                            device->hsaPacketProc().pioAddr,
+                            args->size, false);
+                break;
+            }
+
+            DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
+                    "size %lu, mmap_offset %p, gpu_id %d\n",
+                    args->va_addr, args->size, mmap_offset, args->gpu_id);
+
+            // TODO: Not sure where the handle is used yet.  Set it to an
+            // easily trackable value.
+            args->handle= 0xdeadbeef;
+            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
@@ -361,6 +460,13 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
            warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
          }
          break;
+        /**
+         * Called to map an already allocated region of memory to this GPU's
+         * GPUVM VA space.  We don't need to implement this in the simulator
+         * since we only have a single VM system.  If the region has already
+         * been allocated somewhere like the CPU, then it's already visible
+         * to the device.
+         */
        case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
@@ -415,7 +521,11 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
                ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);

-                ape_args->gpu_id = 2765;
+                // NOTE: Must match ID populated by hsaTopology.py
+                if (isdGPU)
+                    ape_args->gpu_id = 50156;
+                else
+                    ape_args->gpu_id = 2765;

                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
--- a/src/gpu-compute/gpu_compute_driver.hh
+++ b/src/gpu-compute/gpu_compute_driver.hh
@@ -55,6 +55,7 @@ class GPUComputeDriver final : public HSADriver
    void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);

  private:
+    bool isdGPU;
    /**
     * The aperture (APE) base/limit pairs are set
     * statically at startup by the real KFD. AMD