gpu-compute: Implement per-request MTYPEs

GPU MTYPE is currently set using a global config passed to the
PACoalescer.  This patch enables MTYPE to be set by the shader on a
per-request bases.  In real hardware, the MTYPE is extracted from a
GPUVM PTE during address translation.  However, our current simulator
only models x86 page tables which do not have the appropriate bits for
GPU MTYPES.  Rather than hacking non-x86 bits into our x86 page table
models, this patch instead keeps an interval tree of all pages that
request custom MTYPES in the driver itself.  This is currently
only used to map host pages to the GPU as uncacheable, but is easily
extensible to other MTYPES.

Change-Id: I7daab0ffae42084b9131a67c85cd0aa4bbbfc8d6
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42216
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Michael LeBeane
2018-10-31 16:25:12 -04:00
committed by Matthew Poremba
parent dfa712f041
commit ad43083bb3
9 changed files with 274 additions and 23 deletions

View File

@@ -173,6 +173,21 @@ parser.add_argument("--dgpu", action="store_true", default=False,
"transfered from host to device memory using runtime calls "
"that copy data over a PCIe-like IO bus.")
# Mtype option
#-- 1 1 1 C_RW_S (Cached-ReadWrite-Shared)
#-- 1 1 0 C_RW_US (Cached-ReadWrite-Unshared)
#-- 1 0 1 C_RO_S (Cached-ReadOnly-Shared)
#-- 1 0 0 C_RO_US (Cached-ReadOnly-Unshared)
#-- 0 1 x UC_L2 (Uncached_GL2)
#-- 0 0 x UC_All (Uncached_All_Load)
# default value: 5/C_RO_S (only allow caching in GL2 for read. Shared)
parser.add_argument("--m-type", type='int', default=5,
help="Default Mtype for GPU memory accesses. This is the "
"value used for all memory accesses on an APU and is the "
"default mode for dGPU unless explicitly overwritten by "
"the driver on a per-page basis. Valid values are "
"between 0-7")
Ruby.define_options(parser)
# add TLB options to the parser
@@ -407,8 +422,15 @@ hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(args.mem_size))
if args.dgpu:
# Default --m-type for dGPU is write-back gl2 with system coherence
# (coherence at the level of the system directory between other dGPUs and
# CPUs) managed by kernel boundary flush operations targeting the gl2.
args.m_type = 6
# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename = "kfd", isdGPU = args.dgpu)
gpu_driver = GPUComputeDriver(filename = "kfd", isdGPU = args.dgpu,
dGPUPoolID = 1, m_type = args.m_type)
# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the

View File

@@ -237,6 +237,16 @@ class GPUComputeDriver(HSADriver):
type = 'GPUComputeDriver'
cxx_header = 'gpu-compute/gpu_compute_driver.hh'
isdGPU = Param.Bool(False, 'Driver is for a dGPU')
dGPUPoolID = Param.Int(False, 'Pool ID for dGPU.')
# Default Mtype for caches
#-- 1 1 1 C_RW_S (Cached-ReadWrite-Shared)
#-- 1 1 0 C_RW_US (Cached-ReadWrite-Unshared)
#-- 1 0 1 C_RO_S (Cached-ReadOnly-Shared)
#-- 1 0 0 C_RO_US (Cached-ReadOnly-Unshared)
#-- 0 1 x UC_L2 (Uncached_GL2)
#-- 0 0 x UC_All (Uncached_All_Load)
# default value: 5/C_RO_S (only allow caching in GL2 for read. Shared)
m_type = Param.Int("Default MTYPE for cache. Valid values between 0-7");
class GPUDispatcher(SimObject):
type = 'GPUDispatcher'

View File

@@ -48,6 +48,7 @@
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
@@ -1023,6 +1024,14 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
// only do some things if actually accessing data
bool isDataAccess = pkt->isWrite() || pkt->isRead();
// For dGPUs, real hardware will extract MTYPE from the PTE. Our model
// uses x86 pagetables which don't have fields to track GPU MTYPEs.
// Rather than hacking up the pagetable to add these bits in, we just
// keep a structure local to our GPUs that is populated in our
// emulated driver whenever memory is allocated. Consult that structure
// here in case we need a memtype override.
shader->gpuCmdProc.driver()->setMtype(pkt->req);
// Check write before read for atomic operations
// since atomic operations should use BaseTLB::Write
if (pkt->isWrite()) {

View File

@@ -42,7 +42,7 @@
#include "sim/syscall_emul_buf.hh"
GPUCommandProcessor::GPUCommandProcessor(const Params &p)
: HSADevice(p), dispatcher(*p.dispatcher), driver(nullptr)
: HSADevice(p), dispatcher(*p.dispatcher), _driver(nullptr)
{
dispatcher.setCommandProcessor(this);
}
@@ -194,8 +194,17 @@ GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
void
GPUCommandProcessor::attachDriver(HSADriver *hsa_driver)
{
    // The driver may only be attached once; a second attach indicates a
    // configuration error.
    fatal_if(_driver, "Should not overwrite driver.");
    // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
    // Should get rid of the base class.
    _driver = dynamic_cast<GPUComputeDriver *>(hsa_driver);
    // The command processor only works with the GPU compute driver; any
    // other HSADriver subclass is a wiring mistake.
    assert(_driver);
}
// Accessor for the emulated KFD driver attached via attachDriver().
// Returns nullptr if no driver has been attached yet.
GPUComputeDriver*
GPUCommandProcessor::driver()
{
return _driver;
}
/**
@@ -285,7 +294,7 @@ GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
/**
 * Forward a signal wakeup notification to the emulated driver so it can
 * wake any thread contexts sleeping on the given event.
 */
void
GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
{
    _driver->signalWakeupEvent(event_id);
}
/**

View File

@@ -66,6 +66,7 @@ class GPUCommandProcessor : public HSADevice
void setShader(Shader *shader);
Shader* shader();
GPUComputeDriver* driver();
enum AgentCmd
{
@@ -112,7 +113,7 @@ class GPUCommandProcessor : public HSADevice
private:
Shader *_shader;
GPUDispatcher &dispatcher;
HSADriver *driver;
GPUComputeDriver *_driver;
void initABI(HSAQueueEntry *task);

View File

@@ -35,10 +35,13 @@
#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "debug/GPUShader.hh"
#include "dev/hsa/hsa_device.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_event_defines.h"
#include "dev/hsa/kfd_ioctl.h"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/shader.hh"
#include "params/GPUComputeDriver.hh"
#include "sim/process.hh"
#include "sim/syscall_emul_buf.hh"
@@ -48,12 +51,25 @@ GPUComputeDriver::GPUComputeDriver(const Params &p)
{
device->attachDriver(this);
DPRINTF(GPUDriver, "Constructing KFD: device\n");
// Convert the 3 bit mtype specified in Shader.py to the proper type
// used for requests.
if (MtypeFlags::SHARED & p.m_type)
defaultMtype.set(Request::SHARED);
if (MtypeFlags::READ_WRITE & p.m_type)
defaultMtype.set(Request::READ_WRITE);
if (MtypeFlags::CACHED & p.m_type)
defaultMtype.set(Request::CACHED);
}
int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
auto &virt_proxy = tc->getVirtProxy();
auto process = tc->getProcessPtr();
auto mem_state = process->memState;
switch (req) {
case AMDKFD_IOC_GET_VERSION:
@@ -394,6 +410,12 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
assert((args->va_addr % TheISA::PageBytes) == 0);
Addr mmap_offset = 0;
Request::CacheCoherenceFlags mtype = defaultMtype;
Addr pa_addr = 0;
int npages = divCeil(args->size, (int64_t)TheISA::PageBytes);
bool cacheable = true;
if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
args->mmap_offset = args->va_addr;
@@ -409,14 +431,39 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
// physical address space with different memory controllers
// yet. This is where we will explicitly add the PT maps to
// dGPU memory in the future.
//
// Bind the VA space to the dGPU physical memory pool. Mark
// this region as Uncacheable. The Uncacheable flag is only
// really used by the CPU and is ignored by the GPU. We mark
// this as uncacheable from the CPU so that we can implement
// direct CPU framebuffer access similar to what we currently
// offer in real HW through the so-called Large BAR feature.
pa_addr = process->system->allocPhysPages(npages, dGPUPoolID);
//
// TODO: Uncacheable accesses need to be supported by the
// CPU-side protocol for this to work correctly. I believe
// it only works right now if the physical memory is MMIO
cacheable = false;
DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
"%d\n", args->va_addr, pa_addr, args->size);
} else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
mmap_offset = args->mmap_offset;
// USERPTR allocations are system memory mapped into GPUVM
// space. The user provides the driver with the pointer.
//
// No action needs to be taken for this memory type. We will
// lazily map it into host memory on first touch.
pa_addr = process->system->allocPhysPages(npages);
DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
"%d\n", args->va_addr, pa_addr, args->size);
// If the HSA runtime requests system coherent memory, then we
// need to explicitly mark this region as uncacheable from the
// perspective of the GPU.
if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
mtype.clear();
} else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
args->mmap_offset = args->va_addr;
@@ -424,14 +471,23 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
// It's different than a USERPTR allocation since the driver
// itself allocates the physical memory on the host.
//
// No action needs to be taken for this memory type. We will
// lazily map it into host memory on first touch. The
// We will lazily map it into host memory on first touch. The
// fixupFault will find the original SVM aperture mapped to the
// host.
//
pa_addr = process->system->allocPhysPages(npages);
DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
"%d\n", args->va_addr, pa_addr, args->size);
// If the HSA runtime requests system coherent memory, then we
// need to explicitly mark this region as uncacheable from the
// perspective of the GPU.
if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
mtype.clear();
// Note that for GTT the thunk layer needs to call mmap on the
// driver FD later if it wants the host to have access to this
// memory (which it probably does).
// memory (which it probably does). This will be ignored.
} else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
// DOORBELL allocations are the queue doorbells that are
@@ -439,25 +495,57 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
//
// Explicitly map this virtual address to our PIO doorbell
// interface in the page tables (non-cacheable)
tc->getProcessPtr()->pTable->map(args->va_addr,
device->hsaPacketProc().pioAddr,
args->size, false);
break;
pa_addr = device->hsaPacketProc().pioAddr;
cacheable = false;
}
DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
"size %lu, mmap_offset %p, gpu_id %d\n",
args->va_addr, args->size, mmap_offset, args->gpu_id);
// TODO: Not sure where the handle is used yet. Set it to an
// easily trackable value.
args->handle= 0xdeadbeef;
// Bind selected physical memory to provided virtual address range
// in X86 page tables.
process->pTable->map(args->va_addr, pa_addr, args->size,
cacheable);
// We keep track of allocated regions of GPU mapped memory,
// just like the driver would. This allows us to provide the
// user with a unique handle for a given allocation. The user
// will only provide us with a handle after allocation and expect
// us to be able to use said handle to extract all the properties
// of the region.
//
// This is a simplified version of regular system VMAs, but for
// GPUVM space (none of the clobber/remap nonsense we find in real
// OS managed memory).
allocateGpuVma(mtype, args->va_addr, args->size);
// Used by the runtime to uniquely identify this allocation.
// We can just use the starting address of the VMA region.
args->handle= args->va_addr;
args.copyOut(virt_proxy);
}
break;
case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
{
warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
TypedBufferArg<kfd_ioctl_free_memory_of_gpu_args> args(ioc_buf);
args.copyIn(virt_proxy);
assert(isdGPU);
DPRINTF(GPUDriver, "amdkfd free arguments: handle %p ",
args->handle);
// We don't recycle physical pages in SE mode
Addr size = deallocateGpuVma(args->handle);
process->pTable->unmap(args->handle, size);
// TODO: IOMMU and GPUTLBs do not seem to correctly support
// shootdown. This is also a potential issue for APU systems
// that perform unmap or remap with system memory.
tc->getMMUPtr()->flushAll();
args.copyOut(virt_proxy);
}
break;
/**
@@ -635,3 +723,45 @@ GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}
/**
 * Record a GPUVM virtual memory area and the MTYPE its requests should
 * carry.  Later lookups in setMtype() consult this interval map, standing
 * in for the GPUVM PTE bits that real hardware would provide.
 */
void
GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
                                 Addr start, Addr length)
{
    const AddrRange vma_range(start, start + length - 1);
    DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
            vma_range.start(), vma_range.end(), mtype);
    // fatal_if always evaluates its condition, so the insert happens here.
    auto inserted = gpuVmas.insert(vma_range, mtype);
    fatal_if(inserted == gpuVmas.end(),
             "Attempted to double register Mtypes for [%p - %p]\n",
             vma_range.start(), vma_range.end());
}
/**
 * Remove the GPUVM VMA that starts at the given handle address and return
 * its size so the caller can unmap the matching page-table range.  The
 * handle must be the exact start address handed out at allocation time.
 */
Addr
GPUComputeDriver::deallocateGpuVma(Addr start)
{
    auto it = gpuVmas.contains(start);
    assert(it != gpuVmas.end());
    // Handles are defined as the VMA start address; anything else is a
    // malformed free request.
    assert(it->first.start() == start);

    const Addr region_size = it->first.size();
    DPRINTF(GPUDriver, "Unregistering [%p - %p]\n", it->first.start(),
            it->first.end());
    gpuVmas.erase(it);
    return region_size;
}
/**
 * Stamp the proper MTYPE cache-coherence flags onto a request before it is
 * issued to the memory system.  dGPUs look the MTYPE up in the
 * driver-managed VMA map (our x86 page tables have no GPUVM MTYPE bits to
 * extract); APUs always use the configured default.
 */
void
GPUComputeDriver::setMtype(RequestPtr req)
{
    // If we are a dGPU then set the MTYPE from our VMAs.
    if (isdGPU) {
        AddrRange range = RangeSize(req->getVaddr(), req->getSize());
        auto vma = gpuVmas.contains(range);
        // Every GPU access must fall within a region registered through
        // allocateGpuVma(); a miss means the driver never mapped it.
        assert(vma != gpuVmas.end());
        // Fixed format string: the original had four conversion specifiers
        // ("... MTYPE %d\n" "%d\n") but only three arguments.
        DPRINTF(GPUShader, "Setting req from [%p - %p] MTYPE %d\n",
                range.start(), range.end(), vma->second);
        req->setCacheCoherenceFlags(vma->second);
    // APUs always get the default MTYPE
    } else {
        req->setCacheCoherenceFlags(defaultMtype);
    }
}

View File

@@ -42,7 +42,9 @@
#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#include "base/addr_range_map.hh"
#include "dev/hsa/hsa_driver.hh"
#include "mem/request.hh"
struct GPUComputeDriverParams;
@@ -53,9 +55,44 @@ class GPUComputeDriver final : public HSADriver
GPUComputeDriver(const Params &p);
int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;
void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
/**
* Called by the compute units right before a request is issued to ruby.
* This uses our VMAs to correctly set the MTYPE on a per-request basis.
* In real hardware, this is actually done through PTE bits in GPUVM.
* Since we are running a single VM (x86 PT) system, the MTYPE bits aren't
* available. Adding GPUVM specific bits to x86 page tables probably
* isn't the best way to proceed. For now we just have the driver set
* these until we implement a proper dual PT system.
*/
void setMtype(RequestPtr req);
private:
bool isdGPU;
int dGPUPoolID;
/**
* VMA structures for GPUVM memory.
*/
AddrRangeMap<Request::CacheCoherenceFlags, 1> gpuVmas;
/**
* Mtype bits {Cached, Read Write, Shared} for caches
*/
enum MtypeFlags
{
SHARED = 0,
READ_WRITE = 1,
CACHED = 2
};
Request::CacheCoherenceFlags defaultMtype;
/**
* Register a region of host memory as uncacheable from the perspective
* of the dGPU.
*/
void registerUncacheableMemory(Addr start, Addr length);
/**
* The aperture (APE) base/limit pairs are set
* statically at startup by the real KFD. AMD
@@ -77,6 +114,16 @@ class GPUComputeDriver final : public HSADriver
Addr scratchApeLimit(Addr apeBase) const;
Addr ldsApeBase(int gpuNum) const;
Addr ldsApeLimit(Addr apeBase) const;
/**
* Allocate/deallocate GPUVM VMAs for tracking virtual address allocations
* and properties on DGPUs. For now, we use these to track MTYPE and to
* be able to select which pages to unmap when the user provides us with
* a handle during the free ioctl.
*/
void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
Addr length);
Addr deallocateGpuVma(Addr start);
};
#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

View File

@@ -296,11 +296,23 @@ class Request
enum : CacheCoherenceFlagsType
{
/** mem_sync_op flags */
INV_L1 = 0x00000001,
I_CACHE_INV = 0x00000001,
INV_L1 = I_CACHE_INV,
V_CACHE_INV = 0x00000002,
K_CACHE_INV = 0x00000004,
GL1_CACHE_INV = 0x00000008,
K_CACHE_WB = 0x00000010,
FLUSH_L2 = 0x00000020,
GL2_CACHE_INV = 0x00000040,
/** user-policy flags */
SLC_BIT = 0x00000080,
GLC_BIT = 0x00000100,
DLC_BIT = 0x00000100,
GLC_BIT = 0x00000200,
/** mtype flags */
CACHED = 0x00000400,
READ_WRITE = 0x00000800,
SHARED = 0x00001000,
};
using LocalAccessor =

View File

@@ -234,6 +234,17 @@ class MemState : public Serializable
std::string printVmaList();
private:
/**
* @param
*/
void replicatePage(const MemState &in, Addr vaddr, Addr new_paddr,
bool alloc_page);
/**
* @param
*/
System * system() const;
/**
* Owner process of MemState. Used to manipulate page tables.
*/