diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index feed8a724b..baf936068b 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -182,6 +182,13 @@ parser.add_option("--num-hw-queues", type="int", default=10,
 parser.add_option("--reg-alloc-policy",type="string", default="simple",
                   help="register allocation policy (simple/dynamic)")
 
+parser.add_option("--dgpu", action="store_true", default=False,
+                  help="Configure the system as a dGPU instead of an APU. "
+                  "The dGPU config has its own local memory pool and is not "
+                  "coherent with the host through hardware.  Data is "
+                  "transfered from host to device memory using runtime calls "
+                  "that copy data over a PCIe-like IO bus.")
+
 Ruby.define_options(parser)
 
 #add TLB options to the parser
@@ -417,7 +424,7 @@ hsapp_gpu_map_size = 0x1000
 hsapp_gpu_map_paddr = int(Addr(options.mem_size))
 
 # HSA kernel mode driver
-gpu_driver = GPUComputeDriver(filename="kfd")
+gpu_driver = GPUComputeDriver(filename = "kfd", isdGPU = options.dgpu)
 
 # Creating the GPU kernel launching components: that is the HSA
 # packet processor (HSAPP), GPU command processor (CP), and the
@@ -470,7 +477,15 @@ else:
                "/usr/lib/x86_64-linux-gnu"
            ]),
            'HOME=%s' % os.getenv('HOME','/'),
-           "HSA_ENABLE_INTERRUPT=1"]
+           # Disabling the VM fault handler signal creation for dGPUs also
+           # forces the use of DefaultSignals instead of driver-controlled
+           # InterruptSignals throughout the runtime.  DefaultSignals poll
+           # on memory in the runtime, while InterruptSignals call into the
+           # driver.
+           "HSA_ENABLE_INTERRUPT=1",
+           # We don't have an SDMA hardware model, so need to fallback to
+           # vector copy kernels for dGPU memcopies to/from host and device.
+           "HSA_ENABLE_SDMA=0"]
 
 process = Process(executable = executable, cmd = [options.cmd]
                   + options.options.split(), drivers = [gpu_driver], env = env)
@@ -643,7 +658,12 @@ system.redirect_paths = redirect_paths
 
 root = Root(system=system, full_system=False)
 
-hsaTopology.createHsaTopology(options)
+# Create the /sys/devices filesystem for the simulator so that the HSA Runtime
+# knows what type of GPU hardware we are simulating
+if options.dgpu:
+    hsaTopology.createFijiTopology(options)
+else:
+    hsaTopology.createCarrizoTopology(options)
 
 m5.ticks.setGlobalFrequency('1THz')
 if options.abs_max_tick:
diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py
index 707a83df3d..a5e0d446fc 100644
--- a/configs/example/hsaTopology.py
+++ b/configs/example/hsaTopology.py
@@ -49,7 +49,177 @@ def remake_dir(path):
         rmtree(path)
     makedirs(path)
 
-def createHsaTopology(options):
+# This fakes out a dGPU setup so the runtime operates correctly.  The spoofed
+# system has a single dGPU and a single socket CPU.  Note that more complex
+# topologies (multi-GPU, multi-socket CPUs) need to have a different setup
+# here or the runtime won't be able to issue Memcpies from one node to another.
+#
+# TODO: There is way too much hardcoded here.  It doesn't affect anything in
+# our current ROCm stack (1.6), but it is highly possible that it will in the
+# future.  We might need to scrub through this and extract the appropriate
+# fields from the simulator in the future.
+def createFijiTopology(options):
+    topology_dir = joinpath(m5.options.outdir, \
+        'fs/sys/devices/virtual/kfd/kfd/topology')
+    remake_dir(topology_dir)
+
+    amdgpu_dir = joinpath(m5.options.outdir, \
+        'fs/sys/module/amdgpu/parameters')
+    remake_dir(amdgpu_dir)
+
+    # Fiji reported VM size in GB.  Used to reserve an allocation from CPU
+    # to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree)
+    file_append((amdgpu_dir, 'vm_size'), 256)
+
+    # Ripped from real Fiji platform to appease KMT version checks
+    file_append((topology_dir, 'generation_id'), 2)
+
+    # Set up system properties.  Register as ast-rocm server
+    sys_prop = 'platform_oem 35498446626881\n' + \
+               'platform_id 71791775140929\n' + \
+               'platform_rev 2\n'
+    file_append((topology_dir, 'system_properties'), sys_prop)
+
+    # Populate the topology tree
+    # Our dGPU system is two nodes.  Node 0 is a CPU and Node 1 is a dGPU
+    node_dir = joinpath(topology_dir, 'nodes/0')
+    remake_dir(node_dir)
+
+    # Register as a CPU
+    file_append((node_dir, 'gpu_id'), 0)
+    file_append((node_dir, 'name'), '')
+
+    # CPU links.  Only thing that matters is we tell the runtime that GPU is
+    # connected through PCIe to CPU socket 0.
+    io_links = 1
+    io_dir = joinpath(node_dir, 'io_links/0')
+    remake_dir(io_dir)
+    io_prop = 'type 2\n'                                    + \
+              'version_major 0\n'                           + \
+              'version_minor 0\n'                           + \
+              'node_from 0\n'                               + \
+              'node_to 1\n'                                 + \
+              'weight 20\n'                                 + \
+              'min_latency 0\n'                             + \
+              'max_latency 0\n'                             + \
+              'min_bandwidth 0\n'                           + \
+              'max_bandwidth 0\n'                           + \
+              'recommended_transfer_size 0\n'               + \
+              'flags 13\n'
+    file_append((io_dir, 'properties'), io_prop)
+
+    # Populate CPU node properties
+    node_prop = 'cpu_cores_count %s\n' % options.num_cpus   + \
+                'simd_count 0\n'                            + \
+                'mem_banks_count 1\n'                       + \
+                'caches_count 0\n'                          + \
+                'io_links_count %s\n' % io_links            + \
+                'cpu_core_id_base 0\n'                      + \
+                'simd_id_base 0\n'                          + \
+                'max_waves_per_simd 0\n'                    + \
+                'lds_size_in_kb 0\n'                        + \
+                'gds_size_in_kb 0\n'                        + \
+                'wave_front_size 64\n'                      + \
+                'array_count 0\n'                           + \
+                'simd_arrays_per_engine 0\n'                + \
+                'cu_per_simd_array 0\n'                     + \
+                'simd_per_cu 0\n'                           + \
+                'max_slots_scratch_cu 0\n'                  + \
+                'vendor_id 0\n'                             + \
+                'device_id 0\n'                             + \
+                'location_id 0\n'                           + \
+                'drm_render_minor 0\n'                      + \
+                'max_engine_clk_ccompute 3400\n'
+
+    file_append((node_dir, 'properties'), node_prop)
+
+    # CPU memory reporting
+    mem_dir = joinpath(node_dir, 'mem_banks/0')
+    remake_dir(mem_dir)
+    mem_prop = 'heap_type 0\n'                              + \
+               'size_in_bytes 33704329216\n'                + \
+               'flags 0\n'                                  + \
+               'width 72\n'                                 + \
+               'mem_clk_max 2400\n'
+
+    file_append((mem_dir, 'properties'), mem_prop)
+
+    # Build the GPU node
+    node_dir = joinpath(topology_dir, 'nodes/1')
+    remake_dir(node_dir)
+
+    # Register as a Fiji
+    file_append((node_dir, 'gpu_id'), 50156)
+    file_append((node_dir, 'name'), 'Fiji\n')
+
+    # Real Fiji shows 96, but building that topology is complex and doesn't
+    # appear to be required for anything.
+    caches = 0
+
+    # GPU links.  Only thing that matters is we tell the runtime that GPU is
+    # connected through PCIe to CPU socket 0.
+    io_links = 1
+    io_dir = joinpath(node_dir, 'io_links/0')
+    remake_dir(io_dir)
+    io_prop = 'type 2\n'                                    + \
+              'version_major 0\n'                           + \
+              'version_minor 0\n'                           + \
+              'node_from 1\n'                               + \
+              'node_to 0\n'                                 + \
+              'weight 20\n'                                 + \
+              'min_latency 0\n'                             + \
+              'max_latency 0\n'                             + \
+              'min_bandwidth 0\n'                           + \
+              'max_bandwidth 0\n'                           + \
+              'recommended_transfer_size 0\n'               + \
+              'flags 1\n'
+    file_append((io_dir, 'properties'), io_prop)
+
+    # Populate GPU node properties
+    node_prop = 'cpu_cores_count %s\n' % options.num_cpus                   + \
+                'simd_count %s\n'                                             \
+                    % (options.num_compute_units * options.simds_per_cu)    + \
+                'mem_banks_count 1\n'                                       + \
+                'caches_count %s\n' % caches                                + \
+                'io_links_count %s\n' % io_links                            + \
+                'cpu_core_id_base 0\n'                                      + \
+                'simd_id_base 2147487744\n'                                 + \
+                'max_waves_per_simd %s\n' % options.wfs_per_simd            + \
+                'lds_size_in_kb %s\n' % int(options.lds_size / 1024)        + \
+                'gds_size_in_kb 0\n'                                        + \
+                'wave_front_size %s\n' % options.wf_size                    + \
+                'array_count 4\n'                           + \
+                'simd_arrays_per_engine %s\n' % options.sa_per_complex      + \
+                'cu_per_simd_array %s\n' % options.cu_per_sa                + \
+                'simd_per_cu %s\n' % options.simds_per_cu                   + \
+                'max_slots_scratch_cu 32\n'                                 + \
+                'vendor_id 4098\n'                                          + \
+                'device_id 29440\n'                                         + \
+                'location_id 512\n'                                         + \
+                'max_engine_clk_fcompute %s\n'                                \
+                    % int(toFrequency(options.gpu_clock) / 1e6)             + \
+                'local_mem_size 4294967296\n'                               + \
+                'fw_version 730\n'                                          + \
+                'capability 4736\n'                                         + \
+                'max_engine_clk_ccompute %s\n'                                \
+                    % int(toFrequency(options.CPUClock) / 1e6)
+
+    file_append((node_dir, 'properties'), node_prop)
+
+    # Fiji HBM reporting
+    # TODO: Extract size, clk, and width from sim parameters
+    mem_dir = joinpath(node_dir, 'mem_banks/0')
+    remake_dir(mem_dir)
+    mem_prop = 'heap_type 1\n'                              + \
+               'size_in_bytes 4294967296\n'                 + \
+               'flags 0\n'                                  + \
+               'width 4096\n'                               + \
+               'mem_clk_max 500\n'
+
+    file_append((mem_dir, 'properties'), mem_prop)
+
+
+def createCarrizoTopology(options):
     topology_dir = joinpath(m5.options.outdir, \
         'fs/sys/devices/virtual/kfd/kfd/topology')
     remake_dir(topology_dir)
diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc
index db31cbc103..f2db43635c 100644
--- a/src/dev/hsa/hsa_driver.cc
+++ b/src/dev/hsa/hsa_driver.cc
@@ -70,25 +70,25 @@ Addr
 HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
                 int tgt_flags, int tgt_fd, off_t offset)
 {
-     // Is this a signal event mmap
-     bool is_event_mmap = false;
-     // If addr == 0, then we may need to do mmap.
-     bool should_mmap = (start == 0);
-     auto process = tc->getProcessPtr();
-     auto mem_state = process->memState;
-     // Check if mmap is for signal events first
-     if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
-         KFD_MMAP_TYPE_EVENTS) {
-         is_event_mmap = true;
-         DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
-                 "offset: 0x%x,  )\n", start, length, offset);
-         panic_if(start != 0,
-                  "Start address should be provided by KFD\n");
-         panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
-                  "Requested length %d, expected length %d; length mismatch\n",
-                   length, 8 * KFD_SIGNAL_EVENT_LIMIT);
-         // For signal event, do mmap only is eventPage is uninitialized
-         should_mmap = (!eventPage);
+    // Is this a signal event mmap
+    bool is_event_mmap = false;
+    // If addr == 0, then we may need to do mmap.
+    bool should_mmap = (start == 0);
+    auto process = tc->getProcessPtr();
+    auto mem_state = process->memState;
+    // Check if mmap is for signal events first
+    if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) ==
+        KFD_MMAP_TYPE_EVENTS) {
+        is_event_mmap = true;
+        DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x,"
+                "offset: 0x%x,  )\n", start, length, offset);
+        panic_if(start != 0,
+                 "Start address should be provided by KFD\n");
+        panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
+                 "Requested length %d, expected length %d; length mismatch\n",
+                  length, 8 * KFD_SIGNAL_EVENT_LIMIT);
+        // For signal event, do mmap only if eventPage is uninitialized
+        should_mmap = (!eventPage);
     } else {
         DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x,"
                 "offset: 0x%x)\n", start, length, offset);
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index d2959ac6b8..e54882330c 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -236,6 +236,7 @@ class Shader(ClockedObject):
 class GPUComputeDriver(HSADriver):
     type = 'GPUComputeDriver'
     cxx_header = 'gpu-compute/gpu_compute_driver.hh'
+    isdGPU = Param.Bool(False, 'Driver is for a dGPU')
 
 class GPUDispatcher(SimObject):
     type = 'GPUDispatcher'
diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc
index 664afa9257..6c4639a2b2 100644
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -40,10 +40,11 @@
 #include "dev/hsa/kfd_event_defines.h"
 #include "dev/hsa/kfd_ioctl.h"
 #include "params/GPUComputeDriver.hh"
+#include "sim/process.hh"
 #include "sim/syscall_emul_buf.hh"
 
 GPUComputeDriver::GPUComputeDriver(const Params &p)
-    : HSADriver(p)
+    : HSADriver(p), isdGPU(p.isdGPU)
 {
     device->attachDriver(this);
     DPRINTF(GPUDriver, "Constructing KFD: device\n");
@@ -86,6 +87,19 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
           break;
         case AMDKFD_IOC_SET_MEMORY_POLICY:
           {
+            /**
+             * This is where the runtime requests MTYPE from an aperture.
+             * Basically, the global memory aperture is divided up into
+             * a default aperture and an alternate aperture each of which have
+             * their own MTYPE policies.  This is done to mark a small piece
+             * of the global memory as uncacheable.  Host memory mappings will
+             * be carved out of this uncacheable aperture, which is how they
+             * implement 'coherent' host/device memory on dGPUs.
+             *
+             * TODO: Need to reflect per-aperture MTYPE policies based on this
+             * call.
+             *
+             */
             warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
           }
           break;
@@ -145,7 +159,10 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
                     gpuVmApeLimit(args->process_apertures[i].gpuvm_base);
 
                 // NOTE: Must match ID populated by hsaTopology.py
-                args->process_apertures[i].gpu_id = 2765;
+                if (isdGPU)
+                    args->process_apertures[i].gpu_id = 50156;
+                else
+                    args->process_apertures[i].gpu_id = 2765;
 
                 DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                         args->process_apertures[i].gpuvm_base);
@@ -351,9 +368,91 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
             warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
           }
           break;
+        /**
+         * In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU
+         * doorbells into GPUVM space. Essentially, ROCm implements SVM by
+         * carving out a region of free VA space that both the host and GPUVM
+         * can agree upon.  The entire GPU VA space is reserved on the host
+         * using a fixed mmap at a low VA range that is also directly
+         * accessible by the GPU's limited number of VA bits.  When we actually
+         * call memory allocation later in the program, this IOCTL is invoked
+         * to create BOs/VMAs in the driver and bind them to physical
+         * memory/doorbells.
+         *
+         * For gem5, we don't need to carve out any GPUVM space here (we don't
+         * support GPUVM and use host page tables on the GPU directly). We can
+         * just use the existing host SVM region. We comment on each memory
+         * type separately.
+         */
         case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
           {
-            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+            TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args> args(ioc_buf);
+            args.copyIn(virt_proxy);
+
+            assert(isdGPU);
+            assert((args->va_addr % TheISA::PageBytes) == 0);
+            Addr mmap_offset = 0;
+
+            if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
+                args->mmap_offset = args->va_addr;
+                // VRAM allocations are device memory mapped into GPUVM
+                // space.
+                //
+                // We can't rely on the lazy host allocator (fixupFault) to
+                // handle this mapping since it needs to be placed in dGPU
+                // framebuffer memory.  The lazy allocator will try to place
+                // this in host memory.
+                //
+                // TODO: We don't have the appropriate bifurcation of the
+                // physical address space with different memory controllers
+                // yet.  This is where we will explicitly add the PT maps to
+                // dGPU memory in the future.
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
+                mmap_offset = args->mmap_offset;
+                // USERPTR allocations are system memory mapped into GPUVM
+                // space.  The user provides the driver with the pointer.
+                //
+                // No action needs to be taken for this memory type.  We will
+                // lazily map it into host memory on first touch.
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
+                args->mmap_offset = args->va_addr;
+                // GTT allocations are system memory mapped into GPUVM space.
+                // It's different than a USERPTR allocation since the driver
+                // itself allocates the physical memory on the host.
+                //
+                // No action needs to be taken for this memory type.  We will
+                // lazily map it into host memory on first touch.  The
+                // fixupFault will find the original SVM aperture mapped to the
+                // host.
+                //
+                // Note that for GTT the thunk layer needs to call mmap on the
+                // driver FD later if it wants the host to have access to this
+                // memory (which it probably does).
+            } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
+                DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
+                // DOORBELL allocations are the queue doorbells that are
+                // memory mapped into GPUVM space.
+                //
+                // Explicitly map this virtual address to our PIO doorbell
+                // interface in the page tables (non-cacheable)
+                tc->getProcessPtr()->pTable->map(args->va_addr,
+                            device->hsaPacketProc().pioAddr,
+                            args->size, false);
+                break;
+            }
+
+            DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
+                    "size %lu, mmap_offset %p, gpu_id %d\n",
+                    args->va_addr, args->size, mmap_offset, args->gpu_id);
+
+            // TODO: Not sure where the handle is used yet.  Set it to an
+            // easily trackable value.
+            args->handle= 0xdeadbeef;
+            args.copyOut(virt_proxy);
           }
           break;
         case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
@@ -361,6 +460,13 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
             warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
           }
           break;
+        /**
+         * Called to map an already allocated region of memory to this GPU's
+         * GPUVM VA space.  We don't need to implement this in the simulator
+         * since we only have a single VM system.  If the region has already
+         * been allocated somewhere like the CPU, then it's already visible
+         * to the device.
+         */
         case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
           {
             warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
@@ -415,7 +521,11 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
                 ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                 ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);
 
-                ape_args->gpu_id = 2765;
+                // NOTE: Must match ID populated by hsaTopology.py
+                if (isdGPU)
+                    ape_args->gpu_id = 50156;
+                else
+                    ape_args->gpu_id = 2765;
 
                 assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                 assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh
index d2c822d588..f8c02b2d01 100644
--- a/src/gpu-compute/gpu_compute_driver.hh
+++ b/src/gpu-compute/gpu_compute_driver.hh
@@ -55,6 +55,7 @@ class GPUComputeDriver final : public HSADriver
     void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
 
   private:
+    bool isdGPU;
     /**
      * The aperture (APE) base/limit pairs are set
      * statically at startup by the real KFD. AMD