dev-hsa: Add HSA device and HSA packet processor

This change adds support for HSA devices, which are
DMA devices that have an HSA packet processor (HSAPP).

An HSA packet processor model is also included. The
HSAPP is a DMA device that maintains AQL packet queues,
handles the extraction of AQL packets and the scheduling
of AQL queues, and initiates kernel launches for HSA
devices.

Because these devices directly interact with low-level
software and aid in the implementation of the HSA ABI,
we also include some headers from the ROCm runtime:
the hsa.h and kfd_ioctl.h headers. These aid in
supporting ROCm for the HSA devices and drivers.

Change-Id: I24305e0337edc6fa555d436697b4e607a1e097d5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/28128
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-04-27 19:40:11 -04:00
committed by Anthony Gutierrez
parent 94f15bd3f7
commit 15adefd7bc
16 changed files with 8419 additions and 0 deletions

View File

@@ -53,6 +53,8 @@ cpu-o3:
cpu-simple:
dev:
dev-hsa:
Tony Gutierrez <anthony.gutierrez@amd.com>
dev-virtio:
Andreas Sandberg <andreas.sandberg@arm.com>

59
src/dev/hsa/HSADevice.py Normal file
View File

@@ -0,0 +1,59 @@
# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Eric Van Tassell
# Anthony Gutierrez
from m5.SimObject import SimObject
from m5.params import *
from m5.proxy import *
from Device import DmaDevice
class HSADevice(DmaDevice):
    """Abstract base class for HSA devices.

    An HSA device is a DMA device with an attached HSA packet
    processor (HSAPP) that feeds it AQL packets; concrete devices
    derive from this class.
    """
    type = 'HSADevice'
    abstract = True
    cxx_header = "dev/hsa/hsa_device.hh"

    # The packet processor attached to this device.
    hsapp = Param.HSAPacketProcessor("PP attached to this device")
class HSAPacketProcessor(DmaDevice):
    """Model of an HSA packet processor (HSAPP).

    The HSAPP maintains AQL packet queues for its device, handles
    extraction of AQL packets and scheduling of AQL queues, and
    initiates kernel launches for the attached HSA device.
    """
    type = 'HSAPacketProcessor'
    cxx_header = 'dev/hsa/hsa_packet_processor.hh'

    # Physical address of the doorbell PIO region.
    pioAddr = Param.Addr("doorbell physical address")
    # Number of hardware queues software queues are scheduled onto.
    numHWQueues = Param.Int("Number of HW queues")

    # See:
    # Sooraj Puthoor et al., Oversubscribed Command Queues in GPUs. In the
    # proceedings of the 11th Workshop on General Purpose GPUs (GPGPU). 2018.
    wakeupDelay = Param.Tick(100000000, "Scheduling quantum")

    # This value was obtained empirically on Kaveri hardware via the KPS
    # benchmark from ATMI.
    # See: https://github.com/RadeonOpenCompute/atmi/tree/master/examples/
    # runtime/kps
    pktProcessDelay = Param.Tick(4400000, "Packet processing delay")

44
src/dev/hsa/HSADriver.py Normal file
View File

@@ -0,0 +1,44 @@
# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Anthony Gutierrez
# Eric Van Tassell
from m5.SimObject import SimObject
from m5.params import *
from m5.proxy import *
from Process import EmulatedDriver
class HSADriver(EmulatedDriver):
    """Abstract emulated driver controlling an HSA device.

    Concrete drivers derive from this class and implement whatever
    device-specific interface (typically ioctl handling) is needed.
    """
    type = 'HSADriver'
    abstract = True
    cxx_header = 'dev/hsa/hsa_driver.hh'

    # The HSA device (agent) this driver controls.
    device = Param.HSADevice('HSA device controlled by this driver')

50
src/dev/hsa/SConscript Normal file
View File

@@ -0,0 +1,50 @@
# -*- mode:python -*-
# Copyright (c) 2017 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Anthony Gutierrez
Import('*')

# The HSA device models are only built as part of a GPU build; skip this
# directory entirely otherwise.
if not env['BUILD_GPU']:
    Return()

# SimObject declarations for the HSA device and driver base classes.
SimObject('HSADevice.py')
SimObject('HSADriver.py')

Source('hsa_device.cc')
Source('hsa_driver.cc')
Source('hsa_packet_processor.cc')
Source('hw_scheduler.cc')

DebugFlag('HSADriver')
DebugFlag('HSAPacketProcessor')

5636
src/dev/hsa/hsa.h Normal file

File diff suppressed because it is too large Load Diff

110
src/dev/hsa/hsa_device.cc Normal file
View File

@@ -0,0 +1,110 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
* Michael LeBeane
* Eric van Tassell
* Anthony Gutierrez
*/
#include "dev/hsa/hsa_device.hh"
#include "base/chunk_generator.hh"
#include "sim/process.hh"
/**
 * Return a reference to the packet processor attached to this device.
 * The hsaPP pointer is asserted non-null in the constructor, so
 * dereferencing it here is safe.
 */
HSAPacketProcessor&
HSADevice::hsaPacketProc()
{
    return *hsaPP;
}
/**
 * DMA-read `size` bytes from host virtual address `host_addr` into
 * `data`, delivering chunk events to `cb`. Thin wrapper around
 * dmaVirt(), which performs the per-page translation.
 */
void
HSADevice::dmaReadVirt(Addr host_addr, unsigned size,
                       DmaCallback *cb, void *data, Tick delay)
{
    dmaVirt(&DmaDevice::dmaRead, host_addr, size, cb, data, delay);
}
/**
 * DMA-write `size` bytes from `data` to host virtual address
 * `host_addr`, delivering chunk events to `cb`. Thin wrapper around
 * dmaVirt(), which performs the per-page translation.
 */
void
HSADevice::dmaWriteVirt(Addr host_addr, unsigned size,
                        DmaCallback *cb, void *data, Tick delay)
{
    dmaVirt(&DmaDevice::dmaWrite, host_addr, size, cb, data, delay);
}
/**
 * Perform a DMA over a host virtual address range by splitting it into
 * page-sized chunks, translating each chunk to its physical frame, and
 * issuing one DMA per chunk via the supplied member-function pointer.
 */
void
HSADevice::dmaVirt(DmaFnPtr dmaFn, Addr addr, unsigned size,
                   DmaCallback *cb, void *data, Tick delay)
{
    // Nothing to transfer: still fire the callback's chunk event so the
    // caller observes completion after `delay` ticks.
    if (size == 0) {
        if (cb)
            schedule(cb->getChunkEvent(), curTick() + delay);
        return;
    }

    // Advance the buffer pointer in lockstep with the chunks.
    uint8_t *buf = static_cast<uint8_t*>(data);
    for (ChunkGenerator chunk(addr, size, PAGE_SIZE); !chunk.done();
         chunk.next()) {
        // Translate this page's virtual address to its physical frame.
        Addr paddr;
        translateOrDie(chunk.addr(), paddr);
        Event *chunk_event = cb ? cb->getChunkEvent() : nullptr;
        (this->*dmaFn)(paddr, chunk.size(), chunk_event, buf, delay);
        buf += chunk.size();
    }
}
/**
 * HSADevices will perform DMA operations on VAs, and because
 * page faults are not currently supported for HSADevices, we
 * must be able to find the pages mapped for the process.
 */
void
HSADevice::translateOrDie(Addr vaddr, Addr &paddr)
{
    /**
     * Grab the process and try to translate the virtual address with it;
     * with new extensions, it will likely be wrong to just arbitrarily
     * grab context zero.
     */
    auto process = sys->getThreadContext(0)->getProcessPtr();
    auto mem_state = process->getMemState();

    // An unmapped page is fatal: there is no page-fault handling path.
    if (!mem_state->translate(vaddr, paddr)) {
        fatal("failed translation: vaddr 0x%x\n", vaddr);
    }
}

98
src/dev/hsa/hsa_device.hh Normal file
View File

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Eric van Tassell
* Anthony Gutierrez
* Sooraj Puthoor
* Michael LeBeane
*/
#ifndef __DEV_HSA_HSA_DEVICE_HH__
#define __DEV_HSA_HSA_DEVICE_HH__
#include "dev/dma_device.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "params/HSADevice.hh"
/**
 * An HSADevice is a DMA device with an attached HSA packet processor
 * (HSAPP). The HSAPP delivers work to the device through the
 * submit*Pkt() hooks below.
 */
class HSADevice : public DmaDevice
{
  public:
    typedef HSADeviceParams Params;

    HSADevice(const Params *p) : DmaDevice(p), hsaPP(p->hsapp)
    {
        // Every HSA device must be configured with a packet processor;
        // link the two so the HSAPP can dispatch back to this device.
        assert(hsaPP);
        hsaPP->setDevice(this);
    };

    // Accessor for the attached packet processor.
    HSAPacketProcessor& hsaPacketProc();

    /**
     * submitDispatchPkt() accepts AQL dispatch packets from the HSA packet
     * processor. Not all devices will accept AQL dispatch packets, so the
     * default implementation will fatal.
     */
    virtual void
    submitDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
    {
        fatal("%s does not accept dispatch packets\n", name());
    }

    /**
     * submitVendorPkt() accepts vendor specific packets from the HSA
     * packet processor. This method should be overridden in any HSADevice
     * that accepts vendor specific packets, and should interpret the
     * packet according to the vendor's specifications. Not all HSA
     * devices will accept vendor specific packets, so the default
     * implementation will fatal.
     */
    virtual void
    submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
    {
        fatal("%s does not accept vendor specific packets\n", name());
    }

    // DMA to/from host virtual addresses; dmaVirt() below performs the
    // page-by-page virtual-to-physical translation these rely on.
    void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb,
                     void *data, Tick delay = 0);

    void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *cb,
                      void *data, Tick delay = 0);

  protected:
    // Typedefing dmaRead and dmaWrite function pointer
    typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);

    // Packet processor attached to this device (set from the params).
    HSAPacketProcessor *hsaPP;

    void dmaVirt(DmaFnPtr, Addr host_addr, unsigned size, DmaCallback *cb,
                 void *data, Tick delay = 0);

    void translateOrDie(Addr vaddr, Addr &paddr);
};
#endif // __DEV_HSA_HSA_DEVICE_HH__

118
src/dev/hsa/hsa_driver.cc Normal file
View File

@@ -0,0 +1,118 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
* Eric van Tassell
*/
#include "dev/hsa/hsa_driver.hh"
#include "cpu/thread_context.hh"
#include "debug/HSADriver.hh"
#include "dev/hsa/hsa_device.hh"
#include "dev/hsa/kfd_ioctl.h"
#include "params/HSADriver.hh"
#include "sim/process.hh"
#include "sim/syscall_emul_buf.hh"
/**
 * Construct the driver with the HSA device it controls; queue IDs are
 * handed out starting from zero.
 */
HSADriver::HSADriver(HSADriverParams *p)
    : EmulatedDriver(p), device(p->device), queueId(0)
{
}
/**
 * Create an FD entry for the KFD inside of the owning process.
 */
int
HSADriver::open(ThreadContext *tc, int mode, int flags)
{
    DPRINTF(HSADriver, "Opened %s\n", filename);
    auto process = tc->getProcessPtr();
    // Wrap this driver in a device FD entry and install it in the
    // process's FD table; the returned target FD is what the simulated
    // application sees.
    auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
    int tgt_fd = process->fds->allocFD(device_fd_entry);
    return tgt_fd;
}
/**
 * Currently, mmap() will simply setup a mapping for the associated
 * device's packet processor's doorbells.
 */
Addr
HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
                int tgt_flags, int tgt_fd, int offset)
{
    // Fixed format string: added the missing space before "offset" so the
    // two concatenated literals do not run together in the trace output.
    DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x, "
            "offset: 0x%x)\n", start, length, offset);

    auto process = tc->getProcessPtr();
    auto mem_state = process->getMemState();

    // Extend global mmap region if necessary.
    if (start == 0) {
        // Assume mmap grows down, as in x86 Linux.
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    }

    /**
     * Now map this virtual address to our PIO doorbell interface
     * in the page tables (non-cacheable).
     */
    mem_state->map(start, device->hsaPacketProc().pioAddr, length, false);

    // Fixed conversion specifier: "%xp" printed the value in hex followed
    // by a literal 'p'; "%p" is the intended pointer/address format.
    DPRINTF(HSADriver, "amdkfd doorbell mapped to %p\n", start);

    return start;
}
/**
 * Forward relevant parameters to packet processor; queueID
 * is used to link doorbell. The queueIDs are not re-used
 * in current implementation, and we allocate only one page
 * (4096 bytes) for doorbells, so check if this queue ID can
 * be mapped into that page.
 */
void
HSADriver::allocateQueue(const SETranslatingPortProxy &mem_proxy, Addr ioc_buf)
{
    // Copy the caller's kfd_ioctl_create_queue_args out of guest memory.
    TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
    args.copyIn(mem_proxy);

    // VOID_PTR_ADD32 advances a zero base by queueId 32-bit slots; the
    // check rejects IDs whose doorbell would fall outside the single
    // 4096-byte (0x1000) doorbell page.
    if (VOID_PTR_ADD32(0, queueId) >= (void*)0x1000) {
        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
    }

    args->queue_id = queueId++;
    auto &hsa_pp = device->hsaPacketProc();
    // Register the queue's read pointer, ring base, ID, and size with the
    // packet processor so the doorbell can be linked to this queue.
    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                              args->ring_base_address, args->queue_id,
                              args->ring_size);
    args.copyOut(mem_proxy);
}

81
src/dev/hsa/hsa_driver.hh Normal file
View File

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
* Eric van Tassell
*/
/**
* @file
* An HSADriver is an emulated driver that controls an HSA agent,
* or more simply put, an HSA device. An HSA device is a device
* that has an associated HSA packet processor.
*
* In the base HSADriver class the open() method is implemented, as
* well as the mmap() call, which maps the HSA packet processor's
* doorbells. Drivers for other HSA devices should derive from this
* class and implement the necessary methods; typically this is an
* ioctl() method that satisfies the ioctl requests needed to manage
* and control the device.
*/
#ifndef __DEV_HSA_HSA_DRIVER_HH__
#define __DEV_HSA_HSA_DRIVER_HH__
#include "base/types.hh"
#include "sim/emul_driver.hh"
struct HSADriverParams;
class HSADevice;
class SETranslatingPortProxy;
class ThreadContext;
class HSADriver : public EmulatedDriver
{
  public:
    HSADriver(HSADriverParams *p);

    // Allocate a target FD entry for the KFD in the opening process.
    int open(ThreadContext *tc, int mode, int flags);

    // Map the packet processor's doorbell PIO range into the process's
    // address space.
    Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
              int prot, int tgtFlags, int tgtFd, int offset);

  protected:
    /**
     * HSA agent (device) that is controlled by this driver.
     */
    HSADevice *device;

    // Next queue ID to hand out; IDs are never re-used.
    uint32_t queueId;

    void allocateQueue(const SETranslatingPortProxy &mem_proxy,
                       Addr ioc_buf_addr);
};
#endif // __DEV_HSA_HSA_DRIVER_HH__

99
src/dev/hsa/hsa_packet.hh Normal file
View File

@@ -0,0 +1,99 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Eric van Tassell
*/
#ifndef __DEV_HSA_HSA_PACKET__
#define __DEV_HSA_HSA_PACKET__

// Local packet-type constant for vendor-specific AQL packets.
#define _HSA_PACKET_TYPE_VENDOR_SPECIFIC 0

#include <stdint.h>

/**
 * Bitfield view of the 16-bit AQL packet header.
 */
typedef struct hsa_packet_header_s {
    // TODO: replace with more portable impl based on offset, length
    uint16_t type:8;
    uint16_t barrier:1;
    uint16_t acquire_fence_scope:2;
    uint16_t release_fence_scope:2;
    uint16_t reserved:3;
} hsa_packet_header_bitfield_t;

// TODO: put an _ in front of these guys to avoid a clash with hsa.h for now

/**
 * AQL kernel dispatch packet: describes a kernel launch (workgroup and
 * grid dimensions, segment sizes, kernel object and kernarg addresses).
 */
typedef struct _hsa_dispatch_packet_s {
    uint16_t header;
    uint16_t setup;
    uint16_t workgroup_size_x;
    uint16_t workgroup_size_y;
    uint16_t workgroup_size_z;
    uint16_t reserved0;
    uint32_t grid_size_x;
    uint32_t grid_size_y;
    uint32_t grid_size_z;
    uint32_t private_segment_size;
    uint32_t group_segment_size;
    uint64_t kernel_object;
    uint64_t kernarg_address;
    uint64_t reserved1;
    uint64_t completion_signal;
} _hsa_dispatch_packet_t;

/**
 * AQL agent dispatch packet: an agent-defined function call with up to
 * four arguments and a return address.
 */
typedef struct _hsa_agent_dispatch_packet_s {
    uint16_t header;
    uint16_t type;
    uint32_t reserved0;
    uint64_t return_address;
    uint64_t arg[4];
    uint64_t reserved2;
    uint64_t completion_signal;
} _hsa_agent_dispatch_packet_t;

/**
 * AQL barrier-AND packet: carries up to five dependency signals.
 */
typedef struct _hsa_barrier_and_packet_s {
    uint16_t header;
    uint16_t reserved0;
    uint32_t reserved1;
    uint64_t dep_signal[5];
    uint64_t reserved2;
    uint64_t completion_signal;
} _hsa_barrier_and_packet_t;

/**
 * AQL barrier-OR packet: same layout as the barrier-AND packet.
 */
typedef struct _hsa_barrier_or_packet_s {
    uint16_t header;
    uint16_t reserved0;
    uint32_t reserved1;
    uint64_t dep_signal[5];
    uint64_t reserved2;
    uint64_t completion_signal;
} _hsa_barrier_or_packet_t;

#endif // __DEV_HSA_HSA_PACKET__

View File

@@ -0,0 +1,654 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Eric van Tassell
*/
#include "dev/hsa/hsa_packet_processor.hh"
#include <cstring>
#include "base/chunk_generator.hh"
#include "base/compiler.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_device.hh"
#include "dev/hsa/hsa_device.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hw_scheduler.hh"
#include "mem/packet_access.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
#include "sim/syscall_emul_buf.hh"
#include "sim/system.hh"
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
const char* \
HSAPacketProcessor::XEVENT::description() const \
{ \
return #XEVENT; \
}
#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
HSA_PACKET_HEADER_TYPE) & (HSA_PACKET_HEADER_WIDTH_TYPE - 1)))
HSAPP_EVENT_DESCRIPTION_GENERATOR(UpdateReadDispIdDmaEvent)
HSAPP_EVENT_DESCRIPTION_GENERATOR(CmdQueueCmdDmaEvent)
HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
HSAPP_EVENT_DESCRIPTION_GENERATOR(DepSignalsReadDmaEvent)
/**
 * Construct the packet processor: PIO covers one page of doorbells, and
 * one registered-queue-list entry is pre-allocated per HW queue.
 */
HSAPacketProcessor::HSAPacketProcessor(const Params *p)
    : DmaDevice(p), numHWQueues(p->numHWQueues), pioAddr(p->pioAddr),
      pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p->pktProcessDelay)
{
    DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
    // The HW scheduler maps registered software queues onto the fixed
    // pool of HW queues, with wakeupDelay as its scheduling quantum.
    hwSchdlr = new HWScheduler(this, p->wakeupDelay);
    regdQList.resize(numHWQueues);
    for (int i = 0; i < numHWQueues; i++) {
        regdQList[i] = new RQLEntry(this, i);
    }
}
/**
 * Destructor: free every registered-queue-list entry allocated in the
 * constructor.
 */
HSAPacketProcessor::~HSAPacketProcessor()
{
    for (size_t i = 0; i < regdQList.size(); ++i) {
        delete regdQList[i];
    }
}
/**
 * Remove the queue identified by `queue_id` from the HW scheduler.
 */
void
HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id)
{
    hwSchdlr->unregisterQueue(queue_id);
}
/**
 * Register a new software queue with the HW scheduler, recording the
 * host's read-index pointer, the ring-buffer base, its ID and size.
 */
void
HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                                       uint64_t basePointer,
                                       uint64_t queue_id,
                                       uint32_t size)
{
    // Fixed the truncated "ze" label in the trace message; it reports
    // the queue size.
    DPRINTF(HSAPacketProcessor,
            "%s:base = %p, qID = %d, size = %d\n", __FUNCTION__,
            (void *)basePointer, queue_id, size);
    hwSchdlr->registerNewQueue(hostReadIndexPointer,
                               basePointer, queue_id, size);
}
/**
 * The packet processor exposes a single PIO range covering its
 * doorbell page.
 */
AddrRangeList
HSAPacketProcessor::getAddrRanges() const
{
    assert(pioSize != 0);
    return AddrRangeList { RangeSize(pioAddr, pioSize) };
}
// Basically only processes writes to the queue doorbell register.
Tick
HSAPacketProcessor::write(Packet *pkt)
{
    // The access must fall within this device's doorbell PIO range.
    assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);

    // TODO: How to get pid??
    Addr M5_VAR_USED daddr = pkt->getAddr() - pioAddr;
    DPRINTF(HSAPacketProcessor,
            "%s: write of size %d to reg-offset %d (0x%x)\n",
            __FUNCTION__, pkt->getSize(), daddr, daddr);

    uint32_t doorbell_reg = pkt->get<uint32_t>();
    DPRINTF(HSAPacketProcessor,
            "%s: write data 0x%x to offset %d (0x%x)\n",
            __FUNCTION__, doorbell_reg, daddr, daddr);

    // Forward the doorbell offset and written value to the HW scheduler;
    // presumably the value is the queue's new write index — see
    // HWScheduler::write to confirm.
    hwSchdlr->write(daddr, doorbell_reg);
    pkt->makeAtomicResponse();
    return pioDelay;
}
// Reads to the doorbell region are not supported: respond, but flag the
// access as a bad address.
Tick
HSAPacketProcessor::read(Packet *pkt)
{
    pkt->makeAtomicResponse();
    pkt->setBadAddress();
    return pioDelay;
}
/**
 * Translate a host virtual address to a physical one, or fatal if no
 * mapping exists (page faults are not supported here).
 */
void
HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr)
{
    // Grab the process and try to translate the virtual address with it; with
    // new extensions, it will likely be wrong to just arbitrarily grab context
    // zero.
    auto process = sys->getThreadContext(0)->getProcessPtr();
    auto mem_state = process->getMemState();
    if (!mem_state->translate(vaddr, paddr))
        fatal("failed translation: vaddr 0x%x\n", vaddr);
}
/**
 * Page-by-page DMA over a host virtual address range: each PAGE_SIZE
 * chunk is translated to its physical frame and issued as a separate
 * DMA, with `event` attached only to the final chunk since transfers
 * complete in order.
 */
void
HSAPacketProcessor::dmaVirt(DmaFnPtr dmaFn, Addr addr, unsigned size,
                            Event *event, void *data, Tick delay)
{
    if (size == 0) {
        // NOTE(review): unlike HSADevice::dmaVirt, `event` is scheduled
        // unconditionally here; confirm no caller passes nullptr with
        // size == 0.
        schedule(event, curTick() + delay);
        return;
    }
    // move the buffer data pointer with the chunks
    uint8_t *loc_data = (uint8_t*)data;
    for (ChunkGenerator gen(addr, size, PAGE_SIZE); !gen.done(); gen.next()) {
        Addr phys;
        // translate pages into their corresponding frames
        translateOrDie(gen.addr(), phys);
        // only send event on last transfer; transfers complete in-order
        Event *ev = gen.last() ? event : NULL;
        (this->*dmaFn)(phys, gen.size(), ev, loc_data, delay);
        loc_data += gen.size();
    }
}
/**
 * DMA-read `size` bytes from host virtual address `host_addr` into
 * `data`; `event` fires when the final chunk completes. See dmaVirt().
 */
void
HSAPacketProcessor::dmaReadVirt(Addr host_addr, unsigned size,
                                Event *event, void *data, Tick delay)
{
    DPRINTF(HSAPacketProcessor,
            "%s:host_addr = 0x%lx, size = %d\n", __FUNCTION__, host_addr, size);
    dmaVirt(&DmaDevice::dmaRead, host_addr, size, event, data, delay);
}
/**
 * DMA-write `size` bytes from `data` to host virtual address
 * `host_addr`; `event` fires when the final chunk completes. See
 * dmaVirt().
 */
void
HSAPacketProcessor::dmaWriteVirt(Addr host_addr, unsigned size,
                                 Event *event, void *data, Tick delay)
{
    dmaVirt(&DmaDevice::dmaWrite, host_addr, size, event, data, delay);
}
/**
 * DMA-completion event used when writing the updated read index back to
 * the host; auto-deletes itself after firing.
 */
HSAPacketProcessor::UpdateReadDispIdDmaEvent::
UpdateReadDispIdDmaEvent()
    : Event(Default_Pri, AutoDelete)
{
    DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
    // NOTE(review): AutoDelete is already passed to the Event base
    // constructor above; this setFlags() call looks redundant — confirm.
    setFlags(AutoDelete);
}
/**
 * Write the current AQL read index for active-list queue `rl_idx` back
 * to the host's read-index pointer; then, if the host's write index has
 * advanced past what we have locally buffered, fetch more commands.
 */
void
HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
{
    AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
    HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
    // Auto-deleting completion event for the read-index write-back.
    auto *dmaEvent = new UpdateReadDispIdDmaEvent();

    DPRINTF(HSAPacketProcessor,
            "%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
    // DMA the local read index out to the host-visible read pointer.
    dmaWriteVirt((Addr)qDesc->hostReadIndexPtr,
                 sizeof(aqlbuf->rdIdx()),
                 dmaEvent, aqlbuf->rdIdxPtr());
    DPRINTF(HSAPacketProcessor,
            "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
            " q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
            qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
            qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
    // The host has published packets we have not yet buffered; fetch them.
    if (qDesc->writeIndex != aqlbuf->wrIdx()) {
        getCommandsFromHost(pid, rl_idx);
    }
}
HSAPacketProcessor::CmdQueueCmdDmaEvent::
    CmdQueueCmdDmaEvent(HSAPacketProcessor *_hsaPP, int _pid, bool _isRead,
                        uint32_t _ix_start, unsigned _num_pkts,
                        dma_series_ctx *_series_ctx, void *_dest_4debug)
    : Event(Default_Pri, AutoDelete), hsaPP(_hsaPP), pid(_pid), isRead(_isRead),
      ix_start(_ix_start), num_pkts(_num_pkts), series_ctx(_series_ctx),
      dest_4debug(_dest_4debug)
{
    // Completion event for one chunk of an AQL packet-fetch DMA series.
    // AutoDelete is already set by the Event base-class constructor, so
    // the original redundant setFlags(AutoDelete) call has been removed.
    DPRINTF(HSAPacketProcessor, "%s, ix = %d, npkts = %d," \
            "active list ID = %d\n", __FUNCTION__,
            _ix_start, num_pkts, series_ctx->rl_idx);
}
void
HSAPacketProcessor::CmdQueueCmdDmaEvent::process()
{
    // Invoked when one chunk of a packet-fetch DMA series completes.
    // The shared series_ctx tracks how many packets of the series are
    // still outstanding; chunks may complete in any order.
    uint32_t rl_idx = series_ctx->rl_idx;
    AQLRingBuffer *aqlRingBuffer M5_VAR_USED =
        hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
    HSAQueueDescriptor* qDesc =
        hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
    DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d," \
            " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
            ix_start, num_pkts, series_ctx->pkts_2_go,
            rl_idx);
    if (isRead) {
        series_ctx->pkts_2_go -= num_pkts;
        if (series_ctx->pkts_2_go == 0) {
            // Mark DMA as completed
            qDesc->dmaInProgress = false;
            DPRINTF(HSAPacketProcessor,
                    "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
                    " dispIdx %d, active list ID = %d\n",
                    __FUNCTION__, aqlRingBuffer->rdIdx(),
                    aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);
            // All packets of the series have landed in the AQL buffer:
            // schedule queue wakeup so they get dispatched.
            hsaPP->schedAQLProcessing(rl_idx);
            // The last completing chunk owns and frees the series context.
            delete series_ctx;
        }
    }
}
void
HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
{
    // Arrange a packet-processing wakeup for this queue one
    // pktProcessDelay in the future, unless one is already pending.
    RQLEntry *entry = regdQList[rl_idx];
    if (entry->aqlProcessEvent.scheduled()) {
        DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
        return;
    }
    Tick wakeup_tick = curTick() + pktProcessDelay;
    schedule(entry->aqlProcessEvent, wakeup_tick);
    DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
            wakeup_tick);
}
bool
HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
{
    // Process one AQL packet from queue rl_idx: dispatch/vendor packets
    // are forwarded to the HSA device, barrier-AND packets are resolved
    // locally by reading their dependency signals from host memory.
    // Returns true iff the packet was consumed (i.e. the dispatch index
    // may advance).
    bool is_submitted = false;
    SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
    // Dependency signals are not read yet. And this can only be a retry.
    // The retry logic will schedule the packet processor wakeup
    if (dep_sgnl_rd_st->pendingReads != 0) {
        return false;
    }
    // `pkt` can be typecasted to any type of AQL packet since they all
    // have header information at offset zero
    auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
    hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
    if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
        DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
        hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = true;
    } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
        DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
        hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = true;
    } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
        DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
        bool isReady = true;
        // Loop through all the completion signals to see if this barrier
        // packet is ready.
        for (int i = 0; i < NumSignalsPerBarrier; i++) {
            // dep_signal = zero implies no signal connected
            if (bar_and_pkt->dep_signal[i]) {
                // The signal value is aligned 8 bytes from
                // the actual handle in the runtime
                uint64_t signal_addr =
                    (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
                hsa_signal_value_t *signal_val =
                    &(dep_sgnl_rd_st->values[i]);
                DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]" \
                        " , sig addr %x, value %d active list ID = %d\n",
                        __FUNCTION__, i, signal_addr,
                        *signal_val, rl_idx);
                // The if condition will be executed every time except the
                // very first time this barrier packet is encountered.
                if (dep_sgnl_rd_st->allRead) {
                    if (*signal_val != 0) {
                        // This signal is not yet ready, read it again
                        isReady = false;
                        DepSignalsReadDmaEvent *sgnl_rd_evnt =
                            new DepSignalsReadDmaEvent(dep_sgnl_rd_st);
                        dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
                                    sgnl_rd_evnt, signal_val);
                        dep_sgnl_rd_st->pendingReads++;
                        DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
                                " active list %d\n", __FUNCTION__,
                                dep_sgnl_rd_st->pendingReads, rl_idx);
                    }
                } else {
                    // First encounter: no cached value yet, so the signal
                    // must be read from host memory before it can be
                    // evaluated.
                    isReady = false;
                    DepSignalsReadDmaEvent *sgnl_rd_evnt =
                        new DepSignalsReadDmaEvent(dep_sgnl_rd_st);
                    dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
                                sgnl_rd_evnt, signal_val);
                    dep_sgnl_rd_st->pendingReads++;
                    DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
                            " active list %d\n", __FUNCTION__,
                            dep_sgnl_rd_st->pendingReads, rl_idx);
                }
            }
        }
        if (isReady) {
            assert(dep_sgnl_rd_st->pendingReads == 0);
            DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed" \
                    " active list ID = %d\n", __FUNCTION__, rl_idx);
            // TODO: Completion signal of barrier packet to be
            // atomically decremented here
            finishPkt((void*)bar_and_pkt, rl_idx);
            is_submitted = true;
            // Reset signal values
            dep_sgnl_rd_st->resetSigVals();
            // The completion signal is connected
            if (bar_and_pkt->completion_signal != 0) {
                // The signal value is aligned 8 bytes
                // from the actual handle in the runtime
                uint64_t signal_addr =
                    (uint64_t) (((uint64_t *)
                                 bar_and_pkt->completion_signal) + 1);
                DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
                        " completion signal: %x!\n", signal_addr);
                /**
                 * HACK: The semantics of the HSA signal is to
                 * decrement the current signal value.
                 * I'm going to cheat here and read out
                 * the value from main memory using functional
                 * access, and then just DMA the decremented value.
                 * The reason for this is that the DMASequencer does
                 * not support atomic operations.
                 */
                auto tc = sys->getThreadContext(0);
                auto process = tc->getProcessPtr();
                auto mem_state = process->getMemState();
                auto &virt_proxy = mem_state->getVirtProxy();
                TypedBufferArg<uint64_t> prev_signal(signal_addr);
                prev_signal.copyIn(virt_proxy);
                // NOTE(review): new_signal is heap-allocated and the DMA
                // write is issued with no completion event, so nothing
                // ever frees it — a small leak per barrier packet.
                hsa_signal_value_t *new_signal = new hsa_signal_value_t;
                *new_signal = (hsa_signal_value_t) *prev_signal - 1;
                dmaWriteVirt(signal_addr,
                             sizeof(hsa_signal_value_t), NULL, new_signal, 0);
            }
        }
        if (dep_sgnl_rd_st->pendingReads > 0) {
            // At least one DepSignalsReadDmaEvent is scheduled this cycle
            dep_sgnl_rd_st->allRead = false;
            dep_sgnl_rd_st->discardRead = false;
        }
    } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
        fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
    } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
        fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
    } else {
        fatal("Unsupported packet type %d\n", pkt_type);
    }
    return is_submitted;
}
// Wakes up every fixed time interval (pktProcessDelay) and processes a single
// packet from the queue that scheduled this wakeup. If there are more
// packets in that queue, the next wakeup is scheduled.
void
HSAPacketProcessor::QueueProcessEvent::process()
{
    AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
    DPRINTF(HSAPacketProcessor,
            "%s: Qwakeup , rdIdx %d, wrIdx %d," \
            " dispIdx %d, active list ID = %d\n",
            __FUNCTION__, aqlRingBuffer->rdIdx(),
            aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
    // In the future, we may support batch processing of packets.
    // Then, we can just remove the break statements and the code
    // will support batch processing. That is why we are using a
    // "while loop" here instead of an "if" condition.
    while (hsaPP->regdQList[rqIdx]->dispPending()) {
        void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
        DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
                __FUNCTION__, aqlRingBuffer->dispIdx());
        Addr host_addr = aqlRingBuffer->hostDispAddr();
        if (hsaPP->processPkt(pkt, rqIdx, host_addr)) {
            // Packet consumed; advance dispatch pointer and, if more
            // packets are pending, schedule the next wakeup.
            aqlRingBuffer->incDispIdx(1);
            DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
                    __FUNCTION__, aqlRingBuffer->dispIdx());
            if (hsaPP->regdQList[rqIdx]->dispPending()) {
                hsaPP->schedAQLProcessing(rqIdx);
            }
            break;
        } else {
            // This queue is blocked, schedule a processing event
            hsaPP->schedAQLProcessing(rqIdx);
            break;
        }
    }
}
void
HSAPacketProcessor::SignalState::handleReadDMA()
{
    // One outstanding dependency-signal read has completed.
    assert(pendingReads > 0);
    if (--pendingReads == 0) {
        // Every read of this barrier's dependency signals is now in. If
        // the queue was unmapped in the meantime the values are stale,
        // so throw them away.
        allRead = true;
        if (discardRead)
            resetSigVals();
    }
}
void
HSAPacketProcessor::getCommandsFromHost(int pid, uint32_t rl_idx)
{
    // Fetch pending AQL packets from the host (user-mode) queue into the
    // internal AQL ring buffer via a series of DMA reads, splitting the
    // copy at host-queue wrap, internal-buffer wrap, and buffer-space
    // boundaries.
    HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
    AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
    DPRINTF(HSAPacketProcessor,
            "%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
            " doorbell(%d)[0x%x] \n",
            __FUNCTION__, qDesc->readIndex,
            qDesc->writeIndex, pid, qDesc->doorbellPointer);
    if (qDesc->dmaInProgress) {
        // we'll try again when this dma transfer completes in updateReadIndex
        return;
    }
    // Number of packets waiting in the host's user-mode queue.
    uint32_t num_umq = qDesc->spaceUsed();
    if (num_umq == 0)
        return; // nothing to be gotten
    uint32_t umq_nxt = qDesc->readIndex;
    // Total AQL buffer size
    uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
    // Available AQL buffer size. If the available buffer is less than
    // demanded, number of available buffer is returned
    uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
    qDesc->readIndex += got_aql_buf;
    uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
        ttl_aql_buf;
    dma_series_ctx *series_ctx = NULL;
    DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
            "dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
            ttl_aql_buf, dma_start_ix, num_umq);
    if (got_aql_buf == 0) {
        // we'll try again when some dma bufs are freed in freeEntry
        qDesc->stalledOnDmaBufAvailability = true;
        return;
    } else {
        qDesc->stalledOnDmaBufAvailability = false;
    }
    // Packets that fit before the internal ring buffer wraps.
    uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
    while (got_aql_buf != 0 && num_umq != 0) {
        // NOTE(review): this mixes a packet index (umq_nxt) with
        // objSize(), which is a byte size — the wrap distance for the
        // host queue would be expected to use numObjs(); confirm.
        uint32_t umq_b4_wrap = qDesc->numObjs() -
            (umq_nxt % qDesc->objSize());
        // Largest contiguous transfer that crosses no wrap boundary and
        // exceeds neither the pending packets nor the granted buffers.
        uint32_t num_2_xfer
            = std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
        if (!series_ctx) {
            qDesc->dmaInProgress = true;
            // Shared context that ties the chunks of this DMA series
            // together; freed by the last completing CmdQueueCmdDmaEvent.
            series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
                                            dma_start_ix, rl_idx);
        }
        void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
        CmdQueueCmdDmaEvent *dmaEvent
            = new CmdQueueCmdDmaEvent(this, pid, true, dma_start_ix,
                                      num_2_xfer, series_ctx, aql_buf);
        DPRINTF(HSAPacketProcessor,
                "%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
                __FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
        dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
                    dmaEvent, aql_buf);
        // Remember each packet's host address so the device can expose
        // it to the kernel later.
        aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
                                        dma_start_ix);
        num_umq -= num_2_xfer;
        got_aql_buf -= num_2_xfer;
        dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
        umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
        if (got_aql_buf == 0 && num_umq != 0) {
            // There are more packets in the queue but
            // not enough DMA buffers. Set the stalledOnDmaBufAvailability,
            // we will try again in freeEntry
            qDesc->stalledOnDmaBufAvailability = true;
        }
    }
}
void
HSAPacketProcessor::displayQueueDescriptor(int pid, uint32_t rl_idx)
{
    // Debug helper: dump the state of a registered queue's descriptor.
    HSAQueueDescriptor* M5_VAR_USED qDesc = regdQList[rl_idx]->qCntxt.qDesc;
    DPRINTF(HSAPacketProcessor,
            "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
            "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
            __FUNCTION__, pid, qDesc->basePointer,
            qDesc->doorbellPointer, qDesc->writeIndex,
            qDesc->readIndex, qDesc->numElts);
}
AQLRingBuffer::AQLRingBuffer(uint32_t size,
                             const std::string name)
    : _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
{
    // Size every per-entry bookkeeping structure to the buffer capacity.
    _aqlBuf.resize(size);
    _aqlComplete.resize(size);
    _hostDispAddresses.resize(size);
    // Each slot starts out holding no packet (invalid header) and is
    // marked not-yet-complete.
    for (uint32_t i = 0; i < size; ++i) {
        _aqlBuf[i].header = HSA_PACKET_TYPE_INVALID;
        _aqlComplete[i] = false;
    }
}
bool
AQLRingBuffer::freeEntry(void *pkt)
{
    // Mark the given packet's slot complete; pkt points into _aqlBuf, so
    // pointer arithmetic recovers its index.
    _aqlComplete[(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data()] = true;
    DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "\
            " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
            (hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data(),
            nFree(), wrIdx(), rdIdx());
    // Packets can complete out-of-order. This code "retires" packets in-order
    // by updating the read pointer in the MQD when a contiguous chunk of
    // packets have finished.
    uint32_t old_rdIdx = rdIdx();
    while (_aqlComplete[rdIdx() % numObjs()]) {
        _aqlComplete[rdIdx() % numObjs()] = false;
        _aqlBuf[rdIdx() % numObjs()].header = HSA_PACKET_TYPE_INVALID;
        incRdIdx(1);
    }
    // True iff the read pointer advanced (i.e. at least one packet retired).
    return (old_rdIdx != rdIdx());
}
void
HSAPacketProcessor::setDevice(HSADevice *dev)
{
    // Attach the HSA device (dispatcher) that submitted packets go to.
    hsa_device = dev;
}
int
AQLRingBuffer::allocEntry(uint32_t nBufReq)
{
    // Reserve up to nBufReq entries for an incoming DMA series. Grants
    // fewer entries when the buffer is nearly full, and zero when it is
    // completely full.
    DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);
    uint32_t avail = nFree();
    if (!avail) {
        DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
        return 0;
    }
    // Clamp the grant to what is actually free.
    if (nBufReq > avail)
        nBufReq = avail;
    DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__, wrIdx());
    incWrIdx(nBufReq);
    DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
            __FUNCTION__, nBufReq, wrIdx());
    return nBufReq;
}
HSAPacketProcessor *
HSAPacketProcessorParams::create()
{
    // Standard gem5 SimObject factory: build the packet processor from
    // its generated parameter object.
    return new HSAPacketProcessor(this);
}
void
HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
{
    // Called when a packet has finished executing: free its AQL buffer
    // slot and, if the read pointer advanced, publish the new read index
    // to the host.
    HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
    if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
        updateReadIndex(0, rl_idx);
    DPRINTF(HSAPacketProcessor,
            "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
            " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
            __FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
            qDesc->spaceUsed(), qDesc->numElts,
            qDesc->stalledOnDmaBufAvailability? "true" : "false",
            qDesc->isEmpty()? "true" : "false", rl_idx);
    // DMA buffer is freed, check the queue to see if there are DMA
    // accesses blocked because of non-availability of DMA buffer
    if (qDesc->stalledOnDmaBufAvailability) {
        assert(!qDesc->isEmpty());
        getCommandsFromHost(0, rl_idx); // TODO:assign correct pid
                                        // when implementing
                                        // multi-process support
    }
}

View File

@@ -0,0 +1,362 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Eric van Tassell
*/
#ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__
#define __DEV_HSA_HSA_PACKET_PROCESSOR__
#include <cstdint>
#include <queue>
#include "dev/dma_device.hh"
#include "dev/hsa/hsa.h"
#include "dev/hsa/hsa_queue.hh"
#include "params/HSAPacketProcessor.hh"
#define AQL_PACKET_SIZE 64
#define PAGE_SIZE 4096
#define NUM_DMA_BUFS 16
#define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS)
// HSA runtime supports only 5 signals per barrier packet
#define NumSignalsPerBarrier 5
// This define is copied from hsa runtime (libhsakmt/src/libhsakmt.h)
// This is the mapping function used by runtime for mapping
// queueID to doorbell address
#define VOID_PTR_ADD32(ptr,n) (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
class HSADevice;
class HWScheduler;
// Our internal representation of an HSA queue
// Our internal representation of an HSA queue: mirrors the location and
// producer/consumer state of a host-resident AQL queue. Indices are
// counted in packets, not bytes.
class HSAQueueDescriptor {
  public:
    uint64_t basePointer;       // host VA of the queue's packet storage
    uint64_t doorbellPointer;   // doorbell offset associated with this queue
    uint64_t writeIndex;        // producer index (packets)
    uint64_t readIndex;         // consumer index (packets)
    uint32_t numElts;           // queue capacity (packets)
    uint64_t hostReadIndexPtr;  // host VA where readIndex is written back
    // Set when a fetch could not get internal DMA buffers; retried when
    // buffers are freed.
    bool stalledOnDmaBufAvailability;
    // Set while a packet-fetch DMA series for this queue is outstanding.
    bool dmaInProgress;

    HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
                       uint64_t hri_ptr, uint32_t size)
        : basePointer(base_ptr), doorbellPointer(db_ptr),
          writeIndex(0), readIndex(0),
          numElts(size), hostReadIndexPtr(hri_ptr),
          stalledOnDmaBufAvailability(false),
          dmaInProgress(false)
    { }
    // Unconsumed capacity, in packets.
    uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
    // Packets produced but not yet consumed.
    uint64_t spaceUsed() { return writeIndex - readIndex; }
    uint32_t objSize() { return AQL_PACKET_SIZE; }
    uint32_t numObjs() { return numElts; }
    bool isFull() { return spaceRemaining() == 0; }
    bool isEmpty() { return spaceRemaining() == numElts; }

    // Host VA of packet `ix` (index wraps modulo the queue capacity).
    uint64_t ptr(uint64_t ix)
    {
        return basePointer +
            ((ix % numElts) * objSize());
    }
};
/**
* Internal ring buffer which is used to prefetch/store copies of the
* in-memory HSA ring buffer. Each packet in the queue has three implicit
* states tracked by a packet's relative location to the write, read, and
* dispatch pointers.
*
* FREE: Entry is empty
* ALLOCATED: Entry has been allocated for a packet, but the DMA has not
* yet completed
* SUBMITTED: Packet has been submitted to the HSADevice, but has not
* yet completed
*/
class AQLRingBuffer
{
  private:
    std::vector<hsa_kernel_dispatch_packet_t> _aqlBuf;
    std::string _name;
    // Host VA that each buffered packet was fetched from (parallel to
    // _aqlBuf).
    std::vector<Addr> _hostDispAddresses;
    // Per-slot completion flag used to retire packets in order even when
    // they complete out of order (see freeEntry()).
    std::vector<bool> _aqlComplete;
    uint64_t _wrIdx;    // Points to next write location
    uint64_t _rdIdx;    // Read pointer of AQL buffer
    uint64_t _dispIdx;  // Dispatch pointer of AQL buffer

  public:
    std::string name() {return _name;}
    AQLRingBuffer(uint32_t size, const std::string name);
    int allocEntry(uint32_t nBufReq);
    bool freeEntry(void *pkt);

    /**
     * the kernel may try to read from the dispatch packet,
     * so we need to keep the host address that corresponds
     * to each of the dispatch packets this AQL buffer is
     * storing. when we call submitPkt(), we send along the
     * corresponding host address for the packet so the
     * wavefront can properly initialize its SGPRs - which
     * may include a pointer to the dispatch packet
     */
    void
    saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
    {
        for (int i = 0; i < num_pkts; ++i) {
            _hostDispAddresses[ix % numObjs()] = host_pkt_addr + i * objSize();
            ++ix;
        }
    }

    // Host address of the packet at the current dispatch index.
    Addr
    hostDispAddr() const
    {
        return _hostDispAddresses[dispIdx() % numObjs()];
    }

    // True when the packet at the dispatch index has been written (its
    // header type is valid) and the dispatch pointer trails the write
    // pointer.
    bool
    dispPending() const
    {
        int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header
                           >> HSA_PACKET_HEADER_TYPE) &
                          ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
        return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
    }

    uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
    void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
    uint32_t numObjs() const { return _aqlBuf.size(); };
    uint32_t objSize() const { return AQL_PACKET_SIZE; }
    uint64_t dispIdx() const { return _dispIdx; }
    uint64_t wrIdx() const { return _wrIdx; }
    uint64_t rdIdx() const { return _rdIdx; }
    uint64_t* rdIdxPtr() { return &_rdIdx; }
    void incRdIdx(uint64_t value) { _rdIdx += value; }
    void incWrIdx(uint64_t value) { _wrIdx += value; }
    void incDispIdx(uint64_t value) { _dispIdx += value; }
};
// Pairing of a queue descriptor with its prefetch ring buffer; this is
// the unit that gets mapped onto (and unmapped from) a hardware queue
// slot by the scheduler.
typedef struct QueueContext {
    HSAQueueDescriptor* qDesc;
    AQLRingBuffer* aqlBuf;
    QueueContext(HSAQueueDescriptor* q_desc,
                 AQLRingBuffer* aql_buf)
        : qDesc(q_desc), aqlBuf(aql_buf)
    {}
    // Default context represents an empty (unmapped) hardware slot.
    QueueContext() : qDesc(NULL), aqlBuf(NULL) {}
} QCntxt;
/**
 * The HSA packet processor (HSAPP) maintains AQL packet queues, fetches
 * AQL packets from host memory via DMA, and submits them to the attached
 * HSA device for execution.
 */
class HSAPacketProcessor: public DmaDevice
{
    friend class HWScheduler;

  protected:
    typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
    // The HSA device (dispatcher) that submitted packets go to.
    HSADevice *hsa_device;
    HWScheduler *hwSchdlr;

    // Structure to store the read values of dependency signals
    // from shared memory. Also used for tracking the status of
    // those reads while they are in progress
    class SignalState
    {
      public:
        SignalState()
            : pendingReads(0), allRead(false), discardRead(false)
        {
            values.resize(NumSignalsPerBarrier);
        }
        void handleReadDMA();
        int pendingReads;
        bool allRead;
        // If this queue is unmapped when there are pending reads, then
        // the pending reads have to be discarded.
        bool discardRead;
        // values stores the value of already read dependency signals
        std::vector<hsa_signal_value_t> values;
        void
        resetSigVals()
        {
            std::fill(values.begin(), values.end(), 1);
        }
    };

    // Per-queue wakeup event that drives packet dispatch for one
    // registered queue.
    class QueueProcessEvent : public Event
    {
      private:
        HSAPacketProcessor *hsaPP;
        uint32_t rqIdx;
      public:
        QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
            : Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx)
        {}
        virtual void process();
        virtual const char *description() const;
    };

    // Registered queue list entry; each entry has one queueDescriptor and
    // associated AQL buffer
    class RQLEntry
    {
      public:
        RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
            : aqlProcessEvent(hsaPP, rqIdx) {}
        QCntxt qCntxt;
        // dispPending() already returns bool; the original redundant
        // "> 0" comparison has been dropped.
        bool dispPending() { return qCntxt.aqlBuf->dispPending(); }
        SignalState depSignalRdState;
        QueueProcessEvent aqlProcessEvent;
    };

    // Keeps track of queueDescriptors of registered queues
    std::vector<class RQLEntry *> regdQList;

    void translateOrDie(Addr vaddr, Addr &paddr);
    void dmaVirt(DmaFnPtr, Addr host_addr, unsigned size, Event *event,
                 void *data, Tick delay = 0);
    void dmaReadVirt(Addr host_addr, unsigned size, Event *event,
                     void *data, Tick delay = 0);
    void dmaWriteVirt(Addr host_addr, unsigned size, Event *event,
                      void *data, Tick delay = 0);
    bool processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
    void displayQueueDescriptor(int pid, uint32_t rl_idx);

  public:
    HSAQueueDescriptor*
    getQueueDesc(uint32_t queId)
    {
        return regdQList.at(queId)->qCntxt.qDesc;
    }
    class RQLEntry*
    getRegdListEntry(uint32_t queId)
    {
        return regdQList.at(queId);
    }

    int numHWQueues;
    Addr pioAddr;
    Addr pioSize;
    Tick pioDelay;
    const Tick pktProcessDelay;

    typedef HSAPacketProcessorParams Params;
    HSAPacketProcessor(const Params *p);
    ~HSAPacketProcessor();
    void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                            uint64_t basePointer,
                            uint64_t queue_id,
                            uint32_t size);
    void unsetDeviceQueueDesc(uint64_t queue_id);
    void setDevice(HSADevice * dev);
    void updateReadIndex(int, uint32_t);
    void getCommandsFromHost(int pid, uint32_t rl_idx);

    // PIO interface
    virtual Tick read(Packet*);
    virtual Tick write(Packet*);
    virtual AddrRangeList getAddrRanges() const;
    void finishPkt(void *pkt, uint32_t rl_idx);
    void finishPkt(void *pkt) { finishPkt(pkt, 0); }
    void schedAQLProcessing(uint32_t rl_idx);

    // Completion event for a DMA read of one barrier-packet dependency
    // signal.
    class DepSignalsReadDmaEvent : public Event
    {
      protected:
        SignalState *signalState;
      public:
        DepSignalsReadDmaEvent(SignalState *ss)
            : Event(Default_Pri, AutoDelete), signalState(ss)
        {}
        virtual void process() { signalState->handleReadDMA(); }
        virtual const char *description() const;
    };

    /**
     * this event is used to update the read_disp_id field (the read pointer)
     * of the MQD, which is how the host code knows the status of the HQD's
     * read pointer
     */
    class UpdateReadDispIdDmaEvent : public Event
    {
      public:
        UpdateReadDispIdDmaEvent();
        void process() override { }
        const char *description() const override;
    };

    /**
     * Calls getCurrentEntry once the queueEntry has been dmaRead.
     */
    struct dma_series_ctx {
        // deal with the fact dma ops can complete out of issue order
        uint32_t pkts_ttl;      // total packets in this DMA series
        uint32_t pkts_2_go;     // packets still outstanding
        uint32_t start_ix;      // starting index of the series in the buffer
        uint32_t rl_idx;        // registered-queue (active list) index

        dma_series_ctx(uint32_t _pkts_ttl,
                       uint32_t _pkts_2_go,
                       uint32_t _start_ix,
                       uint32_t _rl_idx)
            // BUGFIX: pkts_ttl is now initialized from its own _pkts_ttl
            // parameter; the original initialized it from _pkts_2_go,
            // silently ignoring _pkts_ttl.
            : pkts_ttl(_pkts_ttl), pkts_2_go(_pkts_2_go),
              start_ix(_start_ix), rl_idx(_rl_idx)
        {};
        ~dma_series_ctx() {};
    };

    // Completion event for one chunk of a packet-fetch DMA series.
    class CmdQueueCmdDmaEvent : public Event
    {
      protected:
        HSAPacketProcessor *hsaPP;
        int pid;
        bool isRead;
        uint32_t ix_start;
        uint num_pkts;
        dma_series_ctx *series_ctx;
        void *dest_4debug;

      public:
        CmdQueueCmdDmaEvent(HSAPacketProcessor *hsaPP, int pid, bool isRead,
                            uint32_t dma_buf_ix, uint num_bufs,
                            dma_series_ctx *series_ctx, void *dest_4debug);
        virtual void process();
        virtual const char *description() const;
    };
};
#endif // __DEV_HSA_HSA_PACKET_PROCESSOR__

92
src/dev/hsa/hsa_queue.hh Normal file
View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
#ifndef __DEV_HSA_HSA_QUEUE_HH__
#define __DEV_HSA_HSA_QUEUE_HH__
#include <cstdint>
// Queue scheduling policy; underscore-prefixed to avoid clashing with
// the runtime's own hsa_queue_type_t.
typedef enum
{
    _HSA_QUEUE_TYPE_MULTI = 0,   // multiple producers may write the queue
    _HSA_QUEUE_TYPE_SINGLE = 1   // single producer
} _hsa_queue_type_t;

// Opaque HSA signal handle.
typedef struct _hsa_signal_s
{
    uint64_t handle;
} _hsa_signal_t;

// User-mode AQL queue structure; presumably mirrors the ROCm runtime's
// hsa_queue_t layout (these headers are adapted from the runtime) —
// field offsets must match what host software writes.
typedef struct _hsa_queue_s
{
    _hsa_queue_type_t type;
    uint32_t features;
    void *base_address;          // start of the queue's packet storage
    _hsa_signal_t doorbell_signal;
    uint32_t size;               // capacity, must be a power of two per HSA
    uint32_t reserved1;
    uint64_t id;
} _hsa_queue_t;

typedef uint32_t _amd_queue_properties32_t;

// AMD extension of the HSA queue: the hsa_queue_t followed by
// AMD-specific scheduling/dispatch bookkeeping. Presumably mirrors the
// runtime's amd_queue_t — field order and padding must not change.
typedef struct _amd_queue_s
{
    _hsa_queue_t hsa_queue;
    uint32_t reserved1[4];
    volatile uint64_t write_dispatch_id;   // producer index
    uint32_t group_segment_aperture_base_hi;
    uint32_t private_segment_aperture_base_hi;
    uint32_t max_cu_id;
    uint32_t max_wave_id;
    volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
    volatile uint32_t legacy_doorbell_lock;
    uint32_t reserved2[9];
    volatile uint64_t read_dispatch_id;    // consumer index
    uint32_t read_dispatch_id_field_base_byte_offset;
    uint32_t compute_tmpring_size_waves : 12;
    uint32_t compute_tmpring_size_wavesize : 13;
    uint32_t compute_tmpring_size_pad : 7;
    uint32_t scratch_resource_descriptor[4];
    uint64_t scratch_backing_memory_location;
    uint64_t scratch_backing_memory_byte_size;
    uint32_t scratch_workitem_byte_size;
    _amd_queue_properties32_t queue_properties;
    uint32_t reserved3[2];
    _hsa_signal_t queue_inactive_signal;
    uint32_t reserved4[14];
} _amd_queue_t;
#endif // __DEV_HSA_HSA_QUEUE_HH__

372
src/dev/hsa/hw_scheduler.cc Normal file
View File

@@ -0,0 +1,372 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
*/
#include "dev/hsa/hw_scheduler.hh"
#include "debug/HSAPacketProcessor.hh"
#include "mem/packet_access.hh"
// Generates the Event::description() override for a scheduler event
// class; the description string is simply the event class name.
#define HWSCHDLR_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
const char* \
HWScheduler::XEVENT::description() const \
{ \
    return #XEVENT; \
}

HWSCHDLR_EVENT_DESCRIPTION_GENERATOR(SchedulerWakeupEvent)
void
HWScheduler::SchedulerWakeupEvent::process()
{
    // Periodic scheduler tick: delegate to the owning scheduler.
    hwSchdlr->wakeup();
}
void
HWScheduler::wakeup()
{
    // The scheduler unmaps an idle queue from the
    // registered qList and maps a new queue
    // to the registered list from the active list.
    // For this implementation, an idle queue means
    // a queue that does not have any outstanding dispatch
    // at the time of this scheduler's wakeup
    contextSwitchQ();
    // Re-arm the next wakeup if queues are still waiting to be mapped.
    schedWakeup();
}
void
HWScheduler::schedWakeup()
{
    // Re-arm the scheduler while any active queue remains unregistered,
    // unless a wakeup is already pending.
    bool already_pending = schedWakeupEvent.scheduled();
    bool queues_waiting = regdListMap.size() < activeList.size();
    if (already_pending || !queues_waiting)
        return;
    hsaPP->schedule(&schedWakeupEvent, curTick() + wakeupDelay);
    DPRINTF(HSAPacketProcessor,
            "Scheduling wakeup at %lu\n", (curTick() + wakeupDelay));
}
void
HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
                              uint64_t basePointer,
                              uint64_t queue_id,
                              uint32_t size)
{
    // Validate the queue ID before mutating any scheduler state. The
    // original code performed this check only after inserting into
    // dbMap, and behind an assert with the same condition (which is
    // compiled out of fast builds).
    if (queue_id >= MAX_ACTIVE_QUEUES) {
        panic("Attempting to create a queue (queueID %d)" \
              " beyond PIO range", queue_id);
    }
    // Map queue ID to doorbell.
    // We are only using offset to pio base address as doorbell
    // We use the same mapping function used by hsa runtime to do this mapping
    Addr db_offset = (Addr)(VOID_PTR_ADD32(0, queue_id));
    if (dbMap.find(db_offset) != dbMap.end()) {
        panic("Creating an already existing queue (queueID %d)", queue_id);
    }
    // Populate doorbell map
    dbMap[db_offset] = queue_id;
    // Build the queue's descriptor and its internal prefetch buffer and
    // record them on the active list.
    HSAQueueDescriptor* q_desc =
        new HSAQueueDescriptor(basePointer, db_offset,
                               hostReadIndexPointer, size);
    AQLRingBuffer* aql_buf =
        new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name());
    QCntxt q_cntxt(q_desc, aql_buf);
    activeList[dbMap[db_offset]] = q_cntxt;
    // Check if this newly created queue can be directly mapped
    // to registered queue list
    bool M5_VAR_USED register_q = mapQIfSlotAvlbl(queue_id, aql_buf, q_desc);
    schedWakeup();
    DPRINTF(HSAPacketProcessor,
            "%s: offset = %p, qID = %d, is_regd = %s, AL size %d\n",
            __FUNCTION__, db_offset, queue_id,
            (register_q) ? "true" : "false", dbMap.size());
}
bool
HWScheduler::findEmptyHWQ()
{
    DPRINTF(HSAPacketProcessor,
            "Trying to find empty HW queue, @ %s\n", __FUNCTION__);

    // Every HW slot occupied: nothing to find.
    if (regdListMap.size() >= hsaPP->numHWQueues)
        return false;

    // Round-robin scan of the registered list starting at nextRLId; a
    // slot whose descriptor pointer is NULL is free. On success,
    // nextRLId is left pointing at the free slot.
    for (int i = 0; i < hsaPP->numHWQueues; i++) {
        if (hsaPP->getRegdListEntry(nextRLId)->qCntxt.qDesc == NULL)
            return true;
        nextRLId = (nextRLId + 1) % hsaPP->numHWQueues;
    }

    // regdListMap said a slot is free, so the scan above must succeed.
    panic("Cannot find empty queue\n");
    return false; // not reached; panic() aborts
}
bool
HWScheduler::mapQIfSlotAvlbl(uint32_t q_id, AQLRingBuffer* aql_buf,
                             HSAQueueDescriptor* q_desc)
{
    DPRINTF(HSAPacketProcessor,
            "Trying to map new queue, @ %s\n", __FUNCTION__);

    // Bail out unless a free HW slot exists; on success findEmptyHWQ()
    // leaves nextRLId pointing at that slot.
    if (!findEmptyHWQ())
        return false;

    // Install the queue context in the free slot, kick off AQL packet
    // fetching for it, and advance the round-robin pointers.
    addQCntxt(q_id, aql_buf, q_desc);
    scheduleAndWakeupMappedQ();
    updateRRVars(q_id, nextRLId);
    return true;
}
void
HWScheduler::scheduleAndWakeupMappedQ()
{
    // The freshly mapped queue may already hold AQL packets waiting to
    // be fetched; invoke the host-side fetch logic for it.
    hsaPP->getCommandsFromHost(0, nextRLId);

    // Schedule AQL processing if dispatches are pending. Use the
    // getRegdListEntry() accessor for consistency with every other
    // registered-list access in this file (previously this indexed
    // regdQList directly).
    if (hsaPP->getRegdListEntry(nextRLId)->dispPending())
        hsaPP->schedAQLProcessing(nextRLId);
}
void
HWScheduler::addQCntxt(uint32_t al_idx, AQLRingBuffer* aql_buf,
                       HSAQueueDescriptor* q_desc)
{
    // The target HW slot (nextRLId) must be empty before it is filled.
    auto *entry = hsaPP->getRegdListEntry(nextRLId);
    assert(entry->qCntxt.qDesc == NULL);
    assert(entry->qCntxt.aqlBuf == NULL);

    // Hand the queue context over to the HW slot and record the
    // active-list -> registered-list mapping.
    entry->qCntxt.qDesc = q_desc;
    entry->qCntxt.aqlBuf = aql_buf;
    regdListMap[al_idx] = nextRLId;
    DPRINTF(HSAPacketProcessor, "Mapped HSA queue %d to hw queue %d: @ %s\n",
            al_idx, nextRLId, __FUNCTION__);
}
bool
HWScheduler::contextSwitchQ()
{
    // Note: the trailing newline was missing here, which ran this trace
    // line into the next one; every other DPRINTF in this file ends
    // with '\n'.
    DPRINTF(HSAPacketProcessor,
            "Trying to map next queue, @ %s\n", __FUNCTION__);

    // Identify the next unmapped active queue; if there is nothing to
    // map, return false
    if (!findNextActiveALQ()) {
        return false;
    }

    HSAQueueDescriptor* q_desc = activeList[nextALId].qDesc;
    AQLRingBuffer* aql_buf = activeList[nextALId].aqlBuf;

    // If there is an empty HW slot available, use that slot
    if (mapQIfSlotAvlbl(nextALId, aql_buf, q_desc)) {
        return true;
    }

    // There is no empty slot to map this queue. So, we need to
    // unmap a queue from registered list and find a slot.
    // If nothing can be unmapped now, return false
    if (!unmapQFromRQ()) {
        return false;
    }

    // One queue is unmapped from the registered list and its slot index
    // is stored in nextRLId. Map this queue to that unmapped slot and
    // start fetching its packets.
    addQCntxt(nextALId, aql_buf, q_desc);
    scheduleAndWakeupMappedQ();
    updateRRVars(nextALId, nextRLId);
    return true;
}
void
HWScheduler::updateRRVars(uint32_t al_idx, uint32_t rl_idx)
{
    // Advance the round-robin scan positions past the entries that were
    // just used: al_idx in the active list, rl_idx in the registered list.
    nextALId = (al_idx + 1) % MAX_ACTIVE_QUEUES;
    nextRLId = (rl_idx + 1) % hsaPP->numHWQueues;
}
bool
HWScheduler::unmapQFromRQ()
{
    // Look for an idle registered queue; without one, nothing can be
    // unmapped right now.
    if (!findNextIdleRLQ())
        return false;

    // findNextIdleRLQ() set nextRLId to the idle slot; tear it down.
    removeQCntxt();
    return true;
}
void
HWScheduler::removeQCntxt()
{
    // The nextRLId gives the registered queue that is to be unmapped.
    // We can find the corresponding queue_id from the doorbellPointer
    Addr db_offset =
        hsaPP->getRegdListEntry(nextRLId)->qCntxt.qDesc->doorbellPointer;
    hsaPP->getRegdListEntry(nextRLId)->qCntxt.qDesc = NULL;
    hsaPP->getRegdListEntry(nextRLId)->qCntxt.aqlBuf = NULL;
    // Here, we are unmapping a queue without waiting for the outstanding
    // dependency signal reads to complete. We will discard any outstanding
    // reads and will reset the signal values here.
    hsaPP->getRegdListEntry(nextRLId)->depSignalRdState.discardRead = true;
    hsaPP->getRegdListEntry(nextRLId)->depSignalRdState.resetSigVals();
    uint32_t al_idx = dbMap[db_offset];
    assert(regdListMap[al_idx] == nextRLId);
    // Unmap from regdListMap.
    regdListMap.erase(al_idx);
}
bool
HWScheduler::findNextActiveALQ()
{
    // Round-robin over the active list starting just after the last
    // queue considered, looking for a queue that exists but is not
    // currently mapped to a HW slot.
    for (int offset = 0; offset < MAX_ACTIVE_QUEUES; offset++) {
        uint32_t al_id = (nextALId + offset) % MAX_ACTIVE_QUEUES;
        if (activeList.find(al_id) == activeList.end())
            continue; // no such queue
        if (regdListMap.find(al_id) != regdListMap.end())
            continue; // already mapped; keep looking
        DPRINTF(HSAPacketProcessor,
                "Next Active ALQ %d (current %d), max ALQ %d\n",
                al_id, nextALId, MAX_ACTIVE_QUEUES);
        nextALId = al_id;
        return true;
    }
    return false;
}
bool
HWScheduler::findNextIdleRLQ()
{
for (int regdQId = 0; regdQId < hsaPP->numHWQueues; regdQId++) {
uint32_t rl_idx = (nextRLId + regdQId) % hsaPP->numHWQueues;
if (isRLQIdle(rl_idx)) {
nextRLId = rl_idx;
return true;
}
}
return false;
}
// This function could be moved to packet processor
bool
HWScheduler::isRLQIdle(uint32_t rl_idx)
{
    DPRINTF(HSAPacketProcessor,
            "@ %s, analyzing hw queue %d\n", __FUNCTION__, rl_idx);
    HSAQueueDescriptor* qDesc = hsaPP->getRegdListEntry(rl_idx)->qCntxt.qDesc;
    AQLRingBuffer* aql_buf = hsaPP->getRegdListEntry(rl_idx)->qCntxt.aqlBuf;

    // A slot is idle only if (a) no DMA to it is in flight and (b) all
    // fetched packets have finished dispatch. Packet completion happens
    // only after kernel completion, so a queue must stay mapped until
    // all of its outstanding kernels have finished.
    return !qDesc->dmaInProgress &&
           aql_buf->rdIdx() == aql_buf->dispIdx();
}
void
HWScheduler::write(Addr db_addr, uint32_t doorbell_reg)
{
    // Doorbell write: look the doorbell up once and reuse the iterator
    // (the previous code performed three separate map lookups for the
    // same key).
    auto dbmap_iter = dbMap.find(db_addr);
    if (dbmap_iter == dbMap.end()) {
        panic("Writing to a non-existing queue (db_offset %x)", db_addr);
    }
    uint32_t al_idx = dbmap_iter->second;

    // The doorbell value is the queue's new write (producer) index.
    activeList[al_idx].qDesc->writeIndex = doorbell_reg;

    // If this queue is mapped, then start DMA to fetch the
    // AQL packet
    auto regd_iter = regdListMap.find(al_idx);
    if (regd_iter != regdListMap.end()) {
        hsaPP->getCommandsFromHost(0, regd_iter->second);
    }
}
void
HWScheduler::unregisterQueue(uint64_t queue_id)
{
    // Inverse of registerNewQueue(): find the queue's doorbell, free
    // its context, then release its HW slot (if it holds one) and try
    // to map a waiting queue into the freed slot.
    Addr db_offset = (Addr)(VOID_PTR_ADD32(0, queue_id));
    auto dbmap_iter = dbMap.find(db_offset);
    if (dbmap_iter == dbMap.end()) {
        panic("Destroying a non-existing queue (db_offset %x)",
              db_offset);
    }
    uint32_t al_idx = dbmap_iter->second;
    if (!activeList[al_idx].qDesc->isEmpty()) {
        // The HSA runtime specification says that deleting a queue
        // before it is fully processed can lead to undefined behavior
        // and it is the application's responsibility to avoid this
        // situation. Even a completion signal is not a sufficient
        // indication of a fully processed queue; for example, the
        // completion signal may be asserted while a read pointer
        // update is in progress.
        warn("Destroying a non-empty queue");
    }
    delete activeList[al_idx].qDesc;
    delete activeList[al_idx].aqlBuf;
    activeList.erase(al_idx);

    // Unmap doorbell from doorbell map (erase via the iterator we
    // already hold).
    dbMap.erase(dbmap_iter);

    auto regd_iter = regdListMap.find(al_idx);
    if (regd_iter != regdListMap.end()) {
        uint32_t rl_idx = regd_iter->second;
        hsaPP->getRegdListEntry(rl_idx)->qCntxt.aqlBuf = NULL;
        hsaPP->getRegdListEntry(rl_idx)->qCntxt.qDesc = NULL;
        // Discard any outstanding dependency-signal reads and reset the
        // signal values; we do not wait for those reads to complete.
        hsaPP->getRegdListEntry(rl_idx)->depSignalRdState.discardRead = true;
        hsaPP->getRegdListEntry(rl_idx)->depSignalRdState.resetSigVals();
        assert(!hsaPP->getRegdListEntry(rl_idx)->aqlProcessEvent.scheduled());
        regdListMap.erase(regd_iter);
        // A registered queue is released, let us try to map
        // a queue to that slot
        contextSwitchQ();
    }
    schedWakeup();
}

106
src/dev/hsa/hw_scheduler.hh Normal file
View File

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
*/
#ifndef __DEV_HSA_HW_SCHEDULER_HH__
#define __DEV_HSA_HW_SCHEDULER_HH__

#include "dev/hsa/hsa_packet_processor.hh"

// We allocate one PIO page for doorbells and each
// address is 8 bytes
#define MAX_ACTIVE_QUEUES (PAGE_SIZE/8)

// Maps HSA software queues (the "active list") onto the limited number
// of HW queue slots owned by the HSA packet processor (the "registered
// list"), context switching queues between the lists round-robin.
class HWScheduler
{
  public:
    HWScheduler(HSAPacketProcessor* hsa_pp, Tick wakeup_delay)
        : hsaPP(hsa_pp), nextALId(0), nextRLId(0),
          wakeupDelay(wakeup_delay), schedWakeupEvent(this)
    {}
    // Doorbell write: record the new write index and, if the queue is
    // currently HW-mapped, start fetching its AQL packets.
    void write(Addr db_addr, uint32_t doorbell_reg);
    // Create a queue context, add it to the active list, and map it to
    // a HW slot immediately if one is free.
    void registerNewQueue(uint64_t hostReadIndexPointer,
                          uint64_t basePointer,
                          uint64_t queue_id,
                          uint32_t size);
    // Destroy a queue: free its context, release its HW slot (if any),
    // and try to map a waiting queue into the freed slot.
    void unregisterQueue(uint64_t queue_id);
    // Scheduler tick: swap an idle mapped queue for a waiting active
    // queue, then re-arm the wakeup event if needed.
    void wakeup();
    void schedWakeup();

    // Event that invokes wakeup() after wakeupDelay ticks.
    class SchedulerWakeupEvent : public Event
    {
      private:
        HWScheduler *hwSchdlr;
      public:
        SchedulerWakeupEvent(HWScheduler *hw_schdlr) : hwSchdlr(hw_schdlr) {}
        virtual void process();
        virtual const char *description() const;
    };
    // True if HW slot rl_idx has no in-flight DMA and no outstanding
    // kernel dispatches.
    bool isRLQIdle(uint32_t rl_idx);
    bool findNextActiveALQ();
    bool findNextIdleRLQ();
    bool unmapQFromRQ();
    bool contextSwitchQ();
    bool findEmptyHWQ();
    bool mapQIfSlotAvlbl(uint32_t al_idx, AQLRingBuffer* aql_buf,
                         HSAQueueDescriptor* q_desc);
    void addQCntxt(uint32_t al_idx, AQLRingBuffer* aql_buf,
                   HSAQueueDescriptor* q_desc);
    void removeQCntxt();
    void scheduleAndWakeupMappedQ();
    void updateRRVars(uint32_t al_idx, uint32_t rl_idx);

  private:
    // Active list keeps track of all queues created
    std::map<uint32_t, QCntxt> activeList;
    //TODO: Modify this to support multi-process in the future.
    // doorbell map, maps doorbells to active list entry
    std::map<Addr, uint32_t> dbMap;
    // regdListMap keeps track of the mapping of queues to
    // registered list. regdListMap is indexed with active
    // list index (which is same as queue ID)
    std::map<uint32_t, uint32_t> regdListMap;
    HSAPacketProcessor* hsaPP;
    // Scheduling information.
    // For now, this is simple round robin but
    // this will be changed to a sophisticated logic
    // in the future. So, in the future, we will
    // move these variables into a scheduler class
    uint32_t nextALId;
    uint32_t nextRLId;
    const Tick wakeupDelay;
    SchedulerWakeupEvent schedWakeupEvent;
};

#endif // __DEV_HSA_HW_SCHEDULER_HH__

536
src/dev/hsa/kfd_ioctl.h Normal file
View File

@@ -0,0 +1,536 @@
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef KFD_IOCTL_H_INCLUDED
#define KFD_IOCTL_H_INCLUDED

/* NOTE(review): imported Linux amdkfd ioctl ABI header. Field layout
 * must not change; the "to KFD" / "from KFD" annotations give the
 * direction each field travels across the ioctl boundary.
 */

#include <linux/types.h>
#include <linux/ioctl.h>

#define KFD_IOCTL_MAJOR_VERSION 1
#define KFD_IOCTL_MINOR_VERSION 2

struct kfd_ioctl_get_version_args {
    uint32_t major_version; /* from KFD */
    uint32_t minor_version; /* from KFD */
};

/* For kfd_ioctl_create_queue_args.queue_type. */
#define KFD_IOC_QUEUE_TYPE_COMPUTE 0
#define KFD_IOC_QUEUE_TYPE_SDMA 1
#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 2

#define KFD_MAX_QUEUE_PERCENTAGE 100
#define KFD_MAX_QUEUE_PRIORITY 15

struct kfd_ioctl_create_queue_args {
    uint64_t ring_base_address; /* to KFD */
    uint64_t write_pointer_address; /* from KFD */
    uint64_t read_pointer_address; /* from KFD */
    uint64_t doorbell_offset; /* from KFD */
    uint32_t ring_size; /* to KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t queue_type; /* to KFD */
    uint32_t queue_percentage; /* to KFD */
    uint32_t queue_priority; /* to KFD */
    uint32_t queue_id; /* from KFD */
    uint64_t eop_buffer_address; /* to KFD */
    uint64_t eop_buffer_size; /* to KFD */
    uint64_t ctx_save_restore_address; /* to KFD */
    uint32_t ctx_save_restore_size; /* to KFD */
    uint32_t ctl_stack_size; /* to KFD */
};

struct kfd_ioctl_destroy_queue_args {
    uint32_t queue_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_update_queue_args {
    uint64_t ring_base_address; /* to KFD */
    uint32_t queue_id; /* to KFD */
    uint32_t ring_size; /* to KFD */
    uint32_t queue_percentage; /* to KFD */
    uint32_t queue_priority; /* to KFD */
};

struct kfd_ioctl_set_cu_mask_args {
    uint32_t queue_id; /* to KFD */
    uint32_t num_cu_mask; /* to KFD */
    uint64_t cu_mask_ptr; /* to KFD */
};

/* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
#define KFD_IOC_CACHE_POLICY_COHERENT 0
#define KFD_IOC_CACHE_POLICY_NONCOHERENT 1

struct kfd_ioctl_set_memory_policy_args {
    uint64_t alternate_aperture_base; /* to KFD */
    uint64_t alternate_aperture_size; /* to KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t default_policy; /* to KFD */
    uint32_t alternate_policy; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_set_trap_handler_args {
    uint64_t tba_addr;
    uint64_t tma_addr;
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

/*
 * All counters are monotonic. They are used for profiling of compute jobs.
 * The profiling is done by userspace.
 *
 * In case of GPU reset, the counter should not be affected.
 */
struct kfd_ioctl_get_clock_counters_args {
    uint64_t gpu_clock_counter; /* from KFD */
    uint64_t cpu_clock_counter; /* from KFD */
    uint64_t system_clock_counter; /* from KFD */
    uint64_t system_clock_freq; /* from KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

#define NUM_OF_SUPPORTED_GPUS 7

struct kfd_process_device_apertures {
    uint64_t lds_base; /* from KFD */
    uint64_t lds_limit; /* from KFD */
    uint64_t scratch_base; /* from KFD */
    uint64_t scratch_limit; /* from KFD */
    uint64_t gpuvm_base; /* from KFD */
    uint64_t gpuvm_limit; /* from KFD */
    uint32_t gpu_id; /* from KFD */
    uint32_t pad;
};

/* This IOCTL and the limited NUM_OF_SUPPORTED_GPUS is deprecated. Use
 * kfd_ioctl_get_process_apertures_new instead, which supports
 * arbitrary numbers of GPUs.
 */
struct kfd_ioctl_get_process_apertures_args {
    struct kfd_process_device_apertures
        process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */

    /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */
    uint32_t num_of_nodes;
    uint32_t pad;
};

struct kfd_ioctl_get_process_apertures_new_args {
    /* User allocated. Pointer to struct kfd_process_device_apertures
     * filled in by Kernel
     */
    uint64_t kfd_process_device_apertures_ptr;
    /* to KFD - indicates amount of memory present in
     * kfd_process_device_apertures_ptr
     * from KFD - Number of entries filled by KFD.
     */
    uint32_t num_of_nodes;
    uint32_t pad;
};
/* Debugger watchpoint / wave-control buffer limits. */
#define MAX_ALLOWED_NUM_POINTS 100
#define MAX_ALLOWED_AW_BUFF_SIZE 4096
#define MAX_ALLOWED_WAC_BUFF_SIZE 128

struct kfd_ioctl_dbg_register_args {
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_dbg_unregister_args {
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_dbg_address_watch_args {
    uint64_t content_ptr; /* a pointer to the actual content */
    uint32_t gpu_id; /* to KFD */
    uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */
};

struct kfd_ioctl_dbg_wave_control_args {
    uint64_t content_ptr; /* a pointer to the actual content */
    uint32_t gpu_id; /* to KFD */
    uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */
};

/* Matching HSA_EVENTTYPE */
#define KFD_IOC_EVENT_SIGNAL 0
#define KFD_IOC_EVENT_NODECHANGE 1
#define KFD_IOC_EVENT_DEVICESTATECHANGE 2
#define KFD_IOC_EVENT_HW_EXCEPTION 3
#define KFD_IOC_EVENT_SYSTEM_EVENT 4
#define KFD_IOC_EVENT_DEBUG_EVENT 5
#define KFD_IOC_EVENT_PROFILE_EVENT 6
#define KFD_IOC_EVENT_QUEUE_EVENT 7
#define KFD_IOC_EVENT_MEMORY 8

/* Outcomes of kfd_ioctl_wait_events_args.wait_result. */
#define KFD_IOC_WAIT_RESULT_COMPLETE 0
#define KFD_IOC_WAIT_RESULT_TIMEOUT 1
#define KFD_IOC_WAIT_RESULT_FAIL 2

/*
 * The added 512 is because, currently, 8*(4096/256) signal events are
 * reserved for debugger events, and we want to provide at least 4K signal
 * events for EOP usage.
 * We add 512 to make the allocated size (KFD_SIGNAL_EVENT_LIMIT * 8) be
 * page aligned.
 */
#define KFD_SIGNAL_EVENT_LIMIT (4096 + 512)

struct kfd_ioctl_create_event_args {
    uint64_t event_page_offset; /* from KFD */
    uint32_t event_trigger_data; /* from KFD - signal events only */
    uint32_t event_type; /* to KFD */
    uint32_t auto_reset; /* to KFD */
    uint32_t node_id; /* to KFD - only valid for certain
            event types */
    uint32_t event_id; /* from KFD */
    uint32_t event_slot_index; /* from KFD */
};

struct kfd_ioctl_destroy_event_args {
    uint32_t event_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_set_event_args {
    uint32_t event_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_reset_event_args {
    uint32_t event_id; /* to KFD */
    uint32_t pad;
};

struct kfd_memory_exception_failure {
    uint32_t NotPresent; /* Page not present or supervisor privilege */
    uint32_t ReadOnly; /* Write access to a read-only page */
    uint32_t NoExecute; /* Execute access to a page marked NX */
    uint32_t imprecise; /* Can't determine the exact fault address */
};

/* memory exception data */
struct kfd_hsa_memory_exception_data {
    struct kfd_memory_exception_failure failure;
    uint64_t va;
    uint32_t gpu_id;
    uint32_t pad;
};

/* Event data */
struct kfd_event_data {
    union {
        struct kfd_hsa_memory_exception_data memory_exception_data;
    }; /* From KFD */
    uint64_t kfd_event_data_ext; /* pointer to an extension structure
           for future exception types */
    uint32_t event_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_wait_events_args {
    uint64_t events_ptr; /* pointed to struct
           kfd_event_data array, to KFD */
    uint32_t num_events; /* to KFD */
    uint32_t wait_for_all; /* to KFD */
    uint32_t timeout; /* to KFD */
    uint32_t wait_result; /* from KFD */
};

struct kfd_ioctl_alloc_memory_of_scratch_args {
    uint64_t va_addr; /* to KFD */
    uint64_t size; /* to KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

/* Allocation flags: memory types */
#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0)
#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1)
#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2)
#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3)
/* Allocation flags: attributes/access options */
#define KFD_IOC_ALLOC_MEM_FLAGS_NONPAGED (1 << 31)
#define KFD_IOC_ALLOC_MEM_FLAGS_READONLY (1 << 30)
#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTE_ACCESS (1 << 26)
#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 25)

struct kfd_ioctl_alloc_memory_of_gpu_args {
    uint64_t va_addr; /* to KFD */
    uint64_t size; /* to KFD */
    uint64_t handle; /* from KFD */
    uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */
    uint32_t gpu_id; /* to KFD */
    uint32_t flags;
};

struct kfd_ioctl_free_memory_of_gpu_args {
    uint64_t handle; /* to KFD */
};

struct kfd_ioctl_map_memory_to_gpu_args {
    uint64_t handle; /* to KFD */
    uint64_t device_ids_array_ptr; /* to KFD */
    uint32_t device_ids_array_size; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_unmap_memory_from_gpu_args {
    uint64_t handle; /* to KFD */
    uint64_t device_ids_array_ptr; /* to KFD */
    uint32_t device_ids_array_size; /* to KFD */
    uint32_t pad;
};
/* TODO: remove this. It's only implemented for Kaveri and was never
 * upstreamed. There are no open-source users of this interface. It
 * has been superseded by the pair of get_dmabuf_info and
 * import_dmabuf, which is implemented for all supported GPUs.
 */
struct kfd_ioctl_open_graphic_handle_args {
    uint64_t va_addr; /* to KFD */
    uint64_t handle; /* from KFD */
    uint32_t gpu_id; /* to KFD */
    int graphic_device_fd; /* to KFD */
    uint32_t graphic_handle; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_set_process_dgpu_aperture_args {
    uint64_t dgpu_base;
    uint64_t dgpu_limit;
    uint32_t gpu_id;
    uint32_t pad;
};

struct kfd_ioctl_get_dmabuf_info_args {
    uint64_t size; /* from KFD */
    uint64_t metadata_ptr; /* to KFD */
    uint32_t metadata_size; /* to KFD (space allocated by user)
           * from KFD (actual metadata size) */
    uint32_t gpu_id; /* from KFD */
    uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */
    uint32_t dmabuf_fd; /* to KFD */
};

struct kfd_ioctl_import_dmabuf_args {
    uint64_t va_addr; /* to KFD */
    uint64_t handle; /* from KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t dmabuf_fd; /* to KFD */
};

struct kfd_ioctl_ipc_export_handle_args {
    uint64_t handle; /* to KFD */
    uint32_t share_handle[4]; /* from KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_ipc_import_handle_args {
    uint64_t handle; /* from KFD */
    uint64_t va_addr; /* to KFD */
    uint64_t mmap_offset; /* from KFD */
    uint32_t share_handle[4]; /* to KFD */
    uint32_t gpu_id; /* to KFD */
    uint32_t pad;
};

struct kfd_ioctl_get_tile_config_args {
    /* to KFD: pointer to tile array */
    uint64_t tile_config_ptr;
    /* to KFD: pointer to macro tile array */
    uint64_t macro_tile_config_ptr;
    /* to KFD: array size allocated by user mode
     * from KFD: array size filled by kernel
     */
    uint32_t num_tile_configs;
    /* to KFD: array size allocated by user mode
     * from KFD: array size filled by kernel
     */
    uint32_t num_macro_tile_configs;

    uint32_t gpu_id; /* to KFD */
    uint32_t gb_addr_config; /* from KFD */
    uint32_t num_banks; /* from KFD */
    uint32_t num_ranks; /* from KFD */
    /* struct size can be extended later if needed
     * without breaking ABI compatibility
     */
};

struct kfd_memory_range {
    uint64_t va_addr;
    uint64_t size;
};

/* flags definitions
 * BIT0: 0: read operation, 1: write operation.
 * This also identifies if the src or dst array belongs to remote process
 */
#define KFD_CROSS_MEMORY_RW_BIT (1 << 0)
#define KFD_SET_CROSS_MEMORY_READ(flags) (flags &= ~KFD_CROSS_MEMORY_RW_BIT)
#define KFD_SET_CROSS_MEMORY_WRITE(flags) (flags |= KFD_CROSS_MEMORY_RW_BIT)
#define KFD_IS_CROSS_MEMORY_WRITE(flags) (flags & KFD_CROSS_MEMORY_RW_BIT)

struct kfd_ioctl_cross_memory_copy_args {
    /* to KFD: Process ID of the remote process */
    uint32_t pid;
    /* to KFD: See above definition */
    uint32_t flags;
    /* to KFD: Source GPU VM range */
    uint64_t src_mem_range_array;
    /* to KFD: Size of above array */
    uint64_t src_mem_array_size;
    /* to KFD: Destination GPU VM range */
    uint64_t dst_mem_range_array;
    /* to KFD: Size of above array */
    uint64_t dst_mem_array_size;
    /* from KFD: Total amount of bytes copied */
    uint64_t bytes_copied;
};
/* ioctl command numbers: 'K' magic plus a sequential command index;
 * the _IO* macros encode direction and argument-struct size.
 */
#define AMDKFD_IOCTL_BASE 'K'
#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type)

#define AMDKFD_IOC_GET_VERSION \
    AMDKFD_IOR(0x01, struct kfd_ioctl_get_version_args)

#define AMDKFD_IOC_CREATE_QUEUE \
    AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args)

#define AMDKFD_IOC_DESTROY_QUEUE \
    AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args)

#define AMDKFD_IOC_SET_MEMORY_POLICY \
    AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args)

#define AMDKFD_IOC_GET_CLOCK_COUNTERS \
    AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args)

#define AMDKFD_IOC_GET_PROCESS_APERTURES \
    AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args)

#define AMDKFD_IOC_UPDATE_QUEUE \
    AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args)

#define AMDKFD_IOC_CREATE_EVENT \
    AMDKFD_IOWR(0x08, struct kfd_ioctl_create_event_args)

#define AMDKFD_IOC_DESTROY_EVENT \
    AMDKFD_IOW(0x09, struct kfd_ioctl_destroy_event_args)

#define AMDKFD_IOC_SET_EVENT \
    AMDKFD_IOW(0x0A, struct kfd_ioctl_set_event_args)

#define AMDKFD_IOC_RESET_EVENT \
    AMDKFD_IOW(0x0B, struct kfd_ioctl_reset_event_args)

#define AMDKFD_IOC_WAIT_EVENTS \
    AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args)

#define AMDKFD_IOC_DBG_REGISTER \
    AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args)

#define AMDKFD_IOC_DBG_UNREGISTER \
    AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args)

#define AMDKFD_IOC_DBG_ADDRESS_WATCH \
    AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args)

#define AMDKFD_IOC_DBG_WAVE_CONTROL \
    AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args)

#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU \
    AMDKFD_IOWR(0x11, struct kfd_ioctl_alloc_memory_of_gpu_args)

#define AMDKFD_IOC_FREE_MEMORY_OF_GPU \
    AMDKFD_IOWR(0x12, struct kfd_ioctl_free_memory_of_gpu_args)

#define AMDKFD_IOC_MAP_MEMORY_TO_GPU \
    AMDKFD_IOWR(0x13, struct kfd_ioctl_map_memory_to_gpu_args)

#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU \
    AMDKFD_IOWR(0x14, struct kfd_ioctl_unmap_memory_from_gpu_args)

#define AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH \
    AMDKFD_IOWR(0x15, struct kfd_ioctl_alloc_memory_of_scratch_args)

#define AMDKFD_IOC_SET_CU_MASK \
    AMDKFD_IOW(0x16, struct kfd_ioctl_set_cu_mask_args)

#define AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE \
    AMDKFD_IOW(0x17, \
    struct kfd_ioctl_set_process_dgpu_aperture_args)

#define AMDKFD_IOC_SET_TRAP_HANDLER \
    AMDKFD_IOW(0x18, struct kfd_ioctl_set_trap_handler_args)

#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW \
    AMDKFD_IOWR(0x19, struct kfd_ioctl_get_process_apertures_new_args)

#define AMDKFD_IOC_GET_DMABUF_INFO \
    AMDKFD_IOWR(0x1A, struct kfd_ioctl_get_dmabuf_info_args)

#define AMDKFD_IOC_IMPORT_DMABUF \
    AMDKFD_IOWR(0x1B, struct kfd_ioctl_import_dmabuf_args)

#define AMDKFD_IOC_GET_TILE_CONFIG \
    AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_tile_config_args)

#define AMDKFD_IOC_IPC_IMPORT_HANDLE \
    AMDKFD_IOWR(0x1D, struct kfd_ioctl_ipc_import_handle_args)

#define AMDKFD_IOC_IPC_EXPORT_HANDLE \
    AMDKFD_IOWR(0x1E, struct kfd_ioctl_ipc_export_handle_args)

#define AMDKFD_IOC_CROSS_MEMORY_COPY \
    AMDKFD_IOWR(0x1F, struct kfd_ioctl_cross_memory_copy_args)

/* TODO: remove this */
#define AMDKFD_IOC_OPEN_GRAPHIC_HANDLE \
    AMDKFD_IOWR(0x20, struct kfd_ioctl_open_graphic_handle_args)

/* Valid command-number range for the amdkfd ioctl interface. */
#define AMDKFD_COMMAND_START 0x01
#define AMDKFD_COMMAND_END 0x21

#endif