gpu-compute, dev-hsa: Remove HSADriver, HSADevice

HSADriver and HSADevice were used primarily as the base classes of GPUComputeDriver and GPUCommandProcessor, respectively. This change merges each base class into its derived class, which simplifies the inheritance hierarchy and removes the need for any casting between them. Change-Id: I670eb9b49a16c8aba17e13fd1d1287d0621c9f48 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42219 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
2019-06-19 16:04:01 -04:00
parent d019912efa
commit ec6b325382
13 changed files with 386 additions and 636 deletions
--- a/src/dev/hsa/HSADevice.py
+++ b/src/dev/hsa/HSADevice.py
@@ -34,12 +34,6 @@ from m5.params import *
 from m5.proxy import *
 from m5.objects.Device import DmaDevice

-class HSADevice(DmaDevice):
-    type = 'HSADevice'
-    abstract = True
-    cxx_header = "dev/hsa/hsa_device.hh"
-    hsapp = Param.HSAPacketProcessor("PP attached to this device")
-
 class HSAPacketProcessor(DmaDevice):
    type = 'HSAPacketProcessor'
    cxx_header = 'dev/hsa/hsa_packet_processor.hh'
--- a/src/dev/hsa/SConscript
+++ b/src/dev/hsa/SConscript
@@ -37,12 +37,8 @@ if not env['BUILD_GPU']:
    Return()

 SimObject('HSADevice.py')
-SimObject('HSADriver.py')

-Source('hsa_device.cc')
-Source('hsa_driver.cc')
 Source('hsa_packet_processor.cc')
 Source('hw_scheduler.cc')

-DebugFlag('HSADriver')
 DebugFlag('HSAPacketProcessor')
--- a/src/dev/hsa/hsa_device.cc
+++ b/src/dev/hsa/hsa_device.cc
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "dev/hsa/hsa_device.hh"
-
-#include "base/chunk_generator.hh"
-#include "sim/process.hh"
-
-HSAPacketProcessor&
-HSADevice::hsaPacketProc()
-{
-    return *hsaPP;
-}
-
-void
-HSADevice::dmaReadVirt(Addr host_addr, unsigned size,
-                             DmaCallback *cb, void *data, Tick delay)
-{
-    dmaVirt(&DmaDevice::dmaRead, host_addr, size, cb, data, delay);
-}
-
-void
-HSADevice::dmaWriteVirt(Addr host_addr, unsigned size,
-                              DmaCallback *cb, void *data, Tick delay)
-{
-    dmaVirt(&DmaDevice::dmaWrite, host_addr, size, cb, data, delay);
-}
-
-void
-HSADevice::dmaVirt(DmaFnPtr dmaFn, Addr addr, unsigned size,
-                           DmaCallback *cb, void *data, Tick delay)
-{
-    if (size == 0) {
-        if (cb)
-            schedule(cb->getChunkEvent(), curTick() + delay);
-        return;
-    }
-
-    // move the buffer data pointer with the chunks
-    uint8_t *loc_data = (uint8_t*)data;
-
-    for (ChunkGenerator gen(addr, size, PAGE_SIZE); !gen.done(); gen.next()) {
-        Addr phys;
-
-        // translate pages into their corresponding frames
-        translateOrDie(gen.addr(), phys);
-
-        Event *event = cb ? cb->getChunkEvent() : nullptr;
-
-        (this->*dmaFn)(phys, gen.size(), event, loc_data, delay);
-
-        loc_data += gen.size();
-    }
-}
-
-/**
- * HSADevices will perform DMA operations on VAs, and because
- * page faults are not currently supported for HSADevices, we
- * must be able to find the pages mapped for the process.
- */
-void
-HSADevice::translateOrDie(Addr vaddr, Addr &paddr)
-{
-    /**
-     * Grab the process and try to translate the virtual address with it;
-     * with new extensions, it will likely be wrong to just arbitrarily
-     * grab context zero.
-     */
-    auto process = sys->threads[0]->getProcessPtr();
-
-    if (!process->pTable->translate(vaddr, paddr)) {
-        fatal("failed translation: vaddr 0x%x\n", vaddr);
-    }
-}
--- a/src/dev/hsa/hsa_device.hh
+++ b/src/dev/hsa/hsa_device.hh
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __DEV_HSA_HSA_DEVICE_HH__
-#define __DEV_HSA_HSA_DEVICE_HH__
-
-#include <cassert>
-#include <cstdint>
-
-#include "base/logging.hh"
-#include "base/types.hh"
-#include "dev/dma_device.hh"
-#include "dev/hsa/hsa_packet_processor.hh"
-#include "params/HSADevice.hh"
-
-class HSADriver;
-
-class HSADevice : public DmaDevice
-{
-  public:
-    typedef HSADeviceParams Params;
-    typedef std::function<void(const uint64_t &)> HsaSignalCallbackFunction;
-
-    HSADevice(const Params &p) : DmaDevice(p), hsaPP(p.hsapp)
-    {
-        assert(hsaPP);
-        hsaPP->setDevice(this);
-    };
-
-    HSAPacketProcessor& hsaPacketProc();
-
-    /**
-     * submitAgentDispatchPkt() accepts AQL dispatch packets from the HSA
-     * packet processor. Not all devices will accept AQL dispatch packets,
-     * so the default implementation will fatal.
-     * Implementation added to steal kernel signals.
-     */
-    virtual void
-    submitAgentDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
-    {
-        fatal("%s does not accept dispatch packets\n", name());
-    }
-
-    /**
-     * submitDispatchPkt() accepts AQL dispatch packets from the HSA packet
-     * processor. Not all devices will accept AQL dispatch packets, so the
-     * default implementation will fatal.
-     */
-    virtual void
-    submitDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
-    {
-        fatal("%s does not accept dispatch packets\n", name());
-    }
-
-    /**
-     * submitVendorPkt() accepts vendor specific packets from the HSA
-     * packet processor. This method should be overriden in any HSADevice
-     * that acceptes vendor specific packets, and should interpret the
-     * packet according to the vendor's specifications. Not all HSA
-     * devices will accept vendor specific packets, so the default
-     * implementation will fatal.
-     */
-    virtual void
-    submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
-    {
-        fatal("%s does not accept vendor specific packets\n", name());
-    }
-    virtual void
-    attachDriver(HSADriver *driver)
-    {
-        fatal("%s does not need HSA driver\n", name());
-    }
-    virtual void
-    updateHsaSignal(Addr signal_handle, uint64_t signal_value,
-        HsaSignalCallbackFunction function = [] (const uint64_t &) { })
-    {
-        fatal("%s does not have HSA signal update functionality.\n", name());
-    }
-    virtual uint64_t
-    functionalReadHsaSignal(Addr signal_handle)
-    {
-        fatal("%s does not have HSA signal read functionality.\n", name());
-    }
-    void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb,
-                     void *data, Tick delay = 0);
-    void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *cb,
-                      void *data, Tick delay = 0);
-
-  protected:
-    // Typedefing dmaRead and dmaWrite function pointer
-    typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
-    HSAPacketProcessor *hsaPP;
-    void dmaVirt(DmaFnPtr, Addr host_addr, unsigned size, DmaCallback *cb,
-                 void *data, Tick delay = 0);
-    void translateOrDie(Addr vaddr, Addr &paddr);
-};
-
-#endif // __DEV_HSA_HSA_DEVICE_HH__
--- a/src/dev/hsa/hsa_driver.cc
+++ b/src/dev/hsa/hsa_driver.cc
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "dev/hsa/hsa_driver.hh"
-
-#include "base/trace.hh"
-#include "debug/HSADriver.hh"
-#include "dev/hsa/hsa_device.hh"
-#include "dev/hsa/hsa_packet_processor.hh"
-#include "dev/hsa/kfd_event_defines.h"
-#include "dev/hsa/kfd_ioctl.h"
-#include "params/HSADriver.hh"
-#include "sim/process.hh"
-#include "sim/proxy_ptr.hh"
-#include "sim/syscall_emul_buf.hh"
-
-HSADriver::HSADriver(const HSADriverParams &p)
-    : EmulatedDriver(p), device(p.device), queueId(0)
-{
-}
-
-/**
- * Create an FD entry for the KFD inside of the owning process.
- */
-int
-HSADriver::open(ThreadContext *tc, int mode, int flags)
-{
-    DPRINTF(HSADriver, "Opened %s\n", filename);
-    auto process = tc->getProcessPtr();
-    auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
-    int tgt_fd = process->fds->allocFD(device_fd_entry);
-    return tgt_fd;
-}
-
-/**
- * Currently, mmap() will simply setup a mapping for the associated
- * device's packet processor's doorbells and creates the event page.
- */
-Addr
-HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
-                int tgt_flags, int tgt_fd, off_t offset)
-{
-    auto process = tc->getProcessPtr();
-    auto mem_state = process->memState;
-
-    Addr pg_off = offset >> PAGE_SHIFT;
-    Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
-    DPRINTF(HSADriver, "amdkfd mmap (start: %p, length: 0x%x,"
-            "offset: 0x%x)\n", start, length, offset);
-
-    switch (mmap_type) {
-        case KFD_MMAP_TYPE_DOORBELL:
-            DPRINTF(HSADriver, "amdkfd mmap type DOORBELL offset\n");
-            start = mem_state->extendMmap(length);
-            process->pTable->map(start, device->hsaPacketProc().pioAddr,
-                    length, false);
-            break;
-        case KFD_MMAP_TYPE_EVENTS:
-            DPRINTF(HSADriver, "amdkfd mmap type EVENTS offset\n");
-            panic_if(start != 0,
-                     "Start address should be provided by KFD\n");
-            panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
-                     "Requested length %d, expected length %d; length "
-                     "mismatch\n", length, 8 * KFD_SIGNAL_EVENT_LIMIT);
-            /**
-             * We don't actually access these pages.  We just need to reserve
-             * some VA space.  See commit id 5ce8abce for details on how
-             * events are currently implemented.
-             */
-             if (!eventPage) {
-                eventPage = mem_state->extendMmap(length);
-                start = eventPage;
-             }
-             break;
-        default:
-            warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
-            break;
-    }
-
-    return start;
-}
-
-/**
- * Forward relevant parameters to packet processor; queueID
- * is used to link doorbell. The queueIDs are not re-used
- * in current implementation, and we allocate only one page
- * (4096 bytes) for doorbells, so check if this queue ID can
- * be mapped into that page.
- */
-void
-HSADriver::allocateQueue(ThreadContext *tc, Addr ioc_buf)
-{
-    VPtr<kfd_ioctl_create_queue_args> args(ioc_buf, tc);
-
-    if (queueId >= 0x1000) {
-        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
-    }
-
-    args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
-        KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;
-
-    args->queue_id = queueId++;
-    auto &hsa_pp = device->hsaPacketProc();
-    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
-                              args->ring_base_address, args->queue_id,
-                              args->ring_size);
-}
-
-const char*
-HSADriver::DriverWakeupEvent::description() const
-{
-    return "DriverWakeupEvent";
-}
-
-void
-HSADriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
-{
-    assert(driver);
-    driver->schedule(this, curTick() + wakeup_delay);
-}
-
-void
-HSADriver::signalWakeupEvent(uint32_t event_id)
-{
-    panic_if(event_id >= eventSlotIndex,
-        "Trying wakeup on an event that is not yet created\n");
-    if (ETable[event_id].threadWaiting) {
-        panic_if(!ETable[event_id].tc,
-                 "No thread context to wake up\n");
-        ThreadContext *tc = ETable[event_id].tc;
-        DPRINTF(HSADriver,
-                "Signal event: Waking up CPU %d\n", tc->cpuId());
-        // Wake up this thread
-        tc->activate();
-        // Remove events that can wake up this thread
-        TCEvents[tc].clearEvents();
-    } else {
-       // This may be a race condition between an ioctl call asking to wait on
-       // this event and this signalWakeupEvent. Taking care of this race
-       // condition here by setting the event here. The ioctl call should take
-       // the necessary action when waiting on an already set event.  However,
-       // this may be a genuine instance in which the runtime has decided not
-       // to wait on this event. But since we cannot distinguish this case with
-       // the race condition, we are any way setting the event.
-       ETable[event_id].setEvent = true;
-    }
-}
-
-void
-HSADriver::DriverWakeupEvent::process()
-{
-    DPRINTF(HSADriver,
-            "Timer event: Waking up CPU %d\n", tc->cpuId());
-    // Wake up this thread
-    tc->activate();
-    // Remove events that can wake up this thread
-    driver->TCEvents[tc].clearEvents();
-}
--- a/src/dev/hsa/hsa_driver.hh
+++ b/src/dev/hsa/hsa_driver.hh
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * For use for simulation and test purposes only
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/**
- * @file
- * An HSADriver is an emulated driver that controls an HSA agent,
- * or more simply put, an HSA device. An HSA device is a device
- * that has an associated HSA packet processor.
- *
- * In the base HSADriver class the open() method is implemented, as
- * well as the mmap() call, which maps the HSA packet processor's
- * doorbells. Drivers for other HSA devices should derive from this
- * class and implement the necessary methods; typically this is an
- * ioctl() method that satisfies the ioctl requests needed to manage
- * and control the device.
- */
-
-#ifndef __DEV_HSA_HSA_DRIVER_HH__
-#define __DEV_HSA_HSA_DRIVER_HH__
-
-#include <cassert>
-#include <cstdint>
-#include <set>
-#include <unordered_map>
-
-#include "base/logging.hh"
-#include "base/types.hh"
-#include "cpu/thread_context.hh"
-#include "sim/emul_driver.hh"
-
-struct HSADriverParams;
-class HSADevice;
-
-class HSADriver : public EmulatedDriver
-{
-  public:
-    HSADriver(const HSADriverParams &p);
-
-    int open(ThreadContext *tc, int mode, int flags);
-    Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
-              int prot, int tgt_flags, int tgt_fd, off_t offset);
-    virtual void signalWakeupEvent(uint32_t event_id);
-    class DriverWakeupEvent : public Event
-    {
-      public:
-        DriverWakeupEvent(HSADriver *hsa_driver, ThreadContext *thrd_cntxt)
-            : driver(hsa_driver), tc(thrd_cntxt)  {}
-        void process() override;
-        const char *description() const override;
-        void scheduleWakeup(Tick wakeup_delay);
-      private:
-        HSADriver *driver;
-        ThreadContext *tc;
-    };
-    class EventTableEntry
-    {
-      public:
-        EventTableEntry() :
-            mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
-        {}
-        // Mail box pointer for this address. Current implementation does not
-        // use this mailBoxPtr to notify events but directly calls
-        // signalWakeupEvent from dispatcher (GPU) to notify event. So,
-        // currently this mailBoxPtr is not used. But a future implementation
-        // may communicate to the driver using mailBoxPtr.
-        Addr mailBoxPtr;
-        // Thread context waiting on this event. We do not support multiple
-        // threads waiting on an event currently.
-        ThreadContext *tc;
-        // threadWaiting = true, if some thread context is waiting on this
-        // event.  A thread context waiting on this event is put to sleep.
-        bool threadWaiting;
-        // setEvent = true, if this event is triggered but when this event
-        // triggered, no thread context was waiting on it. In the future, some
-        // thread context will try to wait on this event but since event has
-        // already happened, we will not allow that thread context to go to
-        // sleep. The above mentioned scenario can happen when the waiting
-        // thread and wakeup thread race on this event and the wakeup thread
-        // beat the waiting thread at the driver.
-        bool setEvent;
-    };
-    typedef class EventTableEntry ETEntry;
-
-  protected:
-    Addr eventPage;
-    uint32_t eventSlotIndex;
-    // Event table that keeps track of events. It is indexed with event ID.
-    std::unordered_map<uint32_t, ETEntry> ETable;
-
-    // TCEvents map keeps track of the events that can wakeup this thread. When
-    // multiple events can wake up this thread, this data structure helps to
-    // reset all events when one of those events wake up this thread. The
-    // signal events that can wake up this thread are stored in signalEvents
-    // whereas the timer wakeup event is stored in timerEvent.
-    class EventList
-    {
-      public:
-        EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
-        EventList(HSADriver *hsa_driver, ThreadContext *thrd_cntxt)
-            : driver(hsa_driver), timerEvent(hsa_driver, thrd_cntxt)
-        { }
-        void clearEvents() {
-            assert(driver);
-            for (auto event : signalEvents) {
-               assert(event < driver->eventSlotIndex);
-               panic_if(driver->ETable[event].tc->status() == \
-                            ThreadContext::Suspended,
-                        "Thread should not be suspended\n");
-               driver->ETable[event].tc = nullptr;
-               driver->ETable[event].threadWaiting = false;
-            }
-            signalEvents.clear();
-            if (timerEvent.scheduled()) {
-                driver->deschedule(timerEvent);
-            }
-        }
-        HSADriver *driver;
-        DriverWakeupEvent timerEvent;
-        // The set of events that can wake up the same thread.
-        std::set<uint32_t> signalEvents;
-    };
-    std::unordered_map<ThreadContext *, EventList> TCEvents;
-
-    /**
-     * HSA agent (device) that is controled by this driver.
-     */
-    HSADevice *device;
-    uint32_t queueId;
-
-    void allocateQueue(ThreadContext *tc, Addr ioc_buf);
-};
-
-#endif // __DEV_HSA_HSA_DRIVER_HH__
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -42,9 +42,9 @@
 #include "base/trace.hh"
 #include "debug/HSAPacketProcessor.hh"
 #include "dev/dma_device.hh"
-#include "dev/hsa/hsa_device.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hw_scheduler.hh"
+#include "gpu-compute/gpu_command_processor.hh"
 #include "mem/packet_access.hh"
 #include "mem/page_table.hh"
 #include "sim/process.hh"
@@ -330,14 +330,24 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
        DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
-        hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
+        gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = UNBLOCKED;
    } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
        DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
-        hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
+        gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = UNBLOCKED;
+        /*
+          If this packet is using the "barrier bit" to enforce ordering with
+          subsequent kernels, set the bit for this queue now, after
+          dispatching.
+        */
+        if (IS_BARRIER(disp_pkt)) {
+            DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
+                    " list ID = %d\n", __FUNCTION__, rl_idx);
+            regdQList[rl_idx]->setBarrierBit(true);
+        }
    } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
        DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
@@ -404,14 +414,14 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
                // I'm going to cheat here and read out
                // the value from main memory using functional
                // access, and then just DMA the decremented value.
-                uint64_t signal_value = hsa_device->functionalReadHsaSignal(\
+                uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
                                            bar_and_pkt->completion_signal);

                DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
                       " completion signal! Addr: %x\n",
                       bar_and_pkt->completion_signal);

-                hsa_device->updateHsaSignal(bar_and_pkt->completion_signal,
+                gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
                                            signal_value - 1);
            }
        }
@@ -428,7 +438,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
        DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
-        hsa_device->submitAgentDispatchPkt(
+        gpu_device->submitAgentDispatchPkt(
                (void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = UNBLOCKED;
        sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
@@ -633,9 +643,9 @@ AQLRingBuffer::freeEntry(void *pkt)
 }

 void
-HSAPacketProcessor::setDevice(HSADevice *dev)
+HSAPacketProcessor::setDevice(GPUCommandProcessor *dev)
 {
-    this->hsa_device = dev;
+    this->gpu_device = dev;
 }

 int
@@ -670,15 +680,13 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
        DPRINTF(HSAPacketProcessor,
                "Unset barrier bit for active list ID %d\n", rl_idx);
        regdQList[rl_idx]->setBarrierBit(false);
-        panic_if(!regdQList[rl_idx]->dispPending(),
-                 "There should be pending kernels in this queue\n");
-        DPRINTF(HSAPacketProcessor,
-                "Rescheduling active list ID %d after unsetting barrier "
-                "bit\n", rl_idx);
-        // Try to schedule wakeup in the next cycle.  There is a minimum
-        // pktProcessDelay for queue wake up. If that processing delay is
-        // elapsed, schedAQLProcessing will wakeup next tick.
-        schedAQLProcessing(rl_idx, 1);
+        // If kernels are pending in the queue after this kernel, reschedule.
+        if (regdQList[rl_idx]->dispPending()) {
+            DPRINTF(HSAPacketProcessor,
+                    "Rescheduling active list ID %d after unsetting barrier "
+                    "bit\n", rl_idx);
+            schedAQLProcessing(rl_idx);
+        }
    }

    // If set, then blocked schedule, so need to reschedule
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -66,7 +66,7 @@ typedef enum
                   // barrier packet completes.
 } Q_STATE;

-class HSADevice;
+class GPUCommandProcessor;
 class HWScheduler;

 // Our internal representation of an HSA queue
@@ -120,7 +120,7 @@ class HSAQueueDescriptor
 * FREE: Entry is empty
 * ALLOCATED: Entry has been allocated for a packet, but the DMA has not
 *            yet completed
- * SUBMITTED: Packet has been submitted to the HSADevice, but has not
+ * SUBMITTED: Packet has been submitted to the GPUCommandProcessor, but has not
 *            yet completed
 */
 class AQLRingBuffer
@@ -224,7 +224,7 @@ class HSAPacketProcessor: public DmaDevice
    friend class HWScheduler;
  protected:
    typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
-    HSADevice *hsa_device;
+    GPUCommandProcessor *gpu_device;
    HWScheduler *hwSchdlr;

    // Structure to store the read values of dependency signals
@@ -333,7 +333,7 @@ class HSAPacketProcessor: public DmaDevice
                            uint64_t queue_id,
                            uint32_t size);
    void unsetDeviceQueueDesc(uint64_t queue_id);
-    void setDevice(HSADevice * dev);
+    void setDevice(GPUCommandProcessor * dev);
    void updateReadIndex(int, uint32_t);
    void getCommandsFromHost(int pid, uint32_t rl_idx);

--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -37,8 +37,6 @@ from m5.SimObject import SimObject
 from m5.objects.Bridge import Bridge
 from m5.objects.ClockedObject import ClockedObject
 from m5.objects.Device import DmaDevice
-from m5.objects.HSADevice import HSADevice
-from m5.objects.HSADriver import HSADriver
 from m5.objects.LdsState import LdsState
 from m5.objects.Process import EmulatedDriver

@@ -239,9 +237,10 @@ class Shader(ClockedObject):
    idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
    max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")

-class GPUComputeDriver(HSADriver):
+class GPUComputeDriver(EmulatedDriver):
    type = 'GPUComputeDriver'
    cxx_header = 'gpu-compute/gpu_compute_driver.hh'
+    device = Param.GPUCommandProcessor('GPU controlled by this driver')
    isdGPU = Param.Bool(False, 'Driver is for a dGPU')
    gfxVersion = Param.GfxVersion('gfx801', 'ISA of gpu to model')
    dGPUPoolID = Param.Int(False, 'Pool ID for dGPU.')
@@ -259,11 +258,13 @@ class GPUDispatcher(SimObject):
    type = 'GPUDispatcher'
    cxx_header = 'gpu-compute/dispatcher.hh'

-class GPUCommandProcessor(HSADevice):
+class GPUCommandProcessor(DmaDevice):
    type = 'GPUCommandProcessor'
    cxx_header = 'gpu-compute/gpu_command_processor.hh'
    dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')

+    hsapp = Param.HSAPacketProcessor('PP attached to this device')
+
 class StorageClassType(Enum): vals = [
    'SC_SPILL',
    'SC_GLOBAL',
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -33,6 +33,9 @@

 #include "gpu-compute/gpu_command_processor.hh"

+#include <cassert>
+
+#include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "gpu-compute/dispatcher.hh"
@@ -42,11 +45,75 @@
 #include "sim/syscall_emul_buf.hh"

 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
-    : HSADevice(p), dispatcher(*p.dispatcher), _driver(nullptr)
+    : DmaDevice(p), dispatcher(*p.dispatcher), _driver(nullptr), hsaPP(p.hsapp)
 {
+    assert(hsaPP);
+    hsaPP->setDevice(this);
    dispatcher.setCommandProcessor(this);
 }

+HSAPacketProcessor&
+GPUCommandProcessor::hsaPacketProc()
+{
+    return *hsaPP;
+}
+
+void
+GPUCommandProcessor::dmaReadVirt(Addr host_addr, unsigned size,
+                                 DmaCallback *cb, void *data, Tick delay)
+{
+    dmaVirt(&DmaDevice::dmaRead, host_addr, size, cb, data, delay);
+}
+
+void
+GPUCommandProcessor::dmaWriteVirt(Addr host_addr, unsigned size,
+                                  DmaCallback *cb, void *data, Tick delay)
+{
+    dmaVirt(&DmaDevice::dmaWrite, host_addr, size, cb, data, delay);
+}
+
+void
+GPUCommandProcessor::dmaVirt(DmaFnPtr dmaFn, Addr addr, unsigned size,
+                             DmaCallback *cb, void *data, Tick delay)
+{
+    if (size == 0) {
+        if (cb)
+            schedule(cb->getChunkEvent(), curTick() + delay);
+        return;
+    }
+
+    // Byte cursor into the caller's buffer; advanced after every chunk.
+    uint8_t *loc_data = static_cast<uint8_t *>(data);
+
+    for (ChunkGenerator gen(addr, size, PAGE_SIZE); !gen.done(); gen.next()) {
+        Addr phys;
+
+        // Translate this chunk's virtual address to a physical address.
+        translateOrDie(gen.addr(), phys);
+
+        Event *event = cb ? cb->getChunkEvent() : nullptr;
+
+        (this->*dmaFn)(phys, gen.size(), event, loc_data, delay);
+
+        loc_data += gen.size();
+    }
+}
+
+void
+GPUCommandProcessor::translateOrDie(Addr vaddr, Addr &paddr)
+{
+    /**
+     * Translate vaddr through the page table of the process attached to
+     * thread context zero; a failed translation is fatal.  With new
+     * extensions it will likely be wrong to arbitrarily use context zero.
+     */
+    auto process = sys->threads[0]->getProcessPtr();
+
+    if (!process->pTable->translate(vaddr, paddr)) {
+        fatal("failed translation: vaddr 0x%x\n", vaddr);
+    }
+}
+
 /**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
@@ -192,12 +259,12 @@ GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
 }

 void
-GPUCommandProcessor::attachDriver(HSADriver *hsa_driver)
+GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
 {
    fatal_if(_driver, "Should not overwrite driver.");
    // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
    // Should get rid of the base class.
-    _driver = dynamic_cast<GPUComputeDriver *>(hsa_driver);
+    _driver = gpu_driver;
    assert(_driver);
 }

--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -45,17 +45,27 @@
 #ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
 #define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__

+#include <cstdint>
+#include <functional>
+
+#include "base/logging.hh"
+#include "base/trace.hh"
+#include "base/types.hh"
 #include "debug/GPUCommandProc.hh"
-#include "dev/hsa/hsa_device.hh"
+#include "dev/dma_device.hh"
+#include "dev/hsa/hsa_packet_processor.hh"
 #include "dev/hsa/hsa_signal.hh"
+#include "gpu-compute/dispatcher.hh"
 #include "gpu-compute/gpu_compute_driver.hh"
 #include "gpu-compute/hsa_queue_entry.hh"
+#include "params/GPUCommandProcessor.hh"

 struct GPUCommandProcessorParams;
+class GPUComputeDriver;
 class GPUDispatcher;
 class Shader;

-class GPUCommandProcessor : public HSADevice
+class GPUCommandProcessor : public DmaDevice
 {
  public:
    typedef GPUCommandProcessorParams Params;
@@ -64,6 +74,13 @@ class GPUCommandProcessor : public HSADevice
    GPUCommandProcessor() = delete;
    GPUCommandProcessor(const Params &p);

+    HSAPacketProcessor& hsaPacketProc();
+
+    void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb,
+                     void *data, Tick delay = 0);
+    void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *cb,
+                      void *data, Tick delay = 0);
+
    void setShader(Shader *shader);
    Shader* shader();
    GPUComputeDriver* driver();
@@ -75,12 +92,13 @@ class GPUCommandProcessor : public HSADevice
    };

    void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
-                           Addr host_pkt_addr) override;
+                           Addr host_pkt_addr);
    void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
-                           Addr host_pkt_addr) override;
+                           Addr host_pkt_addr);
    void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
-                         Addr host_pkt_addr) override;
-    void attachDriver(HSADriver *driver) override;
+                         Addr host_pkt_addr);
+    void attachDriver(GPUComputeDriver *driver);
+
    void dispatchPkt(HSAQueueEntry *task);
    void signalWakeupEvent(uint32_t event_id);

@@ -91,9 +109,9 @@ class GPUCommandProcessor : public HSADevice

    void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                         HsaSignalCallbackFunction function =
-                            [] (const uint64_t &) { }) override;
+                            [] (const uint64_t &) { });

-    uint64_t functionalReadHsaSignal(Addr signal_handle) override;
+    uint64_t functionalReadHsaSignal(Addr signal_handle);

    Addr getHsaSignalValueAddr(Addr signal_handle)
    {
@@ -115,8 +133,13 @@ class GPUCommandProcessor : public HSADevice
    GPUDispatcher &dispatcher;
    GPUComputeDriver *_driver;

+    // Typedefing dmaRead and dmaWrite function pointer
+    typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
    void initABI(HSAQueueEntry *task);
-
+    HSAPacketProcessor *hsaPP;
+    void dmaVirt(DmaFnPtr, Addr host_addr, unsigned size, DmaCallback *cb,
+                 void *data, Tick delay = 0);
+    void translateOrDie(Addr vaddr, Addr &paddr);

    /**
     * Wraps a std::function object in a DmaCallback.  Much cleaner than
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -33,22 +33,26 @@

 #include "gpu-compute/gpu_compute_driver.hh"

+#include <memory>
+
+#include "base/logging.hh"
+#include "base/trace.hh"
 #include "cpu/thread_context.hh"
 #include "debug/GPUDriver.hh"
 #include "debug/GPUShader.hh"
-#include "dev/hsa/hsa_device.hh"
 #include "dev/hsa/hsa_packet_processor.hh"
 #include "dev/hsa/kfd_event_defines.h"
 #include "dev/hsa/kfd_ioctl.h"
 #include "gpu-compute/gpu_command_processor.hh"
 #include "gpu-compute/shader.hh"
+#include "mem/port_proxy.hh"
 #include "params/GPUComputeDriver.hh"
 #include "sim/process.hh"
 #include "sim/syscall_emul_buf.hh"

 GPUComputeDriver::GPUComputeDriver(const Params &p)
-    : HSADriver(p), isdGPU(p.isdGPU), gfxVersion(p.gfxVersion),
-      dGPUPoolID(p.dGPUPoolID)
+    : EmulatedDriver(p), device(p.device), queueId(0),
+      isdGPU(p.isdGPU), gfxVersion(p.gfxVersion), dGPUPoolID(p.dGPUPoolID)
 {
    device->attachDriver(this);
    DPRINTF(GPUDriver, "Constructing KFD: device\n");
@@ -65,6 +69,146 @@ GPUComputeDriver::GPUComputeDriver(const Params &p)
        defaultMtype.set(Request::CACHED);
 }

+const char*
+GPUComputeDriver::DriverWakeupEvent::description() const
+{
+    return "DriverWakeupEvent";
+}
+
+/**
+ * Create an FD entry for the KFD inside of the owning process.
+ */
+int
+GPUComputeDriver::open(ThreadContext *tc, int mode, int flags)
+{
+    DPRINTF(GPUDriver, "Opened %s\n", filename);
+    auto process = tc->getProcessPtr();
+    auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
+    int tgt_fd = process->fds->allocFD(device_fd_entry);
+    return tgt_fd;
+}
+
+/**
+ * mmap() on the KFD: a DOORBELL request maps the packet processor's PIO
+ * range into the process; an EVENTS request reserves the event page.
+ */
+Addr
+GPUComputeDriver::mmap(ThreadContext *tc, Addr start, uint64_t length,
+                       int prot, int tgt_flags, int tgt_fd, off_t offset)
+{
+    auto process = tc->getProcessPtr();
+    auto mem_state = process->memState;
+
+    Addr pg_off = offset >> PAGE_SHIFT;
+    Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
+    DPRINTF(GPUDriver, "amdkfd mmap (start: %p, length: 0x%x,"
+            "offset: 0x%x)\n", start, length, offset);
+
+    switch(mmap_type) {
+        case KFD_MMAP_TYPE_DOORBELL:
+            DPRINTF(GPUDriver, "amdkfd mmap type DOORBELL offset\n");
+            start = mem_state->extendMmap(length);
+            process->pTable->map(start, device->hsaPacketProc().pioAddr,
+                    length, false);
+            break;
+        case KFD_MMAP_TYPE_EVENTS:
+            DPRINTF(GPUDriver, "amdkfd mmap type EVENTS offset\n");
+            panic_if(start != 0,
+                     "Start address should be provided by KFD\n");
+            panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
+                     "Requested length %d, expected length %d; length "
+                     "mismatch\n", length, 8* KFD_SIGNAL_EVENT_LIMIT);
+            /**
+             * These pages are never actually accessed; only VA space needs
+             * to be reserved.  See commit id 5ce8abce for details on events.
+             * NOTE(review): if eventPage is already set, start stays 0.
+             */
+            if (!eventPage) {
+                eventPage = mem_state->extendMmap(length);
+                start = eventPage;
+            }
+            break;
+        default:
+            warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
+            break;
+    }
+
+    return start;
+}
+
+/**
+ * Service a create-queue ioctl: assign the next queue ID, report the
+ * doorbell mmap offset, and register the queue with the packet processor.
+ * Queue IDs are never re-used and only one 4096-byte page is reserved for
+ * doorbells, so the offset sizeof(uint32_t) * queueId checked below must
+ * lie strictly inside that page; offset 4096 is already past its end.
+ */
+void
+GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
+{
+    TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
+    args.copyIn(mem_proxy);
+
+    if ((sizeof(uint32_t) * queueId) >= 4096) {
+        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
+    }
+
+    args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
+        KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;
+
+    args->queue_id = queueId++;
+    auto &hsa_pp = device->hsaPacketProc();
+    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
+                              args->ring_base_address, args->queue_id,
+                              args->ring_size);
+    args.copyOut(mem_proxy);
+}
+
+void
+GPUComputeDriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
+{
+    assert(driver);
+    driver->schedule(this, curTick() + wakeup_delay);
+}
+
+void
+GPUComputeDriver::signalWakeupEvent(uint32_t event_id)
+{
+    panic_if(event_id >= eventSlotIndex,
+        "Trying wakeup on an event that is not yet created\n");
+    if (ETable[event_id].threadWaiting) {
+        panic_if(!ETable[event_id].tc,
+                 "No thread context to wake up\n");
+        ThreadContext *tc = ETable[event_id].tc;
+        DPRINTF(GPUDriver,
+                "Signal event: Waking up CPU %d\n", tc->cpuId());
+        // Remove events that can wakeup this thread
+        TCEvents[tc].clearEvents();
+        // Now wakeup this thread
+        tc->activate();
+    } else {
+       // No thread is recorded as waiting on this event.  This may be a
+       // race between an ioctl call asking to wait on this event and this
+       // signalWakeupEvent call, so the event is latched here; the ioctl
+       // should take the necessary action when waiting on an already set
+       // event.  It may also be a genuine case in which the runtime has
+       // decided not to wait on this event, but the two cases cannot be
+       // told apart, so the event is set either way.
+       ETable[event_id].setEvent = true;
+    }
+}
+
+void
+GPUComputeDriver::DriverWakeupEvent::process()
+{
+    DPRINTF(GPUDriver,
+            "Timer event: Waking up CPU %d\n", tc->cpuId());
+    // Remove events that can wakeup this thread
+    driver->TCEvents[tc].clearEvents();
+    // Now wakeup this thread
+    tc->activate();
+}
+
 int
 GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
 {
@@ -88,7 +232,7 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

-            allocateQueue(tc, ioc_buf);
+            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
          }
--- a/src/gpu-compute/gpu_compute_driver.hh
+++ b/src/gpu-compute/gpu_compute_driver.hh
@@ -42,19 +42,33 @@
 #ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
 #define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

+#include <cassert>
+#include <cstdint>
+#include <set>
+#include <unordered_map>
+
 #include "base/addr_range_map.hh"
-#include "dev/hsa/hsa_driver.hh"
+#include "base/types.hh"
 #include "enums/GfxVersion.hh"
 #include "mem/request.hh"
+#include "sim/emul_driver.hh"

 struct GPUComputeDriverParams;
+class GPUCommandProcessor;
+class PortProxy;
+class ThreadContext;

-class GPUComputeDriver final : public HSADriver
+class GPUComputeDriver final : public EmulatedDriver
 {
  public:
    typedef GPUComputeDriverParams Params;
    GPUComputeDriver(const Params &p);
    int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;
+
+    int open(ThreadContext *tc, int mode, int flags) override;
+    Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
+              int prot, int tgt_flags, int tgt_fd, off_t offset) override;
+    void signalWakeupEvent(uint32_t event_id);
    void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
    /**
     * Called by the compute units right before a request is issued to ruby.
@@ -67,10 +81,62 @@ class GPUComputeDriver final : public HSADriver
     */
    void setMtype(RequestPtr req);

+    class DriverWakeupEvent : public Event
+    {
+      public:
+        DriverWakeupEvent(GPUComputeDriver *gpu_driver,
+                          ThreadContext *thrd_cntxt)
+          : driver(gpu_driver), tc(thrd_cntxt) {}
+        void process() override;
+        const char *description() const override;
+        void scheduleWakeup(Tick wakeup_delay);
+      private:
+        GPUComputeDriver *driver;
+        ThreadContext *tc;
+    };
+
+    class EventTableEntry
+    {
+      public:
+        EventTableEntry() :
+            mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
+        {}
+        // Mail box pointer for this address. Current implementation does not
+        // use this mailBoxPtr to notify events but directly calls
+        // signalWakeupEvent from dispatcher (GPU) to notify events. So,
+        // currently this mailBoxPtr is not used. But a future implementation
+        // may communicate to the driver using mailBoxPtr.
+        Addr mailBoxPtr;
+        // Thread context waiting on this event. We do not support multiple
+        // threads waiting on an event currently.
+        ThreadContext *tc;
+        // threadWaiting = true, if some thread context is waiting on this
+        // event. A thread context waiting on this event is put to sleep.
+        bool threadWaiting;
+        // setEvent = true, if this event is triggered but when this event
+        // triggered, no thread context was waiting on it. In the future, some
+        // thread context will try to wait on this event but since event has
+        // already happened, we will not allow that thread context to go to
+        // sleep. The above mentioned scenario can happen when the waiting
+        // thread and wakeup thread race on this event and the wakeup thread
+        // beat the waiting thread at the driver.
+        bool setEvent;
+    };
+    typedef class EventTableEntry ETEntry;
+
  private:
+    /**
+     * GPU that is controlled by this driver.
+     */
+    GPUCommandProcessor *device;
+    uint32_t queueId;
    bool isdGPU;
    GfxVersion gfxVersion;
    int dGPUPoolID;
+    Addr eventPage = 0;  // 0 until mmap() reserves the event page
+    uint32_t eventSlotIndex = 0;  // wakeups require event_id below this
+    // Event table that keeps track of events. It is indexed with event ID.
+    std::unordered_map<uint32_t, ETEntry> ETable;

    /**
     * VMA structures for GPUVM memory.
@@ -89,6 +155,37 @@ class GPUComputeDriver final : public HSADriver

    Request::CacheCoherenceFlags defaultMtype;

+    // TCEvents map keeps track of the events that can wakeup this thread. When
+    // multiple events can wake up this thread, this data structure helps to
+    // reset all events when one of those events wakes up this thread. The
+    // signal events that can wake up this thread are stored in signalEvents
+    // whereas the timer wakeup event is stored in timerEvent.
+    class EventList
+    {
+      public:
+        EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
+        EventList(GPUComputeDriver *gpu_driver, ThreadContext *thrd_cntxt)
+            : driver(gpu_driver), timerEvent(gpu_driver, thrd_cntxt)
+        { }
+        void clearEvents() {
+            assert(driver);
+            for (auto event : signalEvents) {
+                assert(event < driver->eventSlotIndex);
+                driver->ETable[event].tc = nullptr;
+                driver->ETable[event].threadWaiting = false;
+            }
+            signalEvents.clear();
+            if (timerEvent.scheduled()) {
+                driver->deschedule(timerEvent);
+            }
+        }
+        GPUComputeDriver *driver;
+        DriverWakeupEvent timerEvent;
+        // The set of events that can wake up the same thread.
+        std::set<uint32_t> signalEvents;
+    };
+    std::unordered_map<ThreadContext *, EventList> TCEvents;
+
    /**
     * Register a region of host memory as uncacheable from the perspective
     * of the dGPU.
@@ -126,6 +223,9 @@ class GPUComputeDriver final : public HSADriver
    void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
                        Addr length);
    Addr deallocateGpuVma(Addr start);
+
+    void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr);
+
 };

 #endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__