gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

5
build_opts/GCN3_X86 Normal file
View File

@@ -0,0 +1,5 @@
PROTOCOL = 'GPU_VIPER'
TARGET_ISA = 'x86'
TARGET_GPU_ISA = 'gcn3'
BUILD_GPU = True
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'

View File

@@ -48,7 +48,7 @@ def TLB_constructor(level):
maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
accessDistance = options.L%(level)dAccessDistanceStat,\
clk_domain = SrcClockDomain(\
clock = options.GPUClock,\
clock = options.gpu_clock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
@@ -60,23 +60,22 @@ def Coalescer_constructor(level):
coalescingWindow = options.L%(level)dCoalescingWindow,\
disableCoalescing = options.L%(level)dDisableCoalescing,\
clk_domain = SrcClockDomain(\
clock = options.GPUClock,\
clock = options.gpu_clock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
# arguments: options, TLB level, number of private structures for this Level,
# TLB name and Coalescer name
def create_TLB_Coalescer(options, my_level, my_index, tlb_name,
coalescer_name):
# arguments: options, TLB level, number of private structures for this
# Level, TLB name and Coalescer name
for i in range(my_index):
TLB_name.append(eval(TLB_constructor(my_level)))
Coalescer_name.append(eval(Coalescer_constructor(my_level)))
tlb_name.append(eval(TLB_constructor(my_level)))
coalescer_name.append(eval(Coalescer_constructor(my_level)))
def config_tlb_hierarchy(options, system, shader_idx):
n_cu = options.num_compute_units
# Make this configurable now, instead of the hard coded val. The dispatcher
# is always the last item in the system.cpu list.
dispatcher_idx = len(system.cpu) - 1
n_cu = options.cu_per_sa * options.sa_per_complex * \
options.num_gpu_complexes
if options.TLB_config == "perLane":
num_TLBs = 64 * n_cu
@@ -90,21 +89,26 @@ def config_tlb_hierarchy(options, system, shader_idx):
print("Bad option for TLB Configuration.")
sys.exit(1)
#----------------------------------------------------------------------------------------
#-------------------------------------------------------------------------
# A visual representation of the TLB hierarchy
# for ease of configuration
# < Modify here the width and the number of levels if you want a different configuration >
# width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level
L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
{'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
{'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]
# < Modify here the width and the number of levels if you want a different
# configuration >
# width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc)
# for this level
L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [],
'CoalescerArray': []},
{'name': 'scalar', 'width' : options.num_scalar_cache,
'TLBarray': [], 'CoalescerArray': []},
{'name': 'l1', 'width': num_TLBs, 'TLBarray': [],
'CoalescerArray': []}]
L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
TLB_hierarchy = [L1, L2, L3]
#----------------------------------------------------------------------------------------
#-------------------------------------------------------------------------
# Create the hiearchy
# Call the appropriate constructors and add objects to the system
@@ -164,17 +168,14 @@ def config_tlb_hierarchy(options, system, shader_idx):
for tlb in range(tlb_per_cu):
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
(shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
(shader_idx, cu_idx, tlb,
cu_idx*tlb_per_cu+tlb, 0))
else:
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
(shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))
elif name == 'dispatcher': # Dispatcher TLB
for index in range(TLB_type['width']):
exec('system.cpu[%d].translation_port = \
system.dispatcher_coalescer[%d].slave[0]' % \
(dispatcher_idx, index))
(shader_idx, cu_idx, tlb_per_cu,
cu_idx / (n_cu / num_TLBs),
cu_idx % (n_cu / num_TLBs)))
elif name == 'sqc': # I-TLB
for index in range(n_cu):
sqc_tlb_index = index / options.cu_per_sqc
@@ -182,7 +183,14 @@ def config_tlb_hierarchy(options, system, shader_idx):
exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
system.sqc_coalescer[%d].slave[%d]' % \
(shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))
elif name == 'scalar': # Scalar D-TLB
for index in range(n_cu):
scalar_tlb_index = index / options.cu_per_scalar_cache
scalar_tlb_port_id = index % options.cu_per_scalar_cache
exec('system.cpu[%d].CUs[%d].scalar_tlb_port = \
system.scalar_coalescer[%d].slave[%d]' % \
(shader_idx, index, scalar_tlb_index,
scalar_tlb_port_id))
# Connect the memSidePorts (masters) of all the TLBs with the
# cpuSidePorts (slaves) of the Coalescers of the next level

View File

@@ -3728,7 +3728,7 @@ namespace Gcn3ISA
DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
wf->computeUnit->cu_id, wf->wgId, refCount);
wf->computeUnit->registerManager.freeRegisters(wf);
wf->computeUnit->registerManager->freeRegisters(wf);
wf->computeUnit->completedWfs++;
wf->computeUnit->activeWaves--;

View File

@@ -192,7 +192,7 @@ namespace Gcn3ISA
*/
bool misaligned_acc = split_addr > vaddr;
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -208,7 +208,6 @@ namespace Gcn3ISA
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
delete req;
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
@@ -243,7 +242,7 @@ namespace Gcn3ISA
*/
bool misaligned_acc = split_addr > vaddr;
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -259,7 +258,6 @@ namespace Gcn3ISA
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
delete req;
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
@@ -574,7 +572,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -600,7 +599,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
@@ -619,7 +619,7 @@ namespace Gcn3ISA
{
// create request and set flags
gpuDynInst->statusBitVector = VectorMask(1);
Request *req = new Request(0, 0, 0, 0,
RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
masterId(), 0,
gpuDynInst->wfDynId);
@@ -777,7 +777,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -802,7 +803,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size,
0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -826,7 +828,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
@@ -851,7 +854,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size,
0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
@@ -875,7 +879,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId,
gpuDynInst->makeAtomicOpFunctor<T>(

View File

@@ -153,7 +153,7 @@ namespace Gcn3ISA
ComputeUnit *cu = _gpuDynInst->computeUnit();
for (auto i = 0; i < NumDwords; ++i) {
int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i);
int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
@@ -207,7 +207,7 @@ namespace Gcn3ISA
? _gpuDynInst->exec_mask : wf->execMask();
if (NumDwords == 1) {
int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx);
int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
assert(vrfData[0]);
auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
@@ -223,8 +223,8 @@ namespace Gcn3ISA
DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
} else if (NumDwords == 2) {
int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx);
int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1);
int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
assert(vrfData[0]);
@@ -605,16 +605,16 @@ namespace Gcn3ISA
if (_opIdx == REG_VCC_LO) {
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_HI) {
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_LO) {
assert(NumDwords == 1);
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
} else {
sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword);
sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
}
assert(sgprIdx > -1);

View File

@@ -101,7 +101,7 @@ HSADevice::translateOrDie(Addr vaddr, Addr &paddr)
* with new extensions, it will likely be wrong to just arbitrarily
* grab context zero.
*/
auto process = sys->getThreadContext(0)->getProcessPtr();
auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr)) {
fatal("failed translation: vaddr 0x%x\n", vaddr);

View File

@@ -92,3 +92,28 @@ HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start);
return start;
}
/**
* Forward relevant parameters to packet processor; queueID
* is used to link doorbell. The queueIDs are not re-used
* in current implementation, and we allocate only one page
* (4096 bytes) for doorbells, so check if this queue ID can
* be mapped into that page.
*/
void
HSADriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
{
TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
args.copyIn(mem_proxy);
if (queueId >= 0x1000) {
fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
}
args->queue_id = queueId++;
auto &hsa_pp = device->hsaPacketProc();
hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
args->ring_base_address, args->queue_id,
args->ring_size);
args.copyOut(mem_proxy);
}

View File

@@ -56,7 +56,7 @@
struct HSADriverParams;
class HSADevice;
class SETranslatingPortProxy;
class PortProxy;
class ThreadContext;
class HSADriver : public EmulatedDriver
@@ -74,8 +74,7 @@ class HSADriver : public EmulatedDriver
HSADevice *device;
uint32_t queueId;
void allocateQueue(const SETranslatingPortProxy &mem_proxy,
Addr ioc_buf_addr);
void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf);
};
#endif // __DEV_HSA_HSA_DRIVER_HH__

View File

@@ -151,7 +151,7 @@ HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr)
// Grab the process and try to translate the virtual address with it; with
// new extensions, it will likely be wrong to just arbitrarily grab context
// zero.
auto process = sys->getThreadContext(0)->getProcessPtr();
auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr))
fatal("failed translation: vaddr 0x%x\n", vaddr);
@@ -393,7 +393,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
* The reason for this is that the DMASequencer does
* not support atomic operations.
*/
auto tc = sys->getThreadContext(0);
auto tc = sys->threads[0];
auto &virt_proxy = tc->getVirtProxy();
TypedBufferArg<uint64_t> prev_signal(signal_addr);
prev_signal.copyIn(virt_proxy);

View File

@@ -92,7 +92,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
// We use the same mapping function used by hsa runtime to do this mapping
//
// Originally
// #define VOID_PTR_ADD32(ptr,n) \
// #define VOID_PTR_ADD32(ptr,n)
// (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
// (Addr)VOID_PTR_ADD32(0, queue_id)
Addr db_offset = queue_id;
@@ -343,7 +343,7 @@ HWScheduler::unregisterQueue(uint64_t queue_id)
// `(Addr)(VOID_PRT_ADD32(0, queue_id))`
//
// Originally
// #define VOID_PTR_ADD32(ptr,n) \
// #define VOID_PTR_ADD32(ptr,n)
// (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
// (Addr)VOID_PTR_ADD32(0, queue_id)
Addr db_offset = queue_id;

View File

@@ -1,48 +1,48 @@
# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
# For use for simulation and test purposes only
#
# For use for simulation and test purposes only
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Steve Reinhardt
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
from m5.objects.Bridge import Bridge
from m5.objects.ClockedObject import ClockedObject
from m5.objects.Device import DmaDevice
from m5.objects.Process import EmulatedDriver
from m5.objects.Bridge import Bridge
from m5.objects.HSADevice import HSADevice
from m5.objects.HSADriver import HSADriver
from m5.objects.LdsState import LdsState
from m5.objects.Process import EmulatedDriver
class PrefetchType(Enum): vals = [
'PF_CU',
@@ -52,15 +52,48 @@ class PrefetchType(Enum): vals = [
'PF_END',
]
class VectorRegisterFile(SimObject):
class PoolManager(SimObject):
type = 'PoolManager'
abstract = True
cxx_header = "gpu-compute/pool_manager.hh"
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
pool_size = Param.Int(2048, 'number of vector registers per SIMD')
# The simple pool manage only allows one workgroup to
# be executing on a CU at any given time.
class SimplePoolManager(PoolManager):
type = 'SimplePoolManager'
cxx_class = 'SimplePoolManager'
cxx_header = "gpu-compute/simple_pool_manager.hh"
class RegisterFile(SimObject):
type = 'RegisterFile'
cxx_class = 'RegisterFile'
cxx_header = 'gpu-compute/register_file.hh'
simd_id = Param.Int(-1, 'SIMD ID associated with this Register File')
num_regs = Param.Int(2048, 'number of registers in this RF')
wf_size = Param.Int(64, 'Wavefront size (in work items)')
class ScalarRegisterFile(RegisterFile):
type = 'ScalarRegisterFile'
cxx_class = 'ScalarRegisterFile'
cxx_header = 'gpu-compute/scalar_register_file.hh'
class VectorRegisterFile(RegisterFile):
type = 'VectorRegisterFile'
cxx_class = 'VectorRegisterFile'
cxx_header = 'gpu-compute/vector_register_file.hh'
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
class RegisterManager(SimObject):
type = 'RegisterManager'
cxx_class = 'RegisterManager'
cxx_header = 'gpu-compute/register_manager.hh'
policy = Param.String("static", "Register Manager Policy")
vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers')
srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers')
class Wavefront(SimObject):
type = 'Wavefront'
@@ -69,45 +102,68 @@ class Wavefront(SimObject):
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
wf_size = Param.Int(64, 'Wavefront size (in work items)')
max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the '
'instruction buffer (IB).')
# Most of the default values here are obtained from the
# AMD Graphics Core Next (GCN) Architecture whitepaper.
class ComputeUnit(ClockedObject):
type = 'ComputeUnit'
cxx_class = 'ComputeUnit'
cxx_header = 'gpu-compute/compute_unit.hh'
wavefronts = VectorParam.Wavefront('Number of wavefronts')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
# Wavefront size is 64. This is configurable, however changing
# this value to anything other than 64 will likely cause errors.
wf_size = Param.Int(64, 'Wavefront size (in work items)')
num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
'per CU')
simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit')
operand_network_length = Param.Int(1, 'number of pipe stages of operand '\
'network')
spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
'latency')
dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\
'latency')
scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU')
issue_period = Param.Int(4, 'number of cycles per issue period')
vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
'GM bus')
srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\
'to Scalar Mem bus')
vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
'LM bus')
num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
"Represents the pipeline to reach the TCP and "\
"specified in GPU clock cycles")
mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
"cu. Represents the pipeline between the TCP "\
"and cu as well as TCP data array access. "\
"Specified in GPU clock cycles")
n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\
"Represents the pipeline to reach the TCP "\
"and specified in GPU clock cycles")
mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\
"cu. Represents the pipeline between the "\
"TCP and cu as well as TCP data array "\
"access. Specified in GPU clock cycles")
system = Param.System(Parent.any, "system object")
cu_id = Param.Int('CU id')
vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
"in bytes")
coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
"in bytes")
vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\
"width in bytes")
coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\
"width in bytes")
memory_port = VectorMasterPort("Port to the memory system")
translation_port = VectorMasterPort('Port to the TLB hierarchy')
sqc_port = MasterPort("Port to the SQC (I-cache")
sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
scalar_port = MasterPort("Port to the scalar data cache")
scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache")
perLaneTLB = Param.Bool(False, "enable per-lane TLB")
prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
"(0 turns off prefetching)")
@@ -116,19 +172,22 @@ class ComputeUnit(ClockedObject):
"from last mem req in lane of "\
"CU|Phase|Wavefront")
execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
"kernel end")
countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
"and how many times")
countPages = Param.Bool(False, "Generate per-CU file of all pages "\
"touched and how many times")
scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\
"memory pipeline's queues")
global_mem_queue_size = Param.Int(256, "Number of entries in the global "
"memory pipeline's queues")
local_mem_queue_size = Param.Int(256, "Number of entries in the local "
"memory pipeline's queues")
max_wave_requests = Param.Int(64, "number of pending vector memory "\
"requests per wavefront")
max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
" of instructions that can be sent to coalescer")
ldsBus = Bridge() # the bridge between the CU and its LDS
@@ -137,72 +196,54 @@ class ComputeUnit(ClockedObject):
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
"file")
scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\
"file")
out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
" in the GM pipeline")
register_manager = Param.RegisterManager("Register Manager")
fetch_depth = Param.Int(2, 'number of i-cache lines that may be '
'buffered in the fetch unit.')
class Shader(ClockedObject):
type = 'Shader'
cxx_class = 'Shader'
cxx_header = 'gpu-compute/shader.hh'
CUs = VectorParam.ComputeUnit('Number of compute units')
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
ruby at kernel boundaries""")
separate_acquire_release = Param.Bool(False,
"""Do ld_acquire/st_release generate separate requests for the
acquire and release?""")
ruby at kernel boundaries""")
globalmem = Param.MemorySize('64kB', 'Memory size')
timing = Param.Bool(False, 'timing memory accesses')
cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
translation = Param.Bool(False, "address translation");
timer_period = Param.Clock('10us', "system timer period")
idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
class ClDriver(EmulatedDriver):
type = 'ClDriver'
cxx_header = 'gpu-compute/cl_driver.hh'
codefile = VectorParam.String('code file name(s)')
class GPUComputeDriver(HSADriver):
type = 'GPUComputeDriver'
cxx_header = 'gpu-compute/gpu_compute_driver.hh'
class GpuDispatcher(DmaDevice):
type = 'GpuDispatcher'
class GPUDispatcher(SimObject):
type = 'GPUDispatcher'
cxx_header = 'gpu-compute/dispatcher.hh'
# put at 8GB line for now
pio_addr = Param.Addr(0x200000000, "Device Address")
pio_latency = Param.Latency('1ns', "Programmed IO latency")
shader_pointer = Param.Shader('pointer to shader')
translation_port = MasterPort('Port to the dispatcher TLB')
cpu = Param.BaseCPU("CPU to wake up on kernel completion")
cl_driver = Param.ClDriver('pointer to driver')
class MemType(Enum): vals = [
'M_U8',
'M_U16',
'M_U32',
'M_U64',
'M_S8',
'M_S16',
'M_S32',
'M_S64',
'M_F16',
'M_F32',
'M_F64',
]
class GPUCommandProcessor(HSADevice):
type = 'GPUCommandProcessor'
cxx_header = 'gpu-compute/gpu_command_processor.hh'
dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')
class StorageClassType(Enum): vals = [
'SC_SPILL',
'SC_GLOBAL',
'SC_SHARED',
'SC_GROUP',
'SC_PRIVATE',
'SC_READONLY',
'SC_KERNARG',
'SC_ARG',
'SC_NONE',
]
class RegisterType(Enum): vals = [
'RT_VECTOR',
'RT_SCALAR',
'RT_CONDITION',
'RT_HARDWARE',
'RT_NONE',
]

View File

@@ -13,9 +13,9 @@
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -40,15 +40,18 @@ class GPUStaticInstFlags(Enum):
# Op types
'ALU', # ALU op
'Branch', # Branch instruction
'CondBranch', # Conditinal Branch instruction
'Nop', # No-op (no effect at all)
'Return', # Return instruction
'Return', # Subroutine return instruction
'EndOfKernel', # Kernel termination instruction
'KernelLaunch', # Kernel launch inst
'UnconditionalJump', #
'SpecialOp', # Special op
'Waitcnt', # Is a waitcnt instruction
# Memory ops
'MemBarrier', # Barrier instruction
'MemFence', # Memory fence instruction
'MemSync', # Synchronizing instruction
'MemoryRef', # References memory (load, store, or atomic)
'Flat', # Flat memory op
'Load', # Reads from memory
@@ -64,6 +67,13 @@ class GPUStaticInstFlags(Enum):
'WritesSCC', # The instruction writes SCC
'ReadsVCC', # The instruction reads VCC
'WritesVCC', # The instruction writes VCC
'ReadsEXEC', # The instruction reads Exec Mask
'WritesEXEC', # The instruction writes Exec Mask
'ReadsMode', # The instruction reads Mode register
'WritesMode', # The instruction writes Mode register
'IgnoreExec', # The instruction ignores the Exec Mask
'IsSDWA', # The instruction is a SDWA instruction
'IsDPP', # The instruction is a DPP instruction
# Atomic OP types
'AtomicAnd',
@@ -78,13 +88,6 @@ class GPUStaticInstFlags(Enum):
'AtomicMax',
'AtomicMin',
# Memory order flags
'RelaxedOrder',
'Acquire', # Has acquire semantics
'Release', # Has release semantics
'AcquireRelease', # Has acquire and release semantics
'NoOrder', # Has no ordering restrictions
# Segment access flags
'ArgSegment', # Accesses the arg segment
'GlobalSegment', # Accesses global memory
@@ -95,15 +98,17 @@ class GPUStaticInstFlags(Enum):
'SpillSegment', # Accesses the spill segment
'NoSegment', # Does not have an associated segment
# Scope flags
'WorkitemScope',
'WavefrontScope',
'WorkgroupScope',
'DeviceScope',
'SystemScope',
'NoScope', # Does not have an associated scope
# Coherence flags
'GloballyCoherent', # Coherent with other workitems on same device
'SystemCoherent' # Coherent with a different device, or the host
'GloballyCoherent', # Coherent with other work-items on same device
'SystemCoherent', # Coherent with a different device, or the host
# Floating-point flags
'F16', # F16 operation
'F32', # F32 operation
'F64', # F64 operation
# MAC, MAD, FMA
'FMA', # FMA
'MAC', # MAC
'MAD' # MAD
]

View File

@@ -41,56 +41,62 @@ SimObject('GPUStaticInstFlags.py')
SimObject('LdsState.py')
SimObject('X86GPUTLB.py')
if env['TARGET_GPU_ISA'] == 'hsail':
Source('brig_object.cc')
Source('hsail_code.cc')
Source('cl_driver.cc')
Source('compute_unit.cc')
Source('condition_register_state.cc')
Source('dispatcher.cc')
Source('exec_stage.cc')
Source('fetch_stage.cc')
Source('fetch_unit.cc')
Source('global_memory_pipeline.cc')
Source('gpu_command_processor.cc')
Source('gpu_compute_driver.cc')
Source('gpu_dyn_inst.cc')
Source('gpu_exec_context.cc')
Source('gpu_static_inst.cc')
Source('gpu_tlb.cc')
Source('hsa_object.cc')
Source('kernel_cfg.cc')
Source('lds_state.cc')
Source('local_memory_pipeline.cc')
Source('pool_manager.cc')
Source('register_file.cc')
Source('register_manager.cc')
Source('scalar_memory_pipeline.cc')
Source('scalar_register_file.cc')
Source('schedule_stage.cc')
Source('scheduler.cc')
Source('scoreboard_check_stage.cc')
Source('shader.cc')
Source('simple_pool_manager.cc')
Source('static_register_manager_policy.cc')
Source('tlb_coalescer.cc')
Source('vector_register_file.cc')
Source('vector_register_state.cc')
Source('wavefront.cc')
DebugFlag('BRIG')
DebugFlag('GPUCoalescer')
DebugFlag('GPUCommandProc')
DebugFlag('GPUDriver')
DebugFlag('GPUInitAbi')
DebugFlag('GPUDisp')
DebugFlag('GPUExec')
DebugFlag('GPUFetch')
DebugFlag('GPUHsailCFInfo')
DebugFlag('GPUKernelInfo')
DebugFlag('GPUMem')
DebugFlag('GPUPort')
DebugFlag('GPUPrefetch')
DebugFlag('GPUReg')
DebugFlag('GPURename')
DebugFlag('GPURF')
DebugFlag('GPURfState')
DebugFlag('GPUSched')
DebugFlag('GPUShader')
DebugFlag('GPUSRF')
DebugFlag('GPUSync')
DebugFlag('GPUTLB')
DebugFlag('GPUVRF')
DebugFlag('HSALoader')
DebugFlag('HSAIL')
DebugFlag('HSAILObject')
DebugFlag('GPUVRFSched')
DebugFlag('GPUWgLatency')
DebugFlag('Predictor')
DebugFlag('WavefrontStack')
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL',
'GPUVRF'])
'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
'GPUInitAbi'])

File diff suppressed because it is too large Load Diff

View File

@@ -36,28 +36,30 @@
#include <deque>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;
class NDRange;
class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
@@ -69,18 +71,6 @@ enum EXEC_POLICY
RR
};
// List of execution units
enum EXEC_UNIT
{
SIMD0 = 0,
SIMD1,
SIMD2,
SIMD3,
GLBMEM_PIPE,
LDSMEM_PIPE,
NUM_UNITS
};
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
@@ -92,32 +82,100 @@ enum TLB_CACHE
class ComputeUnit : public ClockedObject
{
public:
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
// Execution resources
//
// The ordering of units is:
// Vector ALUs
// Scalar ALUs
// GM Pipe
// LM Pipe
// Scalar Mem Pipe
//
// Note: the ordering of units is important and the code assumes the
// above ordering. However, there may be more than one resource of
// each type (e.g., 4 VALUs or 2 SALUs)
int numVectorGlobalMemUnits;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for Vector Register File->Global Memory pipe buses
WaitClass vrfToGlobalMemPipeBus;
// Resource control for Vector Global Memory execution unit
WaitClass vectorGlobalMemUnit;
int numVectorSharedMemUnits;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
// Resource control for Vector Register File->Local Memory pipe buses
WaitClass vrfToLocalMemPipeBus;
// Resource control for Vector Shared/Local Memory execution unit
WaitClass vectorSharedMemUnit;
int numScalarMemUnits;
// Resource control for scalar memory to SRF data/address bus
WaitClass scalarMemToSrfBus;
// Resource control for Scalar Register File->Scalar Memory pipe buses
WaitClass srfToScalarMemPipeBus;
// Resource control for Scalar Memory execution unit
WaitClass scalarMemUnit;
// vector ALU execution resources
int numVectorALUs;
std::vector<WaitClass> vectorALUs;
// scalar ALU execution resources
int numScalarALUs;
std::vector<WaitClass> scalarALUs;
// Return total number of execution units on this CU
int numExeUnits() const;
// index into readyList of the first memory unit
int firstMemUnit() const;
// index into readyList of the last memory unit
int lastMemUnit() const;
// index into scalarALUs vector of SALU used by the wavefront
int mapWaveToScalarAlu(Wavefront *w) const;
// index into readyList of SALU used by wavefront
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
// index into readyList of Global Memory unit used by wavefront
int mapWaveToGlobalMem(Wavefront *w) const;
// index into readyList of Local Memory unit used by wavefront
int mapWaveToLocalMem(Wavefront *w) const;
// index into readyList of Scalar Memory unit used by wavefront
int mapWaveToScalarMem(Wavefront *w) const;
int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
int numCyclesPerStoreTransfer; // number of cycles per vector store
int numCyclesPerLoadTransfer; // number of cycles per vector load
// Buffers used to communicate between various pipeline stages
// At a high level, the following intra-/inter-stage communication occurs:
// SCB to SCH: readyList provides per exec resource list of waves that
// passed dependency and readiness checks. If selected by
// scheduler, attempt to add wave to schList conditional on
// RF support.
// SCH: schList holds waves that are gathering operands or waiting
// for execution resource availability. Once ready, waves are
// placed on the dispatchList as candidates for execution. A wave
// may spend multiple cycles in SCH stage, on the schList due to
// RF access conflicts or execution resource contention.
// SCH to EX: dispatchList holds waves that are ready to be executed.
// LM/FLAT arbitration may remove an LM wave and place it
// back on the schList. RF model may also force a wave back
// to the schList if using the detailed model.
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list. readyList is
// used to communicate between scoreboardCheck stage and
// schedule stage
// TODO: make enum to index readyList
std::vector<std::vector<Wavefront*>> readyList;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList. waveStatusList is
// used to communicate between scoreboardCheck stage and
// schedule stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
// each execution resource. An EXREADY implies
// dispatch list is non-empty and
// execution unit has something to execute
// this cycle. Currently, the dispatch list of
@@ -127,32 +185,67 @@ class ComputeUnit : public ClockedObject
// and exec stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
// track presence of dynamic instructions in the Schedule pipeline
// stage. This is used to check the readiness of the oldest,
// non-dispatched instruction of every WF in the Scoreboard stage.
std::unordered_set<uint64_t> pipeMap;
RegisterManager* registerManager;
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
ScalarMemPipeline scalarMemoryPipe;
EventFunctionWrapper tickEvent;
int rrNextMemID; // used by RR WF exec policy to cycle through WF's
int rrNextALUWp;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
// Number of vector ALU units (SIMDs) in CU
int numSIMDs;
// array of scalar register files, one per SIMD
std::vector<ScalarRegisterFile*> srf;
// Width per VALU/SIMD unit: number of work items that can be executed
// on the vector ALU simultaneously in a SIMD unit
int simdWidth;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of cycles per issue period
int issuePeriod;
// number of pipe stages for scalar ALU
int scalarPipeStages;
// number of pipe stages for operand collection & distribution network
int operandNetworkLength;
// number of cycles per instruction issue period
Cycles issuePeriod;
// VRF to GM Bus latency
Cycles vrf_gm_bus_latency;
// SRF to Scalar Mem Bus latency
Cycles srf_scm_bus_latency;
// VRF to LM Bus latency
Cycles vrf_lm_bus_latency;
// Number of global and local memory execution resources in CU
int numGlbMemUnits;
int numLocMemUnits;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number of
// instructions that have been executed on a SIMD between a WF executing
// two successive instructions.
Stats::VectorDistribution instInterleave;
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
@@ -166,8 +259,10 @@ class ComputeUnit : public ClockedObject
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
bool xact_cas_mode;
bool debugSegFault;
// Idle CU timeout in ticks
Tick idleCUTimeout;
int idleWfs;
bool functionalTLB;
bool localMemBarrier;
@@ -183,91 +278,67 @@ class ComputeUnit : public ClockedObject
Shader *shader;
uint32_t barrier_id;
// vector of Vector ALU (MACC) pipelines
std::vector<WaitClass> aluPipe;
// minimum issue period per SIMD unit (in cycles)
std::vector<WaitClass> wfWait;
// Resource control for Vector Register File->Global Memory pipe buses
std::vector<WaitClass> vrfToGlobalMemPipeBus;
// Resource control for Vector Register File->Local Memory pipe buses
std::vector<WaitClass> vrfToLocalMemPipeBus;
int nextGlbMemBus;
int nextLocMemBus;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
Tick req_tick_latency;
Tick resp_tick_latency;
// number of vector registers being reserved for each SIMD unit
/**
* Number of WFs to schedule to each SIMD. This vector is populated
* by hasDispResources(), and consumed by the subsequent call to
* dispWorkgroup(), to schedule the specified number of WFs to the
* SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
*/
std::vector<int> numWfsToSched;
// number of currently reserved vector registers per SIMD unit
std::vector<int> vectorRegsReserved;
// number of currently reserved scalar registers per SIMD unit
std::vector<int> scalarRegsReserved;
// number of vector registers per SIMD unit
uint32_t numVecRegsPerSimd;
// Support for scheduling VGPR status update events
std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
std::vector<uint64_t> timestampVec;
std::vector<uint8_t> statusVec;
int numVecRegsPerSimd;
// number of available scalar registers per SIMD unit
int numScalarRegsPerSimd;
void
registerEvent(uint32_t simdId,
uint32_t regIdx,
uint32_t operandSize,
uint64_t when,
uint8_t newStatus) {
regIdxVec.push_back(std::make_pair(simdId, regIdx));
timestampVec.push_back(when);
statusVec.push_back(newStatus);
if (operandSize > 4) {
regIdxVec.push_back(std::make_pair(simdId,
((regIdx + 1) %
numVecRegsPerSimd)));
timestampVec.push_back(when);
statusVec.push_back(newStatus);
}
}
void updateEvents();
void updateReadyList(int unitId);
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
void insertInPipeMap(Wavefront *w);
void deleteFromPipeMap(Wavefront *w);
ComputeUnit(const Params *p);
~ComputeUnit();
int spBypassLength() { return spBypassPipeLength; };
int dpBypassLength() { return dpBypassPipeLength; };
int storeBusLength() { return numCyclesPerStoreTransfer; };
int loadBusLength() { return numCyclesPerLoadTransfer; };
int wfSize() const { return wavefrontSize; };
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Timing Functions
int oprNetPipeLength() const { return operandNetworkLength; }
int simdUnitWidth() const { return simdWidth; }
int spBypassLength() const { return spBypassPipeLength; }
int dpBypassLength() const { return dpBypassPipeLength; }
int scalarPipeLength() const { return scalarPipeStages; }
int storeBusLength() const { return numCyclesPerStoreTransfer; }
int loadBusLength() const { return numCyclesPerLoadTransfer; }
int wfSize() const { return wavefrontSize; }
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void fillKernelState(Wavefront *w, NDRange *ndr);
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
NDRange *ndr);
HSAQueueEntry *task, bool fetchContext=false);
void StartWorkgroup(NDRange *ndr);
int ReadyWorkgroup(NDRange *ndr);
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false);
bool hasDispResources(HSAQueueEntry *task);
int cacheLineSize() const { return _cacheLineSize; }
int getCacheLineBits() const { return cacheLineBits; }
bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
int GlbMemUnitId() { return GLBMEM_PIPE; }
int ShrMemUnitId() { return LDSMEM_PIPE; }
int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
/* This function cycles through all the wavefronts in all the phases to see
* if all of the wavefronts which should be associated with one barrier
* (denoted with _barrier_id), are all at the same barrier in the program
@@ -275,14 +346,15 @@ class ComputeUnit : public ClockedObject
* return true.
*/
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
bool cedeSIMD(int simdId, int wfSlotId);
template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
template<typename c0, typename c1>
void doSmReturn(GPUDynInstPtr gpuDynInst);
virtual void init() override;
void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
bool kernelLaunch=true,
bool kernelMemSync,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
@@ -292,7 +364,7 @@ class ComputeUnit : public ClockedObject
MasterID masterId() { return _masterId; }
bool isDone() const;
bool isSimdDone(uint32_t) const;
bool isVectorAluIdle(uint32_t simdId) const;
protected:
MasterID _masterId;
@@ -323,6 +395,44 @@ class ComputeUnit : public ClockedObject
Stats::Scalar scalarMemReads;
Stats::Formula scalarMemReadsPerWF;
Stats::Formula vectorMemReadsPerKiloInst;
Stats::Formula vectorMemWritesPerKiloInst;
Stats::Formula vectorMemInstsPerKiloInst;
Stats::Formula scalarMemReadsPerKiloInst;
Stats::Formula scalarMemWritesPerKiloInst;
Stats::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
Stats::Vector instCyclesVMemPerSimd;
Stats::Vector instCyclesScMemPerSimd;
Stats::Vector instCyclesLdsPerSimd;
Stats::Scalar globalReads;
Stats::Scalar globalWrites;
Stats::Formula globalMemInsts;
Stats::Scalar argReads;
Stats::Scalar argWrites;
Stats::Formula argMemInsts;
Stats::Scalar spillReads;
Stats::Scalar spillWrites;
Stats::Formula spillMemInsts;
Stats::Scalar groupReads;
Stats::Scalar groupWrites;
Stats::Formula groupMemInsts;
Stats::Scalar privReads;
Stats::Scalar privWrites;
Stats::Formula privMemInsts;
Stats::Scalar readonlyReads;
Stats::Scalar readonlyWrites;
Stats::Formula readonlyMemInsts;
Stats::Scalar kernargReads;
Stats::Scalar kernargWrites;
Stats::Formula kernargMemInsts;
int activeWaves;
Stats::Distribution waveLevelParallelism;
void updateInstStats(GPUDynInstPtr gpuDynInst);
// the following stats compute the avg. TLB accesslatency per
@@ -339,21 +449,48 @@ class ComputeUnit : public ClockedObject
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
Stats::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
Stats::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
Stats::Scalar dynamicFlatMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
Stats::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
// when the instruction is committed, this number is still incremented by 1
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
Stats::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
Stats::Distribution execRateDist;
// number of individual vector operations executed
Stats::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
Stats::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
Stats::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
Stats::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedFMA16;
Stats::Scalar numVecOpsExecutedFMA32;
Stats::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAC16;
Stats::Scalar numVecOpsExecutedMAC32;
Stats::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAD16;
Stats::Scalar numVecOpsExecutedMAD32;
Stats::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
Stats::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
Stats::Scalar totalCycles;
Stats::Formula vpc; // vector ops per cycle
Stats::Formula vpc_f16; // vector ops per cycle
Stats::Formula vpc_f32; // vector ops per cycle
Stats::Formula vpc_f64; // vector ops per cycle
Stats::Formula ipc; // vector instructions per cycle
Stats::Distribution controlFlowDivergenceDist;
Stats::Distribution activeLanesPerGMemInstrDist;
@@ -362,20 +499,16 @@ class ComputeUnit : public ClockedObject
Stats::Formula numALUInstsExecuted;
// number of times a WG can not start due to lack of free VGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG can not start due to lack of free SGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueSgprAlloc;
Stats::Scalar numCASOps;
Stats::Scalar numFailedCASOps;
Stats::Scalar completedWfs;
// flag per vector SIMD unit that is set when there is at least one
// WV that has a vector ALU instruction as the oldest in its
// Instruction Buffer: Defined in the Scoreboard stage, consumed
// by the Execute stage.
std::vector<bool> vectorAluInstAvail;
// number of available (oldest) LDS instructions that could have
// been issued to the LDS at a specific issue slot
int shrMemInstAvail;
// number of available Global memory instructions that could have
// been issued to TCP at a specific issue slot
int glbMemInstAvail;
Stats::Scalar completedWGs;
// distrubtion in latency difference between first and last cache block
// arrival ticks
Stats::Distribution headTailLatency;
void
regStats() override;
@@ -389,8 +522,6 @@ class ComputeUnit : public ClockedObject
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
int cacheLineSize() const { return _cacheLineSize; }
bool
sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
@@ -486,6 +617,56 @@ class ComputeUnit : public ClockedObject
};
// Scalar data cache access port
class ScalarDataPort : public MasterPort
{
public:
ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
PortID _index)
: MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index)
{
(void)index;
}
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override;
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst), saved(sender_state)
{
}
GPUDynInstPtr _gpuDynInst;
Packet::SenderState *saved;
};
class MemReqEvent : public Event
{
private:
ScalarDataPort *scalarDataPort;
PacketPtr pkt;
public:
MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
: Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
PortID index;
};
// Instruction cache access port
class SQCPort : public MasterPort
{
@@ -500,10 +681,13 @@ class ComputeUnit : public ClockedObject
{
Wavefront *wavefront;
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr)
: wavefront(_wavefront), saved(sender_state) { }
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
@@ -575,6 +759,34 @@ class ComputeUnit : public ClockedObject
virtual void recvReqRetry();
};
class ScalarDTLBPort : public MasterPort
{
public:
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
: MasterPort(_name, _cu), computeUnit(_cu), stalled(false)
{
}
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
GPUDynInstPtr _gpuDynInst;
};
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override { assert(false); }
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
bool stalled;
};
class ITLBPort : public MasterPort
{
public:
@@ -710,6 +922,10 @@ class ComputeUnit : public ClockedObject
std::vector<DataPort*> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort*> tlbPort;
// port to the scalar data cache
ScalarDataPort *scalarDataPort;
// port to the scalar data TLB
ScalarDTLBPort *scalarDTLBPort;
// port to the SQC (i.e. the I-cache)
SQCPort *sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
@@ -726,6 +942,14 @@ class ComputeUnit : public ClockedObject
tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
this, idx);
return *tlbPort[idx];
} else if (if_name == "scalar_port") {
scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
idx), this, idx);
return *scalarDataPort;
} else if (if_name == "scalar_tlb_port") {
scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
this);
return *scalarDTLBPort;
} else if (if_name == "sqc_port") {
sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
this, idx);
@@ -746,32 +970,18 @@ class ComputeUnit : public ClockedObject
}
}
// xact_cas_load()
class waveIdentifier
{
public:
waveIdentifier() { }
waveIdentifier(int _simdId, int _wfSlotId)
: simdId(_simdId), wfSlotId(_wfSlotId) { }
int simdId;
int wfSlotId;
};
class waveQueue
{
public:
std::list<waveIdentifier> waveIDQueue;
};
std::map<unsigned, waveQueue> xactCasLoadMap;
uint64_t getAndIncSeqNum() { return globalSeqNum++; }
InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
private:
const int _cacheLineSize;
uint64_t globalSeqNum;
int cacheLineBits;
InstSeqNum globalSeqNum;
int wavefrontSize;
GPUStaticInst *kernelLaunchInst;
// hold the time of the arrival of the first cache block related to
// a particular GPUDynInst. This is used to calculate the difference
// between the first and last chace block arrival times.
std::map<GPUDynInstPtr, Tick> headTailMap;
};
#endif // __COMPUTE_UNIT_HH__

View File

@@ -34,66 +34,76 @@
#include "gpu-compute/dispatcher.hh"
#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "debug/GPUKernelInfo.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"
#include "sim/syscall_emul_buf.hh"
#include "sim/system.hh"
GpuDispatcher *GpuDispatcher::instance = nullptr;
GpuDispatcher::GpuDispatcher(const Params *p)
: DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
dispatchCount(0), dispatchActive(false), cpu(p->cpu),
shader(p->shader_pointer), driver(p->cl_driver),
tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
false, Event::CPU_Tick_Pri)
GPUDispatcher::GPUDispatcher(const Params *p)
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
tickEvent([this]{ exec(); },
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
dispatchActive(false)
{
shader->handshake(this);
driver->handshake(this);
ndRange.wg_disp_rem = false;
ndRange.globalWgId = 0;
schedule(&tickEvent, 0);
// translation port for the dispatcher
tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
num_kernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
}
GpuDispatcher *GpuDispatcherParams::create()
GPUDispatcher::~GPUDispatcher()
{
GpuDispatcher *dispatcher = new GpuDispatcher(this);
GpuDispatcher::setInstance(dispatcher);
return GpuDispatcher::getInstance();
}
void
GpuDispatcher::serialize(CheckpointOut &cp) const
GPUDispatcher::regStats()
{
numKernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
cyclesWaitingForDispatch
.name(name() + ".cycles_wait_dispatch")
.desc("number of cycles with outstanding wavefronts "
"that are waiting to be dispatched")
;
}
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
return hsaQueueEntries[disp_id];
}
void
GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
gpuCmdProc = gpu_cmd_proc;
}
void
GPUDispatcher::setShader(Shader *new_shader)
{
shader = new_shader;
}
void
GPUDispatcher::serialize(CheckpointOut &cp) const
{
Tick event_tick = 0;
if (ndRange.wg_disp_rem)
fatal("Checkpointing not supported during active workgroup execution");
if (tickEvent.scheduled())
event_tick = tickEvent.when();
SERIALIZE_SCALAR(event_tick);
}
void
GpuDispatcher::unserialize(CheckpointIn &cp)
GPUDispatcher::unserialize(CheckpointIn &cp)
{
Tick event_tick;
@@ -102,288 +112,256 @@ GpuDispatcher::unserialize(CheckpointIn &cp)
UNSERIALIZE_SCALAR(event_tick);
if (event_tick)
if (event_tick) {
schedule(&tickEvent, event_tick);
}
}
AddrRangeList
GpuDispatcher::getAddrRanges() const
/**
* After all relevant HSA data structures have been traversed/extracted
* from memory by the CP, dispatch() is called on the dispatcher. This will
* schedule a dispatch event that, when triggered, will attempt to dispatch
* the WGs associated with the given task to the CUs.
*/
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
AddrRangeList ranges;
++numKernelLaunched;
DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
pioAddr, pioSize);
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
task->kernelName(), task->dispatchId());
ranges.push_back(RangeSize(pioAddr, pioSize));
execIds.push(task->dispatchId());
dispatchActive = true;
hsaQueueEntries.emplace(task->dispatchId(), task);
return ranges;
}
Tick
GpuDispatcher::read(PacketPtr pkt)
{
assert(pkt->getAddr() >= pioAddr);
assert(pkt->getAddr() < pioAddr + pioSize);
int offset = pkt->getAddr() - pioAddr;
pkt->allocate();
DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
if (offset < 8) {
assert(!offset);
assert(pkt->getSize() == 8);
uint64_t retval = dispatchActive;
pkt->setLE(retval);
} else {
offset -= 8;
assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
char *curTaskPtr = (char*)&curTask;
memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
pkt->makeAtomicResponse();
return pioDelay;
}
Tick
GpuDispatcher::write(PacketPtr pkt)
{
assert(pkt->getAddr() >= pioAddr);
assert(pkt->getAddr() < pioAddr + pioSize);
int offset = pkt->getAddr() - pioAddr;
#if TRACING_ON
uint64_t data_val = 0;
switch (pkt->getSize()) {
case 1:
data_val = pkt->getLE<uint8_t>();
break;
case 2:
data_val = pkt->getLE<uint16_t>();
break;
case 4:
data_val = pkt->getLE<uint32_t>();
break;
case 8:
data_val = pkt->getLE<uint64_t>();
break;
default:
DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
}
DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
pkt->getSize());
#endif
if (!offset) {
static int nextId = 0;
// The depends field of the qstruct, which was previously unused, is
// used to communicate with simulated application.
if (curTask.depends) {
HostState hs;
shader->ReadMem((uint64_t)(curTask.depends), &hs,
sizeof(HostState), 0);
// update event start time (in nano-seconds)
uint64_t start = curTick() / 1000;
shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
&start, sizeof(uint64_t), 0);
}
// launch kernel
++num_kernelLaunched;
NDRange *ndr = &(ndRangeMap[nextId]);
// copy dispatch info
ndr->q = curTask;
// update the numDispTask polled by the runtime
accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
ndr->numWgTotal = 1;
for (int i = 0; i < 3; ++i) {
ndr->wgId[i] = 0;
ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
ndr->numWgTotal *= ndr->numWg[i];
}
ndr->numWgCompleted = 0;
ndr->globalWgId = 0;
ndr->wg_disp_rem = true;
ndr->execDone = false;
ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
ndr->dispatchId = nextId;
ndr->curCid = pkt->req->contextId();
DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
execIds.push(nextId);
++nextId;
dispatchActive = true;
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->ticks(1));
}
} else {
// populate current task struct
// first 64 bits are launch reg
offset -= 8;
assert(offset < sizeof(HsaQueueEntry));
char *curTaskPtr = (char*)&curTask;
memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
}
pkt->makeAtomicResponse();
return pioDelay;
}
Port &
GpuDispatcher::getPort(const std::string &if_name, PortID idx)
{
if (if_name == "translation_port") {
return *tlbPort;
}
return DmaDevice::getPort(if_name, idx);
}
void
GpuDispatcher::exec()
GPUDispatcher::exec()
{
int fail_count = 0;
int fail_count(0);
// There are potentially multiple outstanding kernel launches.
// It is possible that the workgroups in a different kernel
// can fit on the GPU even if another kernel's workgroups cannot
/**
* There are potentially multiple outstanding kernel launches.
* It is possible that the workgroups in a different kernel
* can fit on the GPU even if another kernel's workgroups cannot
*/
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
if (execIds.size() > 0) {
++cyclesWaitingForDispatch;
}
/**
* dispatch work cannot start until the kernel's invalidate is
* completely finished; hence, kernel will always initiates
* invalidate first and keeps waiting until inv done
*/
while (execIds.size() > fail_count) {
int execId = execIds.front();
int exec_id = execIds.front();
auto task = hsaQueueEntries[exec_id];
bool launched(false);
while (ndRangeMap[execId].wg_disp_rem) {
//update the thread context
shader->updateContext(ndRangeMap[execId].curCid);
// invalidate is needed before starting dispatch
if (shader->impl_kern_boundary_sync) {
// try to invalidate cache
shader->prepareInvalidate(task);
} else {
// kern boundary sync is not set, skip invalidate
task->markInvDone();
}
// attempt to dispatch_workgroup
if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
// if we failed try the next kernel,
// it may have smaller workgroups.
// put it on the queue to rety latter
DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
execIds.push(execId);
/**
* invalidate is still ongoing, put the kernel on the queue to
* retry later
*/
if (!task->isInvDone()){
execIds.push(exec_id);
++fail_count;
DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
" invalidate requests\n", exec_id, task->outstandingInvs());
// try the next kernel_id
execIds.pop();
continue;
}
// kernel invalidate is done, start workgroup dispatch
while (!task->dispComplete()) {
// update the thread context
shader->updateContext(task->contextId());
// attempt to dispatch workgroup
DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
curTick(), exec_id);
if (!shader->dispatchWorkgroups(task)) {
/**
* if we failed try the next kernel,
* it may have smaller workgroups.
* put it on the queue to rety latter
*/
DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
execIds.push(exec_id);
++fail_count;
break;
} else if (!launched) {
launched = true;
DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
}
}
// let's try the next kernel_id
// try the next kernel_id
execIds.pop();
}
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
if (doneIds.size() && cpu) {
shader->hostWakeUp(cpu);
}
while (doneIds.size()) {
// wakeup the CPU if any Kernels completed this cycle
DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
doneIds.pop();
}
}
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
bool
GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
int kern_id = w->kernId;
DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
assert(ndRangeMap[kern_id].dispatchId == kern_id);
ndRangeMap[kern_id].numWgCompleted++;
int kern_id = wf->kernId;
assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
auto task = hsaQueueEntries[kern_id];
assert(task->dispatchId() == kern_id);
if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
ndRangeMap[kern_id].execDone = true;
doneIds.push(kern_id);
/**
* whether the next workgroup is the final one in the kernel,
* +1 as we check first before taking action
*/
return (task->numWgCompleted() + 1 == task->numWgTotal());
}
if (ndRangeMap[kern_id].addrToNotify) {
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
0);
/**
* update the counter of oustanding inv requests for the kernel
* kern_id: kernel id
* val: +1/-1, increment or decrement the counter (default: -1)
*/
void
GPUDispatcher::updateInvCounter(int kern_id, int val) {
assert(val == -1 || val == 1);
auto task = hsaQueueEntries[kern_id];
task->updateOutstandingInvs(val);
// kernel invalidate is done, schedule dispatch work
if (task->isInvDone() && !tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
/**
* update the counter of oustanding wb requests for the kernel
* kern_id: kernel id
* val: +1/-1, increment or decrement the counter (default: -1)
*
* return true if all wbs are done for the kernel
*/
bool
GPUDispatcher::updateWbCounter(int kern_id, int val) {
assert(val == -1 || val == 1);
auto task = hsaQueueEntries[kern_id];
task->updateOutstandingWbs(val);
// true: WB is done, false: WB is still ongoing
return (task->outstandingWbs() == 0);
}
/**
* get kernel's outstanding cache writeback requests
*/
int
GPUDispatcher::getOutstandingWbs(int kernId) {
auto task = hsaQueueEntries[kernId];
return task->outstandingWbs();
}
/**
* When an end program instruction detects that the last WF in
* a WG has completed it will call this method on the dispatcher.
* If we detect that this is the last WG for the given task, then
* we ring the completion signal, which is used by the CPU to
* synchronize with the GPU. The HSAPP is also notified that the
* task has completed so it can be removed from its task queues.
*/
void
GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
int kern_id = wf->kernId;
DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
auto task = hsaQueueEntries[kern_id];
assert(task->dispatchId() == kern_id);
task->notifyWgCompleted();
DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);
if (task->numWgCompleted() == task->numWgTotal()) {
// Notify the HSA PP that this kernel is complete
gpuCmdProc->hsaPacketProc()
.finishPkt(task->dispPktPtr(), task->queueId());
if (task->completionSignal()) {
// The signal value is aligned 8 bytes from
// the actual handle in the runtime
Addr signal_addr = task->completionSignal() + sizeof(Addr);
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
"completion signal: %x!\n", signal_addr);
/**
* HACK: The semantics of the HSA signal is to decrement
* the current signal value. We cheat here and read out
* he value from main memory using functional access and
* then just DMA the decremented value. This is because
* the DMA controller does not currently support GPU
* atomics.
*/
auto *tc = gpuCmdProc->system()->threads[0];
auto &virt_proxy = tc->getVirtProxy();
TypedBufferArg<Addr> prev_signal(signal_addr);
prev_signal.copyIn(virt_proxy);
Addr *new_signal = new Addr;
*new_signal = (Addr)*prev_signal - 1;
gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
new_signal, 0);
} else {
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
"signal\n");
}
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
// update event end time (in nano-seconds)
if (ndRangeMap[kern_id].q.depends) {
HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
uint64_t event;
shader->ReadMem((uint64_t)(&host_state->event), &event,
sizeof(uint64_t), 0);
uint64_t end = curTick() / 1000;
shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
sizeof(uint64_t), 0);
}
DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
curTick(), kern_id);
DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
}
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->ticks(1));
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
void
GpuDispatcher::scheduleDispatch()
GPUDispatcher::scheduleDispatch()
{
if (!tickEvent.scheduled())
schedule(&tickEvent, curTick() + shader->ticks(1));
}
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
if (cpu) {
if (off) {
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
true);
val += off;
}
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
} else {
panic("Cannot find host");
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
// helper functions for driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
GPUDispatcher *GPUDispatcherParams::create()
{
return shader->cuList.size();
}
int
GpuDispatcher::wfSize() const
{
return shader->cuList[0]->wfSize();
}
void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
shader->funcargs_size = funcargs_size;
}
uint32_t
GpuDispatcher::getStaticContextSize() const
{
return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
return new GPUDispatcher(this);
}

View File

@@ -31,125 +31,69 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_DISPATCHER_HH__
#define __GPU_DISPATCHER_HH__
/**
* @file
* The GPUDispatcher is the component of the shader that is responsible
* for creating and dispatching WGs to the compute units. If all WGs in
* a kernel cannot be dispatched simultaneously, then the dispatcher will
* keep track of all pending WGs and dispatch them as resources become
* available.
*/
#ifndef __GPU_COMPUTE_DISPATCHER_HH__
#define __GPU_COMPUTE_DISPATCHER_HH__
#include <queue>
#include <unordered_map>
#include <vector>
#include "base/statistics.hh"
#include "dev/dma_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/ndrange.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/port.hh"
#include "params/GpuDispatcher.hh"
#include "dev/hsa/hsa_packet.hh"
#include "params/GPUDispatcher.hh"
#include "sim/sim_object.hh"
class BaseCPU;
class GPUCommandProcessor;
class HSAQueueEntry;
class Shader;
class Wavefront;
class GpuDispatcher : public DmaDevice
class GPUDispatcher : public SimObject
{
public:
typedef GpuDispatcherParams Params;
public:
typedef GPUDispatcherParams Params;
MasterID masterId() { return _masterId; }
GPUDispatcher(const Params *p);
~GPUDispatcher();
protected:
MasterID _masterId;
void serialize(CheckpointOut &cp) const override;
void unserialize(CheckpointIn &cp) override;
void regStats() override;
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
void setShader(Shader *new_shader);
void exec();
bool isReachingKernelEnd(Wavefront *wf);
void updateInvCounter(int kern_id, int val=-1);
bool updateWbCounter(int kern_id, int val=-1);
int getOutstandingWbs(int kern_id);
void notifyWgCompl(Wavefront *wf);
void scheduleDispatch();
void dispatch(HSAQueueEntry *task);
HSAQueueEntry* hsaTask(int disp_id);
// Base and length of PIO register space
Addr pioAddr;
Addr pioSize;
Tick pioDelay;
HsaQueueEntry curTask;
std::unordered_map<int, NDRange> ndRangeMap;
NDRange ndRange;
// list of kernel_ids to launch
std::queue<int> execIds;
// list of kernel_ids that have finished
std::queue<int> doneIds;
uint64_t dispatchCount;
// is there a kernel in execution?
bool dispatchActive;
BaseCPU *cpu;
Shader *shader;
ClDriver *driver;
EventFunctionWrapper tickEvent;
static GpuDispatcher *instance;
// sycall emulation mode can have only 1 application running(?)
// else we have to do some pid based tagging
// unused
typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
TranslationBuffer tlb;
public:
/*statistics*/
Stats::Scalar num_kernelLaunched;
GpuDispatcher(const Params *p);
~GpuDispatcher() { }
void exec();
virtual void serialize(CheckpointOut &cp) const override;
virtual void unserialize(CheckpointIn &cp) override;
void notifyWgCompl(Wavefront *w);
void scheduleDispatch();
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
// using singleton so that glue code can pass pointer locations
// to the dispatcher. when there are multiple dispatchers, we can
// call something like getInstance(index)
static void
setInstance(GpuDispatcher *_instance)
{
instance = _instance;
}
static GpuDispatcher* getInstance() { return instance; }
class TLBPort : public MasterPort
{
public:
TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
: MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
protected:
GpuDispatcher *dispatcher;
virtual bool recvTimingResp(PacketPtr pkt) { return true; }
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry() { }
};
TLBPort *tlbPort;
Port &getPort(const std::string &if_name,
PortID idx=InvalidPortID) override;
AddrRangeList getAddrRanges() const override;
Tick read(PacketPtr pkt) override;
Tick write(PacketPtr pkt) override;
// helper functions to retrieve/set GPU attributes
int getNumCUs();
int wfSize() const;
void setFuncargsSize(int funcargs_size);
/** Returns the size of the static hardware context of a wavefront */
uint32_t getStaticContextSize() const;
private:
Shader *shader;
GPUCommandProcessor *gpuCmdProc;
EventFunctionWrapper tickEvent;
std::unordered_map<int, HSAQueueEntry*> hsaQueueEntries;
// list of kernel_ids to launch
std::queue<int> execIds;
// list of kernel_ids that have finished
std::queue<int> doneIds;
// is there a kernel in execution?
bool dispatchActive;
/*statistics*/
Stats::Scalar numKernelLaunched;
Stats::Scalar cyclesWaitingForDispatch;
};
#endif // __GPU_DISPATCHER_HH__
#endif // __GPU_COMPUTE_DISPATCHER_HH__

View File

@@ -33,13 +33,15 @@
#include "gpu-compute/exec_stage.hh"
#include <sstream>
#include "base/trace.hh"
#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
executionResourcesUsed(0)
{
@@ -53,37 +55,18 @@ ExecStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".ExecStage";
dispatchList = &computeUnit->dispatchList;
vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
glbMemInstAvail= &(computeUnit->glbMemInstAvail);
shrMemInstAvail= &(computeUnit->shrMemInstAvail);
idle_dur = 0;
}
void
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
if (stage == IdleExec) {
// count cycles of no vector ALU instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
numCyclesWithNoInstrTypeIssued[unitId]++;
}
// count cycles of no global memory (vector) instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
numCyclesWithNoInstrTypeIssued[unitId]++;
(*glbMemInstAvail)--;
}
// count cycles of no shared memory (vector) instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
numCyclesWithNoInstrTypeIssued[unitId]++;
(*shrMemInstAvail)--;
}
// count cycles when no instruction to a specific execution resource
// is executed
numCyclesWithNoInstrTypeIssued[unitId]++;
} else if (stage == BusyExec) {
// count the number of cycles an instruction to a specific unit
// was issued
// count the number of cycles an instruction to a specific execution
// resource type was issued
numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
@@ -102,14 +85,13 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
}
lastTimeInstExecuted = thisTimeInstExecuted;
// track the number of cycles we either issued one vector instruction
// or issued no instructions at all
// track the number of cycles we either issued at least
// instruction or issued no instructions at all
if (instrExecuted) {
numCyclesWithInstrIssued++;
} else {
numCyclesWithNoIssue++;
}
spc.sample(executionResourcesUsed);
}
}
@@ -122,25 +104,86 @@ ExecStage::initStatistics()
thisTimeInstExecuted = false;
}
std::string
ExecStage::dispStatusToStr(int i)
{
std::string s("INVALID");
switch (i) {
case EMPTY:
s = "EMPTY";
break;
case SKIP:
s = "SKIP";
break;
case EXREADY:
s = "EXREADY";
break;
}
return s;
}
void
ExecStage::dumpDispList()
{
std::stringstream ss;
bool empty = true;
for (int i = 0; i < computeUnit->numExeUnits(); i++) {
DISPATCH_STATUS s = dispatchList->at(i).second;
ss << i << ": " << dispStatusToStr(s);
if (s != EMPTY) {
empty = false;
Wavefront *w = dispatchList->at(i).first;
ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
ss << (w->instructionBuffer.front())->seqNum() << ": ";
ss << (w->instructionBuffer.front())->disassemble();
}
ss << "\n";
}
if (!empty) {
DPRINTF(GPUSched, "Dispatch List:\n%s", ss.str());
}
}
void
ExecStage::exec()
{
initStatistics();
for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
// if dispatch list for this execution resource is empty,
// skip this execution resource this cycle
if (dispatchList->at(unitId).second == EMPTY) {
collectStatistics(IdleExec, unitId);
continue;
}
collectStatistics(BusyExec, unitId);
// execute an instruction for the WF
dispatchList->at(unitId).first->exec();
// clear the dispatch list entry
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first = (Wavefront*)nullptr;
if (Debug::GPUSched) {
dumpDispList();
}
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
DISPATCH_STATUS s = dispatchList->at(unitId).second;
switch (s) {
case EMPTY:
// Do not execute if empty, waiting for VRF reads,
// or LM tied to GM waiting for VRF reads
collectStatistics(IdleExec, unitId);
break;
case EXREADY:
{
collectStatistics(BusyExec, unitId);
Wavefront *w = dispatchList->at(unitId).first;
DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
unitId, w->simdId, w->wfDynId,
(w->instructionBuffer.front())->disassemble());
DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
dispatchList->at(unitId).first->exec();
(computeUnit->scheduleStage).deleteFromSch(w);
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first->freeResources();
dispatchList->at(unitId).first = nullptr;
break;
}
case SKIP:
collectStatistics(BusyExec, unitId);
DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first->freeResources();
dispatchList->at(unitId).first = nullptr;
break;
default:
panic("Unknown dispatch status in exec()\n");
}
}
collectStatistics(PostExec, 0);
@@ -165,7 +208,7 @@ ExecStage::regStats()
;
spc
.init(0, numSIMDs + numMemUnits, 1)
.init(0, computeUnit->numExeUnits(), 1)
.name(name() + ".spc")
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
;
@@ -177,25 +220,36 @@ ExecStage::regStats()
;
numCyclesWithInstrTypeIssued
.init(numSIMDs + numMemUnits)
.name(name() + ".num_cycles_with_instrtype_issue")
.desc("Number of cycles at least one instruction of specific type "
"issued")
.init(computeUnit->numExeUnits())
.name(name() + ".num_cycles_issue_exec_rsrc")
.desc("Number of cycles at least one instruction issued to "
"execution resource type")
;
numCyclesWithNoInstrTypeIssued
.init(numSIMDs + numMemUnits)
.name(name() + ".num_cycles_with_instr_type_no_issue")
.desc("Number of cycles no instruction of specific type issued")
.init(computeUnit->numExeUnits())
.name(name() + ".num_cycles_no_issue_exec_rsrc")
.desc("Number of clks no instructions issued to execution "
"resource type")
;
for (int i = 0; i < numSIMDs; ++i) {
numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
int c = 0;
for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) {
std::string s = "VectorALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) {
std::string s = "ScalarALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
numCyclesWithNoInstrTypeIssued.subname(c, "VectorMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "VectorMemPipe");
numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}

View File

@@ -35,6 +35,7 @@
#define __EXEC_STAGE_HH__
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -53,8 +54,9 @@ enum STAT_STATUS
enum DISPATCH_STATUS
{
EMPTY = 0,
FILLED
EMPTY = 0, // no wave present in dispatchList slot
EXREADY, // wave ready for execution
SKIP, // extra memory resource needed, Shared Mem. only
};
// Execution stage.
@@ -72,18 +74,21 @@ class ExecStage
void init(ComputeUnit *cu);
void exec();
std::string dispStatusToStr(int j);
void dumpDispList();
std::string name() { return _name; }
void regStats();
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
// number of busy cycles
Stats::Scalar numCyclesWithInstrIssued;
// number of cycles (per execution unit) during which at least one
// instruction was issued to that unit
// number of cycles during which at least one
// instruction was issued to an execution resource type
Stats::Vector numCyclesWithInstrTypeIssued;
// number of idle cycles (per execution unit) during which the unit issued
// no instruction targeting that unit, even though there is at least one
// Wavefront with such an instruction as the oldest
// number of idle cycles during which the scheduler
// issued no instructions targeting a specific
// execution resource type
Stats::Vector numCyclesWithNoInstrTypeIssued;
// SIMDs active per cycle
Stats::Distribution spc;
@@ -92,11 +97,6 @@ class ExecStage
void collectStatistics(enum STAT_STATUS stage, int unitId);
void initStatistics();
ComputeUnit *computeUnit;
uint32_t numSIMDs;
// Number of memory execution resources;
// both global and local memory execution resources in CU
uint32_t numMemUnits;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
@@ -108,18 +108,12 @@ class ExecStage
// dispatchList is used to communicate between schedule
// and exec stage
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
// flag per vector SIMD unit that is set when there is at least one
// WV that has a vector ALU instruction as the oldest in its
// Instruction Buffer
std::vector<bool> *vectorAluInstAvail;
int *glbMemInstAvail;
int *shrMemInstAvail;
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
Stats::Scalar numTransActiveIdle;
Stats::Distribution idleDur;
uint32_t executionResourcesUsed;
int executionResourcesUsed;
uint64_t idle_dur;
std::string _name;
};

View File

@@ -36,18 +36,18 @@
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"
FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
computeUnit(nullptr)
FetchStage::FetchStage(const ComputeUnitParams* p) :
numVectorALUs(p->num_SIMDs), computeUnit(nullptr)
{
for (int j = 0; j < numSIMDs; ++j) {
for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p);
fetchUnit.push_back(newFetchUnit);
_fetchUnit.push_back(newFetchUnit);
}
}
FetchStage::~FetchStage()
{
fetchUnit.clear();
_fetchUnit.clear();
}
void
@@ -56,17 +56,17 @@ FetchStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".FetchStage";
for (int j = 0; j < numSIMDs; ++j) {
fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
fetchUnit[j].init(computeUnit);
for (int j = 0; j < numVectorALUs; ++j) {
_fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
_fetchUnit[j].init(computeUnit);
}
}
void
FetchStage::exec()
{
for (int j = 0; j < numSIMDs; ++j) {
fetchUnit[j].exec();
for (int j = 0; j < numVectorALUs; ++j) {
_fetchUnit[j].exec();
}
}
@@ -83,13 +83,13 @@ FetchStage::processFetchReturn(PacketPtr pkt)
instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
fetchUnit[simdId].processFetchReturn(pkt);
_fetchUnit[simdId].processFetchReturn(pkt);
}
void
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
{
fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
_fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
void

View File

@@ -62,14 +62,15 @@ class FetchStage
std::string name() { return _name; }
void regStats();
Stats::Distribution instFetchInstReturned;
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
uint32_t numSIMDs;
int numVectorALUs;
ComputeUnit *computeUnit;
// List of fetch units. A fetch unit is
// instantiated per SIMD
std::vector<FetchUnit> fetchUnit;
// instantiated per VALU/SIMD
std::vector<FetchUnit> _fetchUnit;
std::string _name;
};

View File

@@ -45,11 +45,9 @@
uint32_t FetchUnit::globalFetchUnitID;
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
timingSim(true),
computeUnit(nullptr),
fetchScheduler(params),
waveList(nullptr)
FetchUnit::FetchUnit(const ComputeUnitParams* params)
: timingSim(true), computeUnit(nullptr), fetchScheduler(params),
waveList(nullptr), fetchDepth(params->fetch_depth)
{
}
@@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu)
timingSim = computeUnit->shader->timingSim;
fetchQueue.clear();
fetchStatusQueue.resize(computeUnit->shader->n_wf);
fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
Wavefront *wf = waveList->at(i);
assert(wf->wfSlotId == i);
fetchStatusQueue[i] = std::make_pair(wf, false);
fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
fetchBuf[i].decoder(&decoder);
}
fetchScheduler.bindList(&fetchQueue);
@@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu)
void
FetchUnit::exec()
{
/**
* now we check if any of the fetch buffers have
* buffered instruction data that can be decoded
* and sent to its wavefront's instruction buffer.
* then we check if any of the fetch buffer entries
* can be released. we only check if we can
* release a buffer
*/
for (auto &fetch_buf : fetchBuf) {
if (!fetch_buf.hasFreeSpace()) {
fetch_buf.checkWaveReleaseBuf();
}
if (fetch_buf.hasFetchDataToProcess()) {
fetch_buf.decodeInsts();
}
}
// re-evaluate waves which are marked as not ready for fetch
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
// Following code assumes 64-bit opertaion and all insts are
@@ -88,9 +108,10 @@ FetchUnit::exec()
// 4 or less instructions and it can not have any branches to
// prevent speculative instruction fetches
if (!fetchStatusQueue[j].second) {
if (curWave->status == Wavefront::S_RUNNING &&
curWave->instructionBuffer.size() <= 4 &&
!curWave->instructionBufferHasBranch() &&
if ((curWave->getStatus() == Wavefront::S_RUNNING ||
curWave->getStatus() == Wavefront::S_WAITCNT) &&
fetchBuf[j].hasFreeSpace() &&
!curWave->stopFetch() &&
!curWave->pendingFetch) {
fetchQueue.push_back(curWave);
fetchStatusQueue[j].second = true;
@@ -111,45 +132,38 @@ FetchUnit::exec()
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
// calculate the virtual address to fetch from the SQC
Addr vaddr = wavefront->pc();
assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());
/**
* the instruction buffer holds one instruction per entry, regardless
* of the underlying instruction's size. the PC, however, addresses
* instrutions on a 32b granularity so we must account for that here.
*/
for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
vaddr +=
wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
}
vaddr = wavefront->basePtr + vaddr;
* calculate the virtual address to fetch from the SQC. the fetch
* buffer holds a configurable number of cache lines. we start
* fetching at the address of the cache line immediately following
* the buffered line(s).
*/
Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();
// this should already be aligned to a cache line
assert(vaddr == makeLineAddress(vaddr,
computeUnit->getCacheLineBits()));
// shouldn't be fetching a line that is already buffered
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
"from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
// Since this is an instruction prefetch, if you're split then just finish
// out the current line.
int block_size = computeUnit->cacheLineSize();
// check for split accesses
Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
int size = block_size;
if (split_addr > vaddr) {
// misaligned access, just grab the rest of the line
size = split_addr - vaddr;
}
// set up virtual request
RequestPtr req = std::make_shared<Request>(
vaddr, size, Request::INST_FETCH,
vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
computeUnit->masterId(), 0, 0, nullptr);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
// This fetchBlock is kind of faux right now - because the translations so
// far don't actually return Data
uint64_t fetchBlock;
pkt->dataStatic(&fetchBlock);
if (timingSim) {
// SenderState needed on Return
@@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
// this is necessary because the GPU TLB receives packets instead of
// requests. when the translation is complete, all relevent fields in the
// request will be populated, but not in the packet. here we create the
// new packet so we can set the size, addr, and proper flags.
/**
* this is necessary because the GPU TLB receives packets instead of
* requests. when the translation is complete, all relevent fields in
* the request will be populated, but not in the packet. here we create
* the new packet so we can set the size, addr, and proper flags.
*/
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
delete oldPkt;
TheGpuISA::RawMachInst *data =
new TheGpuISA::RawMachInst[pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst)];
pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
/**
* we should have reserved an entry in the fetch buffer
* for this cache line. here we get the pointer to the
* entry used to buffer this request's line data.
*/
pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
.reservedBuf(pkt->req->getVaddr()));
// New SenderState for the memory access
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
@@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
"%d bytes, %d instructions!\n", computeUnit->cu_id,
wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
pkt->req->getSize(), pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst));
"%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
if (wavefront->dropFetch) {
assert(wavefront->instructionBuffer.empty());
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
TheGpuISA::RawMachInst *inst_index_ptr =
(TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
assert(wavefront->instructionBuffer.size() <= 4);
for (int i = 0; i < pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst); ++i) {
GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
assert(inst_ptr);
if (inst_ptr->instSize() == 8) {
/**
* this instruction occupies 2 consecutive
* entries in the instruction array, the
* second of which contains a nullptr. so if
* this inst is 8 bytes we advance two entries
* instead of 1
*/
++i;
}
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, inst_ptr->disassemble());
GPUDynInstPtr gpuDynInst =
std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
computeUnit->getAndIncSeqNum());
wavefront->instructionBuffer.push_back(gpuDynInst);
}
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
}
wavefront->pendingFetch = false;
@@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
delete pkt;
}
void
FetchUnit::flushBuf(int wfSlotId)
{
    // discard all buffered/in-flight fetch data for this wave slot
    auto &buf_desc = fetchBuf.at(wfSlotId);
    buf_desc.flushBuf();
}
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    // remember the list of waves dispatched on this SIMD unit so the
    // fetch unit can look up wavefront state by slot ID
    waveList = wave_list;
}
/** FetchBufDesc */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    /**
     * Set up this fetch buffer: record the owning WF, the number of
     * cache lines that may be buffered (the fetch depth), and the line
     * size, then allocate the raw backing storage and seed the free
     * list with one line-aligned entry per buffered line.
     */
    wavefront = wf;
    fetchDepth = fetch_depth;
    maxIbSize = wavefront->maxIbSize;
    cacheLineSize = cache_line_size;
    maxFbSize = cacheLineSize * fetchDepth;

    // the line-address math below requires a power-of-two line size
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    readPtr = bufStart;
    bufEnd = bufStart + maxFbSize;

    for (uint8_t *line = bufStart; line != bufEnd; line += cacheLineSize) {
        freeList.push_back(line);
    }
}
void
FetchUnit::FetchBufDesc::flushBuf()
{
restartFromBranch = true;
/**
* free list may have some entries
* so we clear it here to avoid duplicates
*/
freeList.clear();
bufferedPCs.clear();
reservedPCs.clear();
readPtr = bufStart;
for (int i = 0; i < fetchDepth; ++i) {
freeList.push_back(bufStart + i * cacheLineSize);
}
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
"buffer\n", wavefront->simdId, wavefront->wfSlotId,
wavefront->wfDynId);
}
Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    /**
     * Compute the line-aligned vaddr of the next cache line to fetch.
     * If any lines are already buffered or reserved we fetch the line
     * sequentially after the newest one; otherwise we (re)start from
     * the WF's current PC, adjusting readPtr for a mid-line branch
     * target.
     */
    Addr next_line = 0;

    if (bufferedAndReservedLines()) {
        Addr last_line_fetched = 0;
        if (!reservedLines()) {
            /**
             * get the PC of the most recently fetched cache line,
             * then return the address of the next line. the maps are
             * address-sorted and fetch is sequential, so rbegin() is
             * the newest line.
             */
            last_line_fetched = bufferedPCs.rbegin()->first;
        } else {
            last_line_fetched = reservedPCs.rbegin()->first;
        }

        next_line = last_line_fetched + cacheLineSize;

        /**
         * should not be trying to fetch a line that has already
         * been fetched.
         */
        assert(bufferedPCs.find(next_line) == bufferedPCs.end());
        assert(reservedPCs.find(next_line) == reservedPCs.end());
    } else {
        /**
         * we do not have any buffered cache lines yet, so we
         * assume this is the initial fetch, or the first fetch
         * after a branch, and get the PC directly from the WF.
         * in the case of a branch, we may not start at the
         * beginning of a cache line, so we adjust the readPtr by
         * the current PC's offset from the start of the line.
         */
        next_line = makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * if we are here we have no buffered lines. in the case we flushed
         * the buffer due to a branch, we may need to start fetching from
         * some offset from the start of the fetch buffer, so we adjust for
         * that here.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset
                = wavefront->pc() - makeLineAddress(wavefront->pc(),
                                                    cacheLineBits);
            readPtr += byte_offset;
        }
    }

    return next_line;
}
void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    /**
     * Claim a free fetch-buffer entry for the line at vaddr. The entry
     * is moved off the free list and tracked in reservedPCs; it does
     * not become a valid buffered line until the memory system's
     * response arrives and fetchDone() is called for this vaddr.
     */
    assert(hasFreeSpace());
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());
    assert(bufferedAndReservedLines() < fetchDepth);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    uint8_t *line_buf = freeList.front();
    freeList.pop_front();
    reservedPCs.emplace(vaddr, line_buf);
}
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    // promote the entry for this line from reserved to buffered, now
    // that its fetch data has returned from the memory system
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * this address should have an entry reserved in the
     * fetch buffer already, however it should be invalid
     * until the fetch completes.
     */
    auto reserved_pc = reservedPCs.find(vaddr);
    assert(reserved_pc != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_pc->second);

    // the fetch buffer is circular; wrap the read pointer back to the
    // start if it has run off the end
    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_pc->second = nullptr;
    reservedPCs.erase(reserved_pc);
}
bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
    // decodable data exists only if at least one full raw instruction's
    // worth of undecoded bytes remains in the buffer
    return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
}
void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    /**
     * Release the oldest buffered cache line back to the free list once
     * the WF's PC has advanced past it. If the line for the current PC
     * is still in flight (reserved), there is nothing to release yet.
     */
    Addr cur_wave_pc = roundDown(wavefront->pc(),
        wavefront->computeUnit->cacheLineSize());

    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
            "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
        "(PC = %#x) can be released.\n", wavefront->simdId,
        wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
        wavefront->pc());

#ifdef DEBUG
    // dump the full set of buffered line addresses for debugging
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
            "removing it from the fetch buffer.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId,
            oldest_buffered_pc->first);

        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            bufferedLines());
    }
}
void
FetchUnit::FetchBufDesc::decodeInsts()
{
    /**
     * Drain decodable bytes from the fetch buffer into the WF's
     * instruction buffer (IB), creating one GPUDynInst per decoded
     * instruction, until the IB is full or no complete raw instruction
     * remains. Instructions that straddle the circular buffer boundary
     * are handled by decodeSplitInst().
     */
    assert(readPtr);

    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            // decode directly out of the buffer; the decoder reports
            // the instruction's size, which is how far we advance
            TheGpuISA::MachInst mach_inst
                = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            readPtr += gpu_static_inst->instSize();

            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst
                = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                               wavefront, gpu_static_inst,
                                               wavefront->computeUnit->
                                                   getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                    "%d bytes remain.\n", wavefront->simdId,
                    wavefront->wfSlotId, wavefront->wfDynId,
                    gpu_static_inst->disassemble(),
                    gpu_static_inst->instSize(),
                    fetchBytesRemaining());
        }
    }
}
void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    /**
     * Decode a raw instruction that straddles the end/beginning of the
     * circular fetch buffer: its leading dword(s) sit at the end of the
     * buffer and the remainder wraps to the start. We assemble the raw
     * inst dword-by-dword into a temporary before decoding it.
     *
     * NOTE(review): the loop below does not advance readPtr between
     * dwords except when wrapping, which is only correct if at most
     * one dword precedes the wrap point -- confirm for raw inst sizes
     * larger than two dwords.
     */
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    for (int i = 0; i < num_dwords; ++i) {
        ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
        // wrap to the start of the buffer once a dword read reaches
        // the buffer's end
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    // a split inst by definition wraps, so the read pointer must now
    // be at the start of the buffer
    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst
        = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    // advance past the portion of the inst that wrapped to the start;
    // the dword consumed before the wrap is subtracted out here
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst
        = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                       wavefront, gpu_static_inst,
                                       wavefront->computeUnit->
                                           getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
            "(%d bytes). %d bytes remain in %d buffered lines.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            gpu_static_inst->disassemble(), split_inst,
            gpu_static_inst->instSize(), fetchBytesRemaining(),
            bufferedLines());
}
bool
FetchUnit::FetchBufDesc::splitDecode() const
{
    /**
     * A split decode is required when reading one full raw instruction
     * from the current read pointer would run past the end of the
     * fetch buffer (i.e., the inst wraps around the circular buffer).
     */
    return (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
}
int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    /**
     * Number of valid, not-yet-decoded bytes between the read pointer
     * and the end of the newest (highest-vaddr) buffered line,
     * accounting for wrap-around in the circular buffer.
     */
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            // no wrap: contiguous span from readPtr to the end of the
            // newest buffered line
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            // wrapped: byte_diff is negative here, so this subtracts
            // the already-consumed span from the total buffered bytes
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());

    return bytes_remaining;
}

View File

@@ -36,7 +36,6 @@
#include <string>
#include <utility>
#include <vector>
#include "arch/gpu_decoder.hh"
#include "base/statistics.hh"
@@ -58,9 +57,170 @@ class FetchUnit
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void processFetchReturn(PacketPtr pkt);
void flushBuf(int wfSlotId);
static uint32_t globalFetchUnitID;
private:
/**
* fetch buffer descriptor. holds buffered
* instruction data in the fetch unit.
*/
class FetchBufDesc
{
  public:
    // note: cacheLineBits was previously left uninitialized until
    // allocateBuf(); it is now zero-initialized with the rest
    FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr),
        readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0),
        cacheLineSize(0), cacheLineBits(0), restartFromBranch(false),
        wavefront(nullptr), _decoder(nullptr)
    {
    }

    ~FetchBufDesc()
    {
        delete[] bufStart;
    }

    /**
     * allocate the fetch buffer space, and set the fetch depth
     * (number of lines that may be buffered), fetch size
     * (cache line size), and parent WF for this fetch buffer.
     */
    void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf);

    // total lines currently tracked: valid (buffered) plus in-flight
    // (reserved)
    int
    bufferedAndReservedLines() const
    {
        return bufferedLines() + reservedLines();
    }

    int bufferedLines() const { return bufferedPCs.size(); }
    int bufferedBytes() const { return bufferedLines() * cacheLineSize; }
    int reservedLines() const { return reservedPCs.size(); }
    bool hasFreeSpace() const { return !freeList.empty(); }
    void flushBuf();
    Addr nextFetchAddr();

    /**
     * reserve an entry in the fetch buffer for PC = vaddr,
     */
    void reserveBuf(Addr vaddr);

    /**
     * return a pointer to the raw fetch buffer data.
     * this allows the fetch pkt to use this data directly
     * to avoid unnecessary memcpy and malloc/new.
     */
    uint8_t*
    reservedBuf(Addr vaddr) const
    {
        auto reserved_pc = reservedPCs.find(vaddr);
        assert(reserved_pc != reservedPCs.end());
        assert(reserved_pc == reservedPCs.begin());

        return reserved_pc->second;
    }

    void fetchDone(Addr vaddr);

    /**
     * checks if the buffer contains valid data. this essentially
     * tells fetch when there is data remaining that needs to be
     * decoded into the WF's IB.
     */
    bool hasFetchDataToProcess() const;

    /**
     * each time the fetch stage is ticked, we check if there
     * are any data in the fetch buffer that may be decoded and
     * sent to the IB. because we are modeling the fetch buffer
     * as a circular buffer, it is possible that an instruction
     * can straddle the end/beginning of the fetch buffer, so
     * decodeSplitInsts() handles that case.
     */
    void decodeInsts();

    /**
     * checks if the wavefront can release any of its fetch
     * buffer entries. this will occur when the WF's PC goes
     * beyond any of the currently buffered cache lines.
     */
    void checkWaveReleaseBuf();

    void
    decoder(TheGpuISA::Decoder *dec)
    {
        _decoder = dec;
    }

    /**
     * returns true if this buffer has an entry for the line at pc,
     * whether already valid (buffered) or still awaiting its fetch
     * data (reserved).
     */
    bool
    pcBuffered(Addr pc) const
    {
        /**
         * a PC is tracked in bufferedPCs or reservedPCs, but never
         * both (see reserveBuf()/fetchDone()), so membership must be
         * checked with OR; checking with AND would always be false.
         */
        bool buffered = bufferedPCs.find(pc) != bufferedPCs.end()
                        || reservedPCs.find(pc) != reservedPCs.end();

        return buffered;
    }

    /**
     * calculates the number of fetched bytes that have yet
     * to be decoded.
     */
    int fetchBytesRemaining() const;

  private:
    void decodeSplitInst();

    /**
     * check if the next instruction to be processed out of
     * the fetch buffer is split across the end/beginning of
     * the fetch buffer.
     */
    bool splitDecode() const;

    /**
     * the set of PCs (fetch addresses) that are currently
     * buffered. bufferedPCs are valid, reservedPCs are
     * waiting for their buffers to be filled with valid
     * fetch data.
     */
    std::map<Addr, uint8_t*> bufferedPCs;
    std::map<Addr, uint8_t*> reservedPCs;

    /**
     * represents the fetch buffer free list. holds buffer space
     * that is currently free. each pointer in this array must
     * have enough space to hold a cache line. in reality we
     * have one actual fetch buffer: 'bufStart', these pointers
     * point to addresses within bufStart that are aligned to the
     * cache line size.
     */
    std::deque<uint8_t*> freeList;

    /**
     * raw instruction buffer. holds cache line data associated with
     * the set of PCs (fetch addresses) that are buffered here.
     */
    uint8_t *bufStart;
    uint8_t *bufEnd;

    /**
     * pointer that points to the next chunk of inst data to be
     * decoded.
     */
    uint8_t *readPtr;

    // how many lines the fetch unit may buffer
    int fetchDepth;
    // maximum size (in number of insts) of the WF's IB
    int maxIbSize;
    // maximum size (in bytes) of this fetch buffer
    int maxFbSize;
    int cacheLineSize;
    // number of low-order address bits covered by one cache line
    int cacheLineBits;
    bool restartFromBranch;
    // wavefront whose IB is serviced by this fetch buffer
    Wavefront *wavefront;
    TheGpuISA::Decoder *_decoder;
};
bool timingSim;
ComputeUnit *computeUnit;
TheGpuISA::Decoder decoder;
@@ -82,6 +242,15 @@ class FetchUnit
// Pointer to list of waves dispatched on to this SIMD unit
std::vector<Wavefront*> *waveList;
// holds the fetch buffers. each wave has 1 entry.
std::vector<FetchBufDesc> fetchBuf;
/**
* number of cache lines we can fetch and buffer.
* this includes the currently fetched line (i.e., the
* line that corresponds to the WF's current PC), as
* well as any lines that may be prefetched.
*/
int fetchDepth;
};
#endif // __FETCH_UNIT_HH__

View File

@@ -31,12 +31,13 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/global_memory_pipeline.hh"
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
@@ -44,7 +45,7 @@
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
maxWaveRequests(p->max_wave_requests), inflightStores(0),
inflightLoads(0)
{
}
@@ -76,6 +77,31 @@ GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
return true;
}
void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // a single token from the coalescer's uncoalesced table is needed
    // before this request may proceed
    const int num_tokens = 1;
    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", num_tokens);
    auto *token_mgr = mp->computeUnit()->getTokenManager();
    assert(token_mgr->haveTokens(num_tokens));
    token_mgr->acquireTokens(num_tokens);
}
bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // a wavefront may not exceed maxWaveRequests outstanding vector
    // memory requests; reads and writes count against the same limit
    auto *wf = mp->wavefront();
    int wf_outstanding = wf->outstandingReqsRdGm + wf->outstandingReqsWrGm;

    return wf_outstanding < maxWaveRequests;
}
void
GlobalMemPipeline::exec()
{
@@ -87,42 +113,60 @@ GlobalMemPipeline::exec()
// check the VRF to see if the operands of a load (or load component
// of an atomic) are accessible
if ((m) && (m->isLoad() || m->isAtomicRet())) {
if (m && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
accessVrf =
w->computeUnit->vrf[w->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
accessVrf = w->computeUnit->vrf[w->simdId]->
canScheduleWriteOperandsFromLoad(w, m);
}
if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
accessVrf && m->statusBitVector == VectorMask(0) &&
(computeUnit->shader->coissue_return ||
computeUnit->wfWait.at(m->pipeId).rdy())) {
accessVrf && (computeUnit->shader->coissue_return ||
computeUnit->vectorGlobalMemUnit.rdy())) {
w = m->wavefront();
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
if (m->isLoad() || m->isAtomicRet()) {
w->computeUnit->vrf[w->simdId]->
scheduleWriteOperandsFromLoad(w, m);
}
completeRequest(m);
// Decrement outstanding register count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
Tick accessTime = curTick() - m->getAccessTime();
if (m->isStore() || m->isAtomic()) {
// Decrement outstanding requests count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
if (m->isStore() || m->isAtomic() || m->isMemSync()) {
computeUnit->shader->sampleStore(accessTime);
computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
m->time, -1);
}
if (m->isLoad() || m->isAtomic()) {
if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
computeUnit->shader->sampleLoad(accessTime);
computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
m->time, -1);
}
w->validateRequestCounters();
// Generate stats for round-trip time for vectory memory insts
// going all the way to memory and stats for individual cache
// blocks generated by the instruction.
m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
// Mark write bus busy for appropriate amount of time
computeUnit->glbMemToVrfBus.set(m->time);
if (!computeUnit->shader->coissue_return)
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
w->computeUnit->vectorGlobalMemUnit.set(m->time);
}
// If pipeline has executed a global memory instruction
@@ -148,13 +192,13 @@ GlobalMemPipeline::exec()
mp->disassemble(), mp->seqNum());
// Memfences will not return tokens and must be issued so we should
// not request one as this will deplete the token count until deadlock
if (!mp->isMemFence()) {
if (!mp->isMemSync()) {
assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
mp->computeUnit()->getTokenManager()->acquireTokens(1);
}
mp->initiateAcc(mp);
if (!outOfOrderDataDelivery && !mp->isMemFence()) {
if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
/**
* if we are not in out-of-order data delivery mode
* then we keep the responses sorted in program order.
@@ -178,19 +222,11 @@ GlobalMemPipeline::exec()
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
if (outOfOrderDataDelivery) {
if (!gmReturnedLoads.empty()) {
return gmReturnedLoads.front();
} else if (!gmReturnedStores.empty()) {
return gmReturnedStores.front();
}
} else {
if (!gmOrderedRespBuffer.empty()) {
auto mem_req = gmOrderedRespBuffer.begin();
if (!gmOrderedRespBuffer.empty()) {
auto mem_req = gmOrderedRespBuffer.begin();
if (mem_req->second.second) {
return mem_req->second.first;
}
if (mem_req->second.second) {
return mem_req->second.first;
}
}
@@ -208,51 +244,33 @@ GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
--inflightStores;
}
if (outOfOrderDataDelivery) {
if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
assert(!gmReturnedLoads.empty());
gmReturnedLoads.pop();
} else if (gpuDynInst->isStore()) {
assert(!gmReturnedStores.empty());
gmReturnedStores.pop();
}
} else {
// we should only pop the oldest requst, and it
// should be marked as done if we are here
assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
assert(gmOrderedRespBuffer.begin()->second.second);
// remove this instruction from the buffer by its
// unique seq ID
gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
// we should only pop the oldest requst, and it
// should be marked as done if we are here
assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
assert(gmOrderedRespBuffer.begin()->second.second);
// remove this instruction from the buffer by its
// unique seq ID
gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    // timestamp the request for round-trip profiling, then enqueue it
    // for issue to the memory system
    auto issue_tick = curTick();
    gpuDynInst->setAccessTime(issue_tick);
    gpuDynInst->profileRoundTripTime(issue_tick, InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
if (outOfOrderDataDelivery) {
if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
assert(isGMLdRespFIFOWrRdy());
gmReturnedLoads.push(gpuDynInst);
} else {
assert(isGMStRespFIFOWrRdy());
gmReturnedStores.push(gpuDynInst);
}
} else {
auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
// if we are getting a response for this mem request,
// then it ought to already be in the ordered response
// buffer
assert(mem_req != gmOrderedRespBuffer.end());
mem_req->second.second = true;
}
auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
// if we are getting a response for this mem request,
// then it ought to already be in the ordered response
// buffer
assert(mem_req != gmOrderedRespBuffer.end());
mem_req->second.second = true;
}
void

View File

@@ -60,52 +60,34 @@ class GlobalMemPipeline
void init(ComputeUnit *cu);
void exec();
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
/**
* find the next ready response to service. for OoO mode we
* simply pop the oldest (based on when the response was
* received) response in the response FIFOs. for in-order mode
* we pop the oldest (in program order) response, and only if
* it is marked as done.
* Find the next ready response to service. In order to ensure
* that no waitcnts are violated, we pop the oldest (in program order)
* response, and only if it is marked as done. This is because waitcnt
* values expect memory operations to complete and decrement their
* counter values in program order.
*/
GPUDynInstPtr getNextReadyResp();
/**
* once a memory request is finished we remove it from the
* buffer. this method determines which response buffer
* we're using based on the mode (in-order vs. OoO).
* buffer.
*/
void completeRequest(GPUDynInstPtr gpuDynInst);
/**
* issues a request to the pipeline - i.e., enqueue it
* in the request buffer.
* Issues a request to the pipeline (i.e., enqueue it
* in the request buffer).
*/
void issueRequest(GPUDynInstPtr gpuDynInst);
/**
* this method handles responses sent to this GM pipeline by the
* CU. in the case of in-order delivery it simply marks the reqeust
* as done in the ordered buffer to indicate that the requst is
* finished. for out-of-order data delivery, the requests are enqueued
* (in the order in which they are received) in the response FIFOs.
* This method handles responses sent to this GM pipeline by the
* CU. Simply marks the request as done in the ordered buffer to
* indicate that the request is finished.
*/
void handleResponse(GPUDynInstPtr gpuDynInst);
bool
isGMLdRespFIFOWrRdy() const
{
return gmReturnedLoads.size() < gmQueueSize;
}
bool
isGMStRespFIFOWrRdy() const
{
return gmReturnedStores.size() < gmQueueSize;
}
bool
isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
{
@@ -114,7 +96,6 @@ class GlobalMemPipeline
const std::string &name() const { return _name; }
void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
@@ -122,12 +103,15 @@ class GlobalMemPipeline
}
bool coalescerReady(GPUDynInstPtr mp) const;
bool outstandingReqsCheck(GPUDynInstPtr mp) const;
void acqCoalescerToken(GPUDynInstPtr mp);
private:
ComputeUnit *computeUnit;
std::string _name;
int gmQueueSize;
bool outOfOrderDataDelivery;
int maxWaveRequests;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
@@ -143,12 +127,11 @@ class GlobalMemPipeline
int globalMemSize;
/*
* this buffer holds the memory responses when in-order data
* deilvery is used - the responses are ordered by their unique
* sequence number, which is monotonically increasing. when a
* memory request returns its "done" flag is set to true. during
* each tick the the GM pipeline will check if the oldest request
* is finished, and if so it will be removed from the queue.
* This buffer holds the memory responses in order data - the responses
* are ordered by their unique sequence number, which is monotonically
* increasing. When a memory request returns its "done" flag is set to
* true. During each tick the GM pipeline will check if the oldest
* request is finished, and if so it will be removed from the queue.
*
* key: memory instruction's sequence ID
*
@@ -161,14 +144,6 @@ class GlobalMemPipeline
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
// Globa Store Response FIFO: all responses of global memory
// stores are sent to this FIFO from TCP
std::queue<GPUDynInstPtr> gmReturnedStores;
// Global Load Response FIFO: all responses of global memory
// loads are sent to this FIFO from TCP
std::queue<GPUDynInstPtr> gmReturnedLoads;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__

View File

@@ -0,0 +1,215 @@
/*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
#include "gpu-compute/gpu_command_processor.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "gpu-compute/dispatcher.hh"
#include "params/GPUCommandProcessor.hh"
GPUCommandProcessor::GPUCommandProcessor(const Params *p)
    : HSADevice(p), dispatcher(*p->dispatcher)
{
    // register this CP with its dispatcher so the dispatcher can call
    // back into the CP (see dispatchPkt(), which sends tasks the other
    // way)
    dispatcher.setCommandProcessor(this);
}
/**
* submitDispatchPkt() is the entry point into the CP from the HSAPP
* and is only meant to be used with AQL kernel dispatch packets.
* After the HSAPP receives and extracts an AQL packet, it sends
* it to the CP, which is responsible for gathering all relevant
* information about a task, initializing CU state, and sending
* it to the dispatcher for WG creation and dispatch.
*
* First we need capture all information from the the AQL pkt and
* the code object, then store it in an HSAQueueEntry. Once the
* packet and code are extracted, we extract information from the
* queue descriptor that the CP needs to perform state initialization
* on the CU. Finally we call dispatch() to send the task to the
* dispatcher. When the task completely finishes, we call finishPkt()
* on the HSA packet processor in order to remove the packet from the
* queue, and notify the runtime that the task has completed.
*/
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    // monotonically increasing ID assigned to each dispatched task
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    /**
     * we need to read a pointer in the application's address
     * space to pull out the kernel code descriptor.
     */
    auto *tc = sys->threads[0];
    auto &virt_proxy = tc->getVirtProxy();

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary, and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field,
     * which provides the byte offset (positive or negative) from the
     * address of the amd_kernel_code_t to the start of the machine
     * instructions.
     */
    AMDKernelCode akc;
    virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
        sizeof(AMDKernelCode));

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
        "kernel object\n", akc.kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc.kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
        machine_code_addr);

    Addr kern_name_addr(0);
    /**
     * NOTE(review): the 0x10 and 0x8 constants below appear to walk
     * the runtime loader's kernel symbol record to find the pointer
     * to the kernel's name string -- confirm against the loader's
     * symbol layout for the supported ROCm runtime version.
     */
    virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
        (uint8_t*)&kern_name_addr, 0x8);

    std::string kernel_name;
    virt_proxy.readString(kernel_name, kern_name_addr);

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
        disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
        disp_pkt->grid_size_x, disp_pkt->grid_size_y,
        disp_pkt->grid_size_z, disp_pkt->kernarg_address,
        disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
        "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
        "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
        task->numScalarRegs(), task->codeAddr(), 0, 0);

    // begin gathering the remaining ABI state for this task; per the
    // comment above, dispatch happens once that setup is complete
    initABI(task);

    ++dynamic_task_id;
}
/**
* submitVendorPkt() is for accepting vendor-specific packets from
* the HSAPP. Vendor-specific packets may be used by the runtime to
* send commands to the HSA device that are specific to a particular
* vendor. The vendor-specific packets should be defined by the vendor
* in the runtime.
*/
/**
* TODO: For now we simply tell the HSAPP to finish the packet,
* however a future patch will update this method to provide
* the proper handling of any required vendor-specific packets.
* In the version of ROCm that is currently supported (1.6)
* the runtime will send packets that direct the CP to
* invalidate the GPUs caches. We do this automatically on
* each kernel launch in the CU, so this is safe for now.
*/
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    // vendor-specific packets are not interpreted yet (see the TODO
    // above); simply retire the packet so the queue can make progress.
    // host_pkt_addr is unused here.
    hsaPP->finishPkt(raw_pkt, queue_id);
}
/**
* Once the CP has finished extracting all relevant information about
* a task and has initialized the ABI state, we send a description of
* the task to the dispatcher. The dispatcher will create and dispatch
* WGs to the CUs.
*/
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    // hand the fully-initialized task off to the dispatcher for WG
    // creation and dispatch to the CUs
    dispatcher.dispatch(task);
}
/**
* The CP is responsible for traversing all HSA-ABI-related data
* structures from memory and initializing the ABI state.
* Information provided by the MQD, AQL packet, and code object
* metadata will be used to initialize register file state.
*/
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    // begin ABI state initialization by DMAing the dispatch ID offset
    // from host memory; the DMA completion event carries the setup
    // chain forward
    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    // NOTE(review): the value is read one pointer-size past the host
    // read index pointer in the queue descriptor's memory -- confirm
    // this offset against the HSA queue scratch layout
    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
        &readDispIdOffEvent->readDispIdOffset);
}
System*
GPUCommandProcessor::system()
{
    // Accessor for the System this CP belongs to.
    return sys;
}
AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    // The CP exposes no memory-mapped register ranges of its own.
    return AddrRangeList();
}
void
GPUCommandProcessor::setShader(Shader *shader)
{
    // Bind the shader (CU array) this CP will dispatch kernels to.
    _shader = shader;
}
Shader*
GPUCommandProcessor::shader()
{
    // Accessor for the shader bound via setShader().
    return _shader;
}
// Standard gem5 SimObject factory method for GPUCommandProcessor.
GPUCommandProcessor*
GPUCommandProcessorParams::create()
{
    return new GPUCommandProcessor(this);
}

View File

@@ -0,0 +1,165 @@
/*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
/**
* @file
* The GPUCommandProcessor (CP) is responsible for accepting commands, in
* the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP
* works with several components, including the HSAPP and the dispatcher.
* When the HSAPP sends a ready task to the CP, it will perform the necessary
* operations to extract relevant data structures from memory, such as the
* AQL queue descriptor and AQL packet, and initializes register state for the
* task's wavefronts.
*/
#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
#include "dev/hsa/hsa_device.hh"
#include "gpu-compute/hsa_queue_entry.hh"
struct GPUCommandProcessorParams;
class GPUDispatcher;
class Shader;
class GPUCommandProcessor : public HSADevice
{
  public:
    typedef GPUCommandProcessorParams Params;

    GPUCommandProcessor() = delete;
    GPUCommandProcessor(const Params *p);

    /** Bind/query the shader (CU array) this CP dispatches work to. */
    void setShader(Shader *shader);
    Shader* shader();

    /** Accept a kernel dispatch AQL packet from the HSA packet processor. */
    void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                           Addr host_pkt_addr) override;
    /** Accept a vendor-specific AQL packet (currently finished as a no-op). */
    void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                         Addr host_pkt_addr) override;
    /** Hand a fully initialized task off to the dispatcher. */
    void dispatchPkt(HSAQueueEntry *task);

    // The CP exposes no MMIO registers, so reads/writes are no-ops and
    // it claims no address ranges.
    Tick write(PacketPtr pkt) override { return 0; }
    Tick read(PacketPtr pkt) override { return 0; }
    AddrRangeList getAddrRanges() const override;
    System *system();

  private:
    Shader *_shader;           // CU array that executes dispatched kernels
    GPUDispatcher &dispatcher; // creates and dispatches WGs for ready tasks

    /** Walk the HSA-ABI data structures and initialize ABI state. */
    void initABI(HSAQueueEntry *task);

    /**
     * Perform a DMA read of the read_dispatch_id_field_base_byte_offset
     * field, which follows directly after the read_dispatch_id (the read
     * pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor
     * (MQD)), to find the base address of the MQD. The MQD is the runtime's
     * soft representation of a HW queue descriptor (HQD).
     *
     * Any fields below the read dispatch ID in the amd_hsa_queue_t should
     * not change according to the HSA standard, therefore we should be able
     * to get them based on their known relative position to the read dispatch
     * ID.
     */
    class ReadDispIdOffsetDmaEvent : public DmaCallback
    {
      public:
        ReadDispIdOffsetDmaEvent(GPUCommandProcessor &gpu_cmd_proc,
                                 HSAQueueEntry *task)
            : DmaCallback(), readDispIdOffset(0), gpuCmdProc(gpu_cmd_proc),
              _task(task)
        {
        }

        void
        process() override
        {
            /**
             * Now that the read pointer's offset from the base of
             * the MQD is known, we can use that to calculate the
             * the address of the MQD itself, the dispatcher will
             * DMA that into the HSAQueueEntry when a kernel is
             * launched.
             */
            _task->hostAMDQueueAddr
                = gpuCmdProc.hsaPP->getQueueDesc(_task->queueId())
                    ->hostReadIndexPtr - readDispIdOffset;

            /**
             * DMA a copy of the MQD into the task. Some fields of
             * the MQD will be used to initialize register state.
             */
            auto *mqdDmaEvent = new MQDDmaEvent(gpuCmdProc, _task);
            gpuCmdProc.dmaReadVirt(_task->hostAMDQueueAddr,
                                   sizeof(_amd_queue_t), mqdDmaEvent,
                                   &_task->amdQueue);
        }

        // DMA destination buffer; valid once process() runs.
        uint32_t readDispIdOffset;

      private:
        GPUCommandProcessor &gpuCmdProc;
        HSAQueueEntry *_task;
    };

    /**
     * Perform a DMA read of the MQD that corresponds to a hardware
     * queue descriptor (HQD). We store a copy of the MQD in the
     * HSAQueueEntry object so we can send a copy of it along with
     * a dispatch packet, which is needed to initialize register
     * state.
     */
    class MQDDmaEvent : public DmaCallback
    {
      public:
        MQDDmaEvent(GPUCommandProcessor &gpu_cmd_proc, HSAQueueEntry *task)
            : DmaCallback(), gpuCmdProc(gpu_cmd_proc), _task(task)
        {
        }

        void
        process() override
        {
            // MQD is now resident in the task; dispatch it.
            gpuCmdProc.dispatchPkt(_task);
        }

      private:
        GPUCommandProcessor &gpuCmdProc;
        HSAQueueEntry *_task;
    };
};
#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__

View File

@@ -0,0 +1,417 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
* Anthony Gutierrez
*/
#include "gpu-compute/gpu_compute_driver.hh"
#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "dev/hsa/hsa_device.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_ioctl.h"
#include "params/GPUComputeDriver.hh"
#include "sim/syscall_emul_buf.hh"
// Construct the emulated KFD (Kernel Fusion Driver) device for the GPU
// agent; all interesting work happens in ioctl().
GPUComputeDriver::GPUComputeDriver(Params *p)
    : HSADriver(p)
{
    DPRINTF(GPUDriver, "Constructing KFD: device\n");
}
/**
 * Emulated KFD ioctl entry point. Decodes the ioctl request number and
 * services it against the simulated GPU device. Requests the supported
 * ROCm runtime does not strictly need are stubbed out with warnings.
 *
 * @param tc thread context of the calling (CPU) thread
 * @param req ioctl request number (AMDKFD_IOC_*)
 * @param ioc_buf guest virtual address of the ioctl argument buffer
 * @return 0; every serviced request reports success
 */
int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
    auto &virt_proxy = tc->getVirtProxy();

    switch (req) {
        case AMDKFD_IOC_GET_VERSION:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

            TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
            args->major_version = 1;
            args->minor_version = 0;

            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_CREATE_QUEUE:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
          }
          break;
        case AMDKFD_IOC_DESTROY_QUEUE:
          {
            TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
                    "queue offset %d\n", args->queue_id);
            device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id);
          }
          break;
        case AMDKFD_IOC_SET_MEMORY_POLICY:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
          }
          break;
        case AMDKFD_IOC_GET_CLOCK_COUNTERS:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

            TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            // Set nanosecond resolution
            args->system_clock_freq = 1000000000;

            /**
             * Derive all clock counters based on the tick. All
             * device clocks are identical and perfectly in sync.
             */
            uint64_t elapsed_nsec = curTick() / SimClock::Int::ns;
            args->gpu_clock_counter = elapsed_nsec;
            args->cpu_clock_counter = elapsed_nsec;
            args->system_clock_counter = elapsed_nsec;

            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_GET_PROCESS_APERTURES:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_args> args(ioc_buf);
            args->num_of_nodes = 1;

            /**
             * Set the GPUVM/LDS/Scratch APEs exactly as they
             * are in the real driver, see the KFD driver
             * in the ROCm Linux kernel source:
             * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
             */
            for (int i = 0; i < args->num_of_nodes; ++i) {
                /**
                 * While the GPU node numbers start at 0, we add 1
                 * to force the count to start at 1. This is to
                 * ensure that the base/limit addresses are
                 * calculated correctly.
                 */
                args->process_apertures[i].scratch_base
                    = scratchApeBase(i + 1);
                args->process_apertures[i].scratch_limit =
                    scratchApeLimit(args->process_apertures[i].scratch_base);

                args->process_apertures[i].lds_base = ldsApeBase(i + 1);
                args->process_apertures[i].lds_limit =
                    ldsApeLimit(args->process_apertures[i].lds_base);

                args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1);
                args->process_apertures[i].gpuvm_limit =
                    gpuVmApeLimit(args->process_apertures[i].gpuvm_base);

                // NOTE: Must match ID populated by hsaTopology.py
                args->process_apertures[i].gpu_id = 2765;

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
                DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_limit);

                DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_base);
                DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_limit);

                DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_base);
                DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_limit);

                /**
                 * The CPU's 64b address space can only use the
                 * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0,
                 * therefore we must ensure that the apertures do not
                 * fall in the CPU's address space.
                 */
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                       47) != 0);
            }

            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_UPDATE_QUEUE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
          }
          break;
        case AMDKFD_IOC_CREATE_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n");
          }
          break;
        case AMDKFD_IOC_DESTROY_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
          }
          break;
        case AMDKFD_IOC_SET_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n");
          }
          break;
        case AMDKFD_IOC_RESET_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
          }
          break;
        case AMDKFD_IOC_WAIT_EVENTS:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
          }
          break;
        case AMDKFD_IOC_DBG_REGISTER:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
          }
          break;
        case AMDKFD_IOC_DBG_UNREGISTER:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
          }
          break;
        case AMDKFD_IOC_DBG_ADDRESS_WATCH:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
          }
          break;
        case AMDKFD_IOC_DBG_WAVE_CONTROL:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
          }
          break;
        case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
          }
          break;
        case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
          }
          break;
        case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
          }
          break;
        case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
          }
          // FIX: a break was missing here, so this case fell through and
          // also emitted the ALLOC_MEMORY_OF_SCRATCH warning.
          break;
        case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n");
          }
          break;
        case AMDKFD_IOC_SET_CU_MASK:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
          }
          break;
        case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE"
                 "\n");
          }
          break;
        case AMDKFD_IOC_SET_TRAP_HANDLER:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
          }
          break;
        case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
          {
            DPRINTF(GPUDriver,
                    "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                ioc_args(ioc_buf);

            ioc_args.copyIn(virt_proxy);
            ioc_args->num_of_nodes = 1;

            for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                TypedBufferArg<kfd_process_device_apertures> ape_args
                    (ioc_args->kfd_process_device_apertures_ptr);

                ape_args->scratch_base = scratchApeBase(i + 1);
                ape_args->scratch_limit =
                    scratchApeLimit(ape_args->scratch_base);
                ape_args->lds_base = ldsApeBase(i + 1);
                ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);
                ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);

                // must match ID populated by hsaTopology.py
                ape_args->gpu_id = 2765;

                // the apertures must not overlap the CPU's address space
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0);

                ape_args.copyOut(virt_proxy);
            }

            ioc_args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_GET_DMABUF_INFO:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
          }
          break;
        case AMDKFD_IOC_IMPORT_DMABUF:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
          }
          break;
        case AMDKFD_IOC_GET_TILE_CONFIG:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
          }
          break;
        case AMDKFD_IOC_IPC_IMPORT_HANDLE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n");
          }
          break;
        case AMDKFD_IOC_IPC_EXPORT_HANDLE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n");
          }
          break;
        case AMDKFD_IOC_CROSS_MEMORY_COPY:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n");
          }
          break;
        case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n");
          }
          break;
        default:
          // FIX: the format string has two specifiers (%s, %d) but only
          // one argument was passed; supply __func__ for the %s.
          fatal("%s: bad ioctl %d\n", __func__, req);
          break;
    }
    return 0;
}
// Base of the GPUVM aperture: the per-GPU slot selected by bits [63:61]
// of the VA, offset by 2^48 (matches the real KFD's aperture macros).
Addr
GPUComputeDriver::gpuVmApeBase(int gpuNum) const
{
    Addr gpu_slot = static_cast<Addr>(gpuNum) << 61;
    return gpu_slot + 0x1000000000000L;
}
// Limit of the GPUVM aperture: keep the base's upper bits and saturate
// the low 40 bits of the address.
Addr
GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
{
    const Addr low_mask = 0xFFFFFFFFFFL;
    return (apeBase & ~low_mask) | low_mask;
}
// Base of the scratch (private segment) aperture: per-GPU slot in bits
// [63:61], offset by 2^32.
Addr
GPUComputeDriver::scratchApeBase(int gpuNum) const
{
    Addr gpu_slot = static_cast<Addr>(gpuNum) << 61;
    return gpu_slot + 0x100000000L;
}
// Limit of the scratch aperture: keep the base's upper bits and saturate
// the low 32 bits.
Addr
GPUComputeDriver::scratchApeLimit(Addr apeBase) const
{
    const Addr low_mask = 0xFFFFFFFFUL;
    return (apeBase & ~low_mask) | low_mask;
}
// Base of the LDS (group segment) aperture: the per-GPU slot itself,
// with no additional offset.
Addr
GPUComputeDriver::ldsApeBase(int gpuNum) const
{
    return static_cast<Addr>(gpuNum) << 61;
}
// Limit of the LDS aperture: keep the base's upper bits and saturate the
// low 32 bits.
Addr
GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
    const Addr low_mask = 0xFFFFFFFFUL;
    return (apeBase & ~low_mask) | low_mask;
}
// Standard gem5 SimObject factory method for GPUComputeDriver.
GPUComputeDriver*
GPUComputeDriverParams::create()
{
    return new GPUComputeDriver(this);
}

View File

@@ -0,0 +1,83 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
* Anthony Gutierrez
*/
/**
* @file
* The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
* agent. Other GPU devices, or other HSA agents, should not derive
* from this class. Instead device-specific implementations of an
* HSADriver should be provided for each unique device.
*/
#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#include "dev/hsa/hsa_driver.hh"
struct GPUComputeDriverParams;
class GPUComputeDriver final : public HSADriver
{
  public:
    typedef GPUComputeDriverParams Params;
    GPUComputeDriver(Params *p);

    /** Emulated KFD ioctl interface for the GPU agent (AMDKFD_IOC_*). */
    int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

  private:
    /**
     * The aperture (APE) base/limit pairs are set
     * statically at startup by the real KFD. AMD
     * x86_64 CPUs only use the areas in the 64b
     * address space where VA[63:47] == 0x1ffff or
     * VA[63:47] = 0. These methods generate the APE
     * base/limit pairs in exactly the same way as
     * the real KFD does, which ensures these APEs do
     * not fall into the CPU's address space
     *
     * see the macros in the KFD driver in the ROCm
     * Linux kernel source:
     *
     * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
     */
    Addr gpuVmApeBase(int gpuNum) const;
    Addr gpuVmApeLimit(Addr apeBase) const;
    Addr scratchApeBase(int gpuNum) const;
    Addr scratchApeLimit(Addr apeBase) const;
    Addr ldsApeBase(int gpuNum) const;
    Addr ldsApeLimit(Addr apeBase) const;
};
#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

View File

@@ -35,26 +35,50 @@
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *static_inst, uint64_t instSeqNum)
: GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
n_reg(0), useContinuation(false),
statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum)
GPUStaticInst *static_inst, InstSeqNum instSeqNum)
: GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
(Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
_staticInst(static_inst), _seqNum(instSeqNum)
{
tlbHitLevel.assign(computeUnit()->wfSize(), -1);
d_data = new uint8_t[computeUnit()->wfSize() * 16];
// vector instructions can have up to 4 source/destination operands
d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
a_data = new uint8_t[computeUnit()->wfSize() * 8];
x_data = new uint8_t[computeUnit()->wfSize() * 8];
// scalar loads can read up to 16 Dwords of data (see publicly
// available GCN3 ISA manual)
scalar_data = new uint8_t[16 * sizeof(uint32_t)];
for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
scalar_data[i] = 0;
}
for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
a_data[i] = 0;
x_data[i] = 0;
}
for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
d_data[i] = 0;
}
time = 0;
cu_id = _cu->cu_id;
if (_wf) {
simdId = _wf->simdId;
wfDynId = _wf->wfDynId;
kern_id = _wf->kernId;
wg_id = _wf->wgId;
wfSlotId = _wf->wfSlotId;
} else {
simdId = -1;
wfDynId = -1;
kern_id = -1;
wg_id = -1;
wfSlotId = -1;
}
}
GPUDynInst::~GPUDynInst()
@@ -62,6 +86,8 @@ GPUDynInst::~GPUDynInst()
delete[] d_data;
delete[] a_data;
delete[] x_data;
delete[] scalar_data;
delete _staticInst;
}
void
@@ -82,6 +108,36 @@ GPUDynInst::numDstRegOperands()
return _staticInst->numDstRegOperands();
}
// Number of vector-register source operands; forwarded to the decoded
// static instruction.
int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}
// Number of vector-register destination operands; forwarded to the
// decoded static instruction.
int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}
// Total DWORDs read from vector-register sources; forwarded to the
// decoded static instruction.
int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}
// Total DWORDs written to vector-register destinations; forwarded to
// the decoded static instruction.
int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}
// DWORD width of a single operand, by operand index; forwarded to the
// decoded static instruction.
int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}
int
GPUDynInst::getNumOperands()
{
@@ -100,12 +156,6 @@ GPUDynInst::isScalarRegister(int operandIdx)
return _staticInst->isScalarRegister(operandIdx);
}
bool
GPUDynInst::isCondRegister(int operandIdx)
{
return _staticInst->isCondRegister(operandIdx);
}
int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
@@ -130,13 +180,82 @@ GPUDynInst::isSrcOperand(int operandIdx)
return _staticInst->isSrcOperand(operandIdx);
}
bool
GPUDynInst::hasSourceSgpr() const
{
    // Scan the operand list for any scalar register used as a source.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isSrcOperand(opd))
            continue;
        if (_staticInst->isScalarRegister(opd))
            return true;
    }
    return false;
}
bool
GPUDynInst::hasSourceVgpr() const
{
    // Scan the operand list for any vector register used as a source.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isSrcOperand(opd))
            continue;
        if (_staticInst->isVectorRegister(opd))
            return true;
    }
    return false;
}
bool
GPUDynInst::hasDestinationSgpr() const
{
    // Scan the operand list for any scalar register written as a
    // destination.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isDstOperand(opd))
            continue;
        if (_staticInst->isScalarRegister(opd))
            return true;
    }
    return false;
}
bool
GPUDynInst::srcIsVgpr(int index) const
{
    // True when the given operand slot is a vector register used as a
    // source.
    assert(index >= 0 && index < _staticInst->getNumOperands());
    return _staticInst->isVectorRegister(index)
        && _staticInst->isSrcOperand(index);
}
bool
GPUDynInst::hasDestinationVgpr() const
{
    // Scan the operand list for any vector register written as a
    // destination.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isDstOperand(opd))
            continue;
        if (_staticInst->isVectorRegister(opd))
            return true;
    }
    return false;
}
// Substring match: true when the disassembled opcode contains BOTH the
// opcode string and the extension string anywhere within it.
bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
        _staticInst->opcode().find(extStr) != std::string::npos;
}
// Substring match: true when the disassembled opcode contains the given
// string anywhere within it.
bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}
// Human-readable disassembly of the underlying static instruction.
const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}
uint64_t
InstSeqNum
GPUDynInst::seqNum() const
{
return _seqNum;
@@ -148,6 +267,40 @@ GPUDynInst::executedAs()
return _staticInst->executed_as;
}
// Report a RAW hazard on vector registers between this instruction (the
// reader) and instruction s (the writer).
//
// NOTE(review): the inner check compares operand SLOT indices (i == j),
// not register numbers via getRegisterIndex() — this only detects a
// dependence when the source and destination happen to occupy the same
// operand position. Looks suspicious; confirm intent before relying on it.
bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}
// Report a RAW hazard on scalar registers between this instruction (the
// reader) and instruction s (the writer).
//
// NOTE(review): same caveat as hasVgprRawDependence — operand slot
// indices are compared (i == j), not register numbers; confirm intent.
bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}
// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
@@ -156,12 +309,15 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
cu->cu_id, simdId, wfSlotId, exec_mask);
_staticInst->initiateAcc(gpuDynInst);
time = 0;
}
void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
"%#x\n complete",
cu->cu_id, simdId, wfSlotId, exec_mask);
_staticInst->completeAcc(gpuDynInst);
}
@@ -181,12 +337,42 @@ GPUDynInst::isBranch() const
return _staticInst->isBranch();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}
bool
GPUDynInst::isReturn() const
{
@@ -218,9 +404,9 @@ GPUDynInst::isBarrier() const
}
bool
GPUDynInst::isMemFence() const
GPUDynInst::isMemSync() const
{
return _staticInst->isMemFence();
return _staticInst->isMemSync();
}
bool
@@ -265,6 +451,12 @@ GPUDynInst::isAtomicRet() const
return _staticInst->isAtomicRet();
}
// A dynamic instruction is a vector instruction iff its static
// instruction is not scalar.
bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}
bool
GPUDynInst::isScalar() const
{
@@ -295,6 +487,78 @@ GPUDynInst::writesVCC() const
return _staticInst->writesVCC();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}
// True when the instruction executes regardless of the EXEC mask;
// forwarded to the static instruction.
bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}
/**
 * True when any destination operand is the EXEC-mask register.
 *
 * FIX: the original loop body contained an unconditional return, so only
 * operand 0 was ever examined and an EXEC-mask destination in any later
 * operand slot was missed. Now all operands are scanned.
 */
bool
GPUDynInst::writesExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isDstOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}
/**
 * True when any source operand is the EXEC-mask register.
 *
 * FIX: the original loop body contained an unconditional return, so only
 * operand 0 was ever examined and an EXEC-mask source in any later
 * operand slot was missed. Now all operands are scanned.
 */
bool
GPUDynInst::readsExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isSrcOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}
/**
 * True when any scalar destination operand is the flat-scratch register.
 *
 * FIX: the original returned the verdict of the FIRST scalar destination
 * operand encountered, so a flat-scratch write in a later scalar operand
 * slot was reported as false. Now all operands are scanned.
 */
bool
GPUDynInst::writesFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) &&
            _staticInst->isDstOperand(i) &&
            _staticInst->isFlatScratchRegister(i)) {
            return true;
        }
    }
    return false;
}
/**
 * True when any scalar source operand is the flat-scratch register.
 *
 * FIX: the original returned the verdict of the FIRST scalar source
 * operand encountered, so a flat-scratch read in a later scalar operand
 * slot was reported as false. Now all operands are scanned.
 */
bool
GPUDynInst::readsFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) &&
            _staticInst->isSrcOperand(i) &&
            _staticInst->isFlatScratchRegister(i)) {
            return true;
        }
    }
    return false;
}
bool
GPUDynInst::isAtomicAnd() const
{
@@ -420,72 +684,6 @@ GPUDynInst::isSpillSeg() const
return _staticInst->isSpillSeg();
}
bool
GPUDynInst::isWorkitemScope() const
{
return _staticInst->isWorkitemScope();
}
bool
GPUDynInst::isWavefrontScope() const
{
return _staticInst->isWavefrontScope();
}
bool
GPUDynInst::isWorkgroupScope() const
{
return _staticInst->isWorkgroupScope();
}
bool
GPUDynInst::isDeviceScope() const
{
return _staticInst->isDeviceScope();
}
bool
GPUDynInst::isSystemScope() const
{
return _staticInst->isSystemScope();
}
bool
GPUDynInst::isNoScope() const
{
return _staticInst->isNoScope();
}
bool
GPUDynInst::isRelaxedOrder() const
{
return _staticInst->isRelaxedOrder();
}
bool
GPUDynInst::isAcquire() const
{
return _staticInst->isAcquire();
}
bool
GPUDynInst::isRelease() const
{
return _staticInst->isRelease();
}
bool
GPUDynInst::isAcquireRelease() const
{
return _staticInst->isAcquireRelease();
}
bool
GPUDynInst::isNoOrder() const
{
return _staticInst->isNoOrder();
}
bool
GPUDynInst::isGloballyCoherent() const
{
@@ -498,12 +696,240 @@ GPUDynInst::isSystemCoherent() const
return _staticInst->isSystemCoherent();
}
// Data-type predicate; forwarded to the static instruction.
bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}
// Data-type predicate; forwarded to the static instruction.
bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}
// Data-type predicate; forwarded to the static instruction.
bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}
// Arithmetic-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}
// Arithmetic-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}
// Arithmetic-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}
/**
 * Determine which memory segment (group/private/global) a flat access
 * resolves to by checking each active lane's address against the LDS,
 * scratch, and GPUVM apertures, recording the result in the static
 * instruction's executed_as field, then verifying that every active
 * lane's address falls in the same segment.
 */
void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());

    // find the segment of the first active address, after
    // that we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // we are in the "hole", this is a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. because we don't have an
                // explicit range of the global segment, we just make
                // sure that the address fall in no other APE and that
                // it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                       && bits(addr[lane], 63, 47)));
            }
        }
    }
}
void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);

    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below. Flat instructions
    //    reserve slots in both the global-memory (Gm) and local-memory
    //    (Lm) pipelines, so the pipeline that ends up unused must have
    //    its reservations released here.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        // resolved as global: release the unused Lm reservations
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        // resolved as group (LDS): release the unused Gm reservations
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures - registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped. The value of which is set by the kernel mode driver.
         * These apertures use addresses that are not used by x86 CPUs.
         * When the address of a Flat operation falls into one of the
         * apertures, the Flat operation is redirected to either LDS or
         * to the private memory segment.
         *
         * For private memory the SW runtime will allocate some space in
         * the VA space for each AQL queue. The base address of which is
         * stored in scalar registers per the AMD GPU ABI. The amd_queue_t
         * scratch_backing_memory_location provides the base address in
         * memory for the queue's private segment. Various other fields
         * loaded into register state during kernel launch specify per-WF
         * and per-work-item offsets so that individual lanes may access
         * their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */
        uint32_t numSgprs = wavefront()->maxSgprs;
        // the wave's scratch offset and per-lane scratch size are read
        // from the last SGPRs allocated to the wavefront (per the ABI
        // described above)
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // rebase the flat address into the scratch backing
                // store: per-lane offset within the wave's scratch
                // allocation, plus the hidden private base, minus the
                // scratch aperture base
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        // release the unused Gm reservations
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        // doApertureCheck() should have resolved the segment or
        // fataled; any other segment value here is a bug
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}
TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    // Forward to the static instruction, which owns the decoded
    // literal constant from the instruction encoding.
    const auto literal = _staticInst->srcLiteral();
    return literal;
}
void
GPUDynInst::updateStats()
{
    // Bump the compute unit's dynamic memory-instruction counters based
    // on the memory space this instruction targets. Flat instructions
    // are counted separately because their segment is only resolved at
    // execution time. (A stray diff-hunk marker line that had been
    // embedded in this function body was removed.)
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory
        cu->dynamicGMemInstrCnt++;
    }
}
void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Record the timestamp for this hop unless one was already taken;
    // coalesced requests keep only the first measurement per hop.
    const bool alreadyRecorded = roundTripTime.size() > hopId;
    if (!alreadyRecorded) {
        roundTripTime.push_back(currentTime);
    }
}
void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    // Record, per cache-line address, the tick at which the request
    // reaches each hop. Only the first measurement per hop is kept
    // (coalesced requests may revisit a hop), and a new entry is only
    // created at the first hop (hopId == 0).
    //
    // Use a single find() instead of count() followed by repeated
    // operator[] lookups; same behavior, one map traversal.
    auto iter = lineAddressTime.find(addr);
    if (iter != lineAddressTime.end()) {
        auto &hopTimes = iter->second;
        if (hopTimes.size() > hopId) {
            // this hop already has a measurement
            return;
        }
        hopTimes.push_back(currentTime);
    } else if (hopId == 0) {
        // first sighting of this line address; start its timing vector
        lineAddressTime.emplace(addr, std::vector<Tick>{ currentTime });
    }
}

View File

@@ -39,7 +39,6 @@
#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
@@ -68,20 +67,10 @@ class AtomicOpCAS : public TypedAtomicOpFunctor<T>
} else {
computeUnit->numFailedCASOps++;
}
if (computeUnit->xact_cas_mode) {
computeUnit->xactCasLoadMap.clear();
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
typedef enum
{
VT_32,
VT_64,
} vgpr_type;
class GPUDynInst : public GPUExecContext
{
public:
@@ -91,27 +80,51 @@ class GPUDynInst : public GPUExecContext
void execute(GPUDynInstPtr gpuDynInst);
int numSrcRegOperands();
int numDstRegOperands();
int numDstVecOperands();
int numSrcVecOperands();
int numSrcVecDWORDs();
int numDstVecDWORDs();
int numOpdDWORDs(int operandIdx);
int getNumOperands();
bool isVectorRegister(int operandIdx);
bool isScalarRegister(int operandIdx);
bool isCondRegister(int operandIdx);
int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
int getOperandSize(int operandIdx);
bool isDstOperand(int operandIdx);
bool isSrcOperand(int operandIdx);
bool hasDestinationSgpr() const;
bool hasSourceSgpr() const;
bool hasDestinationVgpr() const;
bool hasSourceVgpr() const;
bool hasSgprRawDependence(GPUDynInstPtr s);
bool hasVgprRawDependence(GPUDynInstPtr s);
// returns true if the string "opcodeStr" is found in the
// opcode of the instruction
bool isOpcode(const std::string& opcodeStr) const;
bool isOpcode(const std::string& opcodeStr,
const std::string& extStr) const;
// returns true if source operand at "index" is a vector register
bool srcIsVgpr(int index) const;
const std::string &disassemble() const;
uint64_t seqNum() const;
InstSeqNum seqNum() const;
Enums::StorageClassType executedAs();
// The address of the memory operation
// virtual address for scalar memory operations
Addr scalarAddr;
// virtual addressies for vector memory operations
std::vector<Addr> addr;
Addr pAddr;
// The data to get written
// vector data to get written
uint8_t *d_data;
// scalar data to be transferred
uint8_t *scalar_data;
// Additional data (for atomics)
uint8_t *a_data;
// Additional data (for atomics)
@@ -119,19 +132,6 @@ class GPUDynInst : public GPUExecContext
// The execution mask
VectorMask exec_mask;
// The memory type (M_U32, M_S32, ...)
Enums::MemType m_type;
// The equivalency class
int equiv;
// The return VGPR type (VT_32 or VT_64)
vgpr_type v_type;
// Number of VGPR's accessed (1, 2, or 4)
int n_reg;
// The return VGPR index
int dst_reg;
// There can be max 4 dest regs>
int dst_reg_vec[4];
// SIMD where the WF of the memory instruction has been mapped to
int simdId;
// unique id of the WF where the memory instruction belongs to
@@ -140,21 +140,16 @@ class GPUDynInst : public GPUExecContext
int kern_id;
// The CU id of the requesting wf
int cu_id;
// The workgroup id of the requesting wf
int wg_id;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
int pipeId;
int execUnitId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
// A list of bank conflicts for the 4 cycles.
uint32_t bc[4];
// A pointer to ROM
uint8_t *rom;
// The size of the READONLY segment
int sz_rom;
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
@@ -168,16 +163,23 @@ class GPUDynInst : public GPUExecContext
GPUStaticInst* staticInstruction() { return _staticInst; }
TheGpuISA::ScalarRegU32 srcLiteral() const;
bool isALU() const;
bool isBranch() const;
bool isCondBranch() const;
bool isNop() const;
bool isReturn() const;
bool isEndOfKernel() const;
bool isKernelLaunch() const;
bool isSDWAInst() const;
bool isDPPInst() const;
bool isUnconditionalJump() const;
bool isSpecialOp() const;
bool isWaitcnt() const;
bool isBarrier() const;
bool isMemFence() const;
bool isMemSync() const;
bool isMemRef() const;
bool isFlat() const;
bool isLoad() const;
@@ -188,10 +190,20 @@ class GPUDynInst : public GPUExecContext
bool isAtomicRet() const;
bool isScalar() const;
bool isVector() const;
bool readsSCC() const;
bool writesSCC() const;
bool readsVCC() const;
bool writesVCC() const;
bool readsEXEC() const;
bool writesEXEC() const;
bool readsMode() const;
bool writesMode() const;
bool ignoreExec() const;
bool readsFlatScratch() const;
bool writesFlatScratch() const;
bool readsExecMask() const;
bool writesExecMask() const;
bool isAtomicAnd() const;
bool isAtomicOr() const;
@@ -217,39 +229,25 @@ class GPUDynInst : public GPUExecContext
bool isReadOnlySeg() const;
bool isSpillSeg() const;
bool isWorkitemScope() const;
bool isWavefrontScope() const;
bool isWorkgroupScope() const;
bool isDeviceScope() const;
bool isSystemScope() const;
bool isNoScope() const;
bool isRelaxedOrder() const;
bool isAcquire() const;
bool isRelease() const;
bool isAcquireRelease() const;
bool isNoOrder() const;
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
/*
* Loads/stores/atomics may have acquire/release semantics associated
* withthem. Some protocols want to see the acquire/release as separate
* requests from the load/store/atomic. We implement that separation
* using continuations (i.e., a function pointer with an object associated
* with it). When, for example, the front-end generates a store with
* release semantics, we will first issue a normal store and set the
* continuation in the GPUDynInst to a function that generate a
* release request. That continuation will be called when the normal
* store completes (in ComputeUnit::DataPort::recvTimingResponse). The
* continuation will be called in the context of the same GPUDynInst
* that generated the initial store.
*/
std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
bool isF16() const;
bool isF32() const;
bool isF64() const;
// when true, call execContinuation when response arrives
bool useContinuation;
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
// for FLAT memory ops. check the segment address
// against the APE registers to see if it falls
// within one of the APE ranges for LDS/SCRATCH/GPUVM.
// if it does not fall into one of the three APEs, it
// will be a regular global access.
void doApertureCheck(const VectorMask &mask);
// Function to resolve a flat accesses during execution stage.
void resolveFlatSegment(const VectorMask &mask);
template<typename c0> AtomicOpFunctorPtr
makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
@@ -282,62 +280,31 @@ class GPUDynInst : public GPUExecContext
}
void
setRequestFlags(RequestPtr req, bool setMemOrder=true)
setRequestFlags(RequestPtr req) const
{
// currently these are the easy scopes to deduce
if (isPrivateSeg()) {
req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
} else if (isSpillSeg()) {
req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
} else if (isGlobalSeg()) {
req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
} else if (isReadOnlySeg()) {
req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
} else if (isGroupSeg()) {
req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
} else if (isFlat()) {
panic("TODO: translate to correct scope");
} else {
fatal("%s has bad segment type\n", disassemble());
if (isGloballyCoherent()) {
req->setCacheCoherenceFlags(Request::GLC_BIT);
}
if (isWavefrontScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::WAVEFRONT_SCOPE);
} else if (isWorkgroupScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::WORKGROUP_SCOPE);
} else if (isDeviceScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::DEVICE_SCOPE);
} else if (isSystemScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::SYSTEM_SCOPE);
} else if (!isNoScope() && !isWorkitemScope()) {
fatal("%s has bad scope type\n", disassemble());
if (isSystemCoherent()) {
req->setCacheCoherenceFlags(Request::SLC_BIT);
}
if (setMemOrder) {
// set acquire and release flags
if (isAcquire()) {
req->setFlags(Request::ACQUIRE);
} else if (isRelease()) {
req->setFlags(Request::RELEASE);
} else if (isAcquireRelease()) {
req->setFlags(Request::ACQUIRE | Request::RELEASE);
} else if (!isNoOrder()) {
fatal("%s has bad memory order\n", disassemble());
}
}
// set atomic type
// currently, the instruction genenerator only produces atomic return
// but a magic instruction can produce atomic no return
if (isAtomicRet()) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (isAtomicNoRet()) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
if (isMemSync()) {
// the path for kernel launch and kernel end is different
// from non-kernel mem sync.
assert(!isKernelLaunch());
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
req->setCacheCoherenceFlags(Request::ACQUIRE);
}
}
// Map returned packets and the addresses they satisfy with which lane they
@@ -348,12 +315,39 @@ class GPUDynInst : public GPUExecContext
// Track the status of memory requests per lane, a bit per lane
VectorMask statusBitVector;
// for ld_v# or st_v#
std::vector<int> statusVector;
std::vector<int> tlbHitLevel;
// for misaligned scalar ops we track the number
// of outstanding reqs here
int numScalarReqs;
Tick getAccessTime() const { return accessTime; }
void setAccessTime(Tick currentTime) { accessTime = currentTime; }
void profileRoundTripTime(Tick currentTime, int hopId);
std::vector<Tick> getRoundTripTime() const { return roundTripTime; }
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
{ return lineAddressTime; }
// inst used to save/restore a wavefront context
bool isSaveRestore;
private:
GPUStaticInst *_staticInst;
uint64_t _seqNum;
const InstSeqNum _seqNum;
// the time the request was started
Tick accessTime = -1;
// hold the tick when the instruction arrives at certain hop points
// on it's way to main memory
std::vector<Tick> roundTripTime;
// hold each cache block address for the instruction and a vector
// to hold the tick when the block arrives at certain hop points
std::map<Addr, std::vector<Tick>> lineAddressTime;
};
#endif // __GPU_DYN_INST_HH__

View File

@@ -59,8 +59,8 @@ GPUExecContext::readMiscReg(int opIdx) const
}
void
GPUExecContext::writeMiscReg(int opIdx, RegVal operandVal)
GPUExecContext::writeMiscReg(int opIdx, RegVal val)
{
assert(gpuISA);
gpuISA->writeMiscReg(opIdx, operandVal);
gpuISA->writeMiscReg(opIdx, val);
}

View File

@@ -34,10 +34,10 @@
#include "gpu-compute/gpu_static_inst.hh"
GPUStaticInst::GPUStaticInst(const std::string &opcode)
: executed_as(Enums::SC_NONE), opcode(opcode),
_instNum(0), _instAddr(0)
: executed_as(Enums::SC_NONE), _opcode(opcode),
_instNum(0), _instAddr(0), srcVecOperands(-1), dstVecOperands(-1),
srcVecDWORDs(-1), dstVecDWORDs(-1)
{
setFlag(NoOrder);
}
const std::string&
@@ -50,3 +50,80 @@ GPUStaticInst::disassemble()
return disassembly;
}
int
GPUStaticInst::numSrcVecOperands()
{
    // Lazily compute and cache the number of vector-register source
    // operands; a negative cached value means "not yet computed".
    // Scalar instructions have no vector operands by definition.
    if (srcVecOperands < 0) {
        int count = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isSrcOperand(opIdx)) {
                    ++count;
                }
            }
        }
        srcVecOperands = count;
    }
    return srcVecOperands;
}
int
GPUStaticInst::numDstVecOperands()
{
    // Lazily compute and cache the number of vector-register destination
    // operands; a negative cached value means "not yet computed".
    // Scalar instructions have no vector operands by definition.
    if (dstVecOperands < 0) {
        int count = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isDstOperand(opIdx)) {
                    ++count;
                }
            }
        }
        dstVecOperands = count;
    }
    return dstVecOperands;
}
int
GPUStaticInst::numSrcVecDWORDs()
{
    // Lazily compute and cache the total DWORD footprint of all
    // vector-register source operands; a negative cached value means
    // "not yet computed". Scalar instructions contribute nothing.
    if (srcVecDWORDs < 0) {
        int totalDWORDs = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isSrcOperand(opIdx)) {
                    totalDWORDs += numOpdDWORDs(opIdx);
                }
            }
        }
        srcVecDWORDs = totalDWORDs;
    }
    return srcVecDWORDs;
}
int
GPUStaticInst::numDstVecDWORDs()
{
    // Lazily compute and cache the total DWORD footprint of all
    // vector-register destination operands; a negative cached value
    // means "not yet computed". Scalar instructions contribute nothing.
    if (dstVecDWORDs < 0) {
        int totalDWORDs = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isDstOperand(opIdx)) {
                    totalDWORDs += numOpdDWORDs(opIdx);
                }
            }
        }
        dstVecDWORDs = totalDWORDs;
    }
    return dstVecDWORDs;
}
int
GPUStaticInst::numOpdDWORDs(int operandIdx)
{
    // Number of 4-byte DWORDs the operand occupies; sub-DWORD operands
    // still consume a full DWORD. Hoist the (virtual) getOperandSize()
    // call into a local so it is evaluated only once.
    const int opSize = getOperandSize(operandIdx);
    return opSize <= 4 ? 1 : opSize / 4;
}

View File

@@ -59,6 +59,7 @@ class GPUStaticInst : public GPUStaticInstFlags
{
public:
GPUStaticInst(const std::string &opcode);
virtual ~GPUStaticInst() { }
void instAddr(int inst_addr) { _instAddr = inst_addr; }
int instAddr() const { return _instAddr; }
int nextInstAddr() const { return _instAddr + instSize(); }
@@ -71,15 +72,18 @@ class GPUStaticInst : public GPUStaticInstFlags
int ipdInstNum() const { return _ipdInstNum; }
virtual TheGpuISA::ScalarRegU32 srcLiteral() const { return 0; }
virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
virtual void generateDisassembly() = 0;
const std::string& disassemble();
virtual int getNumOperands() = 0;
virtual bool isCondRegister(int operandIndex) = 0;
virtual bool isScalarRegister(int operandIndex) = 0;
virtual bool isVectorRegister(int operandIndex) = 0;
virtual bool isSrcOperand(int operandIndex) = 0;
virtual bool isDstOperand(int operandIndex) = 0;
virtual bool isFlatScratchRegister(int opIdx) = 0;
virtual bool isExecMaskRegister(int opIdx) = 0;
virtual int getOperandSize(int operandIndex) = 0;
virtual int getRegisterIndex(int operandIndex,
@@ -88,12 +92,24 @@ class GPUStaticInst : public GPUStaticInstFlags
virtual int numDstRegOperands() = 0;
virtual int numSrcRegOperands() = 0;
virtual bool isValid() const = 0;
virtual int coalescerTokenCount() const { return 0; }
int numDstVecOperands();
int numSrcVecOperands();
int numDstVecDWORDs();
int numSrcVecDWORDs();
int numOpdDWORDs(int operandIdx);
bool isALU() const { return _flags[ALU]; }
bool isBranch() const { return _flags[Branch]; }
bool isCondBranch() const { return _flags[CondBranch]; }
bool isNop() const { return _flags[Nop]; }
bool isReturn() const { return _flags[Return]; }
bool isEndOfKernel() const { return _flags[EndOfKernel]; }
bool isKernelLaunch() const { return _flags[KernelLaunch]; }
bool isSDWAInst() const { return _flags[IsSDWA]; }
bool isDPPInst() const { return _flags[IsDPP]; }
bool
isUnconditionalJump() const
@@ -105,7 +121,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isWaitcnt() const { return _flags[Waitcnt]; }
bool isBarrier() const { return _flags[MemBarrier]; }
bool isMemFence() const { return _flags[MemFence]; }
bool isMemSync() const { return _flags[MemSync]; }
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isLoad() const { return _flags[Load]; }
@@ -125,6 +141,13 @@ class GPUStaticInst : public GPUStaticInstFlags
bool writesSCC() const { return _flags[WritesSCC]; }
bool readsVCC() const { return _flags[ReadsVCC]; }
bool writesVCC() const { return _flags[WritesVCC]; }
// Identify instructions that implicitly read the Execute mask
// as a source operand but not to dictate which threads execute.
bool readsEXEC() const { return _flags[ReadsEXEC]; }
bool writesEXEC() const { return _flags[WritesEXEC]; }
bool readsMode() const { return _flags[ReadsMode]; }
bool writesMode() const { return _flags[WritesMode]; }
bool ignoreExec() const { return _flags[IgnoreExec]; }
bool isAtomicAnd() const { return _flags[AtomicAnd]; }
bool isAtomicOr() const { return _flags[AtomicOr]; }
@@ -166,34 +189,29 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; }
bool isSpillSeg() const { return _flags[SpillSegment]; }
bool isWorkitemScope() const { return _flags[WorkitemScope]; }
bool isWavefrontScope() const { return _flags[WavefrontScope]; }
bool isWorkgroupScope() const { return _flags[WorkgroupScope]; }
bool isDeviceScope() const { return _flags[DeviceScope]; }
bool isSystemScope() const { return _flags[SystemScope]; }
bool isNoScope() const { return _flags[NoScope]; }
bool isRelaxedOrder() const { return _flags[RelaxedOrder]; }
bool isAcquire() const { return _flags[Acquire]; }
bool isRelease() const { return _flags[Release]; }
bool isAcquireRelease() const { return _flags[AcquireRelease]; }
bool isNoOrder() const { return _flags[NoOrder]; }
/**
* Coherence domain of a memory instruction. Only valid for
* machine ISA. The coherence domain specifies where it is
* possible to perform memory synchronization, e.g., acquire
* or release, from the shader kernel.
* Coherence domain of a memory instruction. The coherence domain
* specifies where it is possible to perform memory synchronization
* (e.g., acquire or release) from the shader kernel.
*
* isGloballyCoherent(): returns true if kernel is sharing memory
* with other work-items on the same device (GPU)
* isGloballyCoherent(): returns true if WIs share same device
* isSystemCoherent(): returns true if WIs or threads in different
* devices share memory
*
* isSystemCoherent(): returns true if kernel is sharing memory
* with other work-items on a different device (GPU) or the host (CPU)
*/
bool isGloballyCoherent() const { return _flags[GloballyCoherent]; }
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
// Floating-point instructions
bool isF16() const { return _flags[F16]; }
bool isF32() const { return _flags[F32]; }
bool isF64() const { return _flags[F64]; }
// FMA, MAC, MAD instructions
bool isFMA() const { return _flags[FMA]; }
bool isMAC() const { return _flags[MAC]; }
bool isMAD() const { return _flags[MAD]; }
virtual int instSize() const = 0;
// only used for memory instructions
@@ -217,37 +235,36 @@ class GPUStaticInst : public GPUStaticInstFlags
// For flat memory accesses
Enums::StorageClassType executed_as;
void setFlag(Flags flag) { _flags[flag] = true; }
void setFlag(Flags flag) {
_flags[flag] = true;
virtual void
execLdAcq(GPUDynInstPtr gpuDynInst)
{
fatal("calling execLdAcq() on a non-load instruction.\n");
}
virtual void
execSt(GPUDynInstPtr gpuDynInst)
{
fatal("calling execLdAcq() on a non-load instruction.\n");
}
virtual void
execAtomic(GPUDynInstPtr gpuDynInst)
{
fatal("calling execAtomic() on a non-atomic instruction.\n");
}
virtual void
execAtomicAcq(GPUDynInstPtr gpuDynInst)
{
fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
if (isGroupSeg()) {
executed_as = Enums::SC_GROUP;
} else if (isGlobalSeg()) {
executed_as = Enums::SC_GLOBAL;
} else if (isPrivateSeg()) {
executed_as = Enums::SC_PRIVATE;
} else if (isSpillSeg()) {
executed_as = Enums::SC_SPILL;
} else if (isReadOnlySeg()) {
executed_as = Enums::SC_READONLY;
} else if (isKernArgSeg()) {
executed_as = Enums::SC_KERNARG;
} else if (isArgSeg()) {
executed_as = Enums::SC_ARG;
}
}
const std::string& opcode() const { return _opcode; }
protected:
const std::string opcode;
const std::string _opcode;
std::string disassembly;
int _instNum;
int _instAddr;
int srcVecOperands;
int dstVecOperands;
int srcVecDWORDs;
int dstVecDWORDs;
/**
* Identifier of the immediate post-dominator instruction.
*/
@@ -262,9 +279,9 @@ class KernelLaunchStaticInst : public GPUStaticInst
KernelLaunchStaticInst() : GPUStaticInst("kernel_launch")
{
setFlag(Nop);
setFlag(KernelLaunch);
setFlag(MemSync);
setFlag(Scalar);
setFlag(Acquire);
setFlag(SystemScope);
setFlag(GlobalSegment);
}
@@ -277,11 +294,14 @@ class KernelLaunchStaticInst : public GPUStaticInst
void
generateDisassembly() override
{
disassembly = opcode;
disassembly = _opcode;
}
int getNumOperands() override { return 0; }
bool isCondRegister(int operandIndex) override { return false; }
bool isFlatScratchRegister(int opIdx) override { return false; }
// return true if the Execute mask is explicitly used as a source
// register operand
bool isExecMaskRegister(int opIdx) override { return false; }
bool isScalarRegister(int operandIndex) override { return false; }
bool isVectorRegister(int operandIndex) override { return false; }
bool isSrcOperand(int operandIndex) override { return false; }
@@ -296,7 +316,6 @@ class KernelLaunchStaticInst : public GPUStaticInst
int numDstRegOperands() override { return 0; }
int numSrcRegOperands() override { return 0; }
bool isValid() const override { return true; }
int instSize() const override { return 0; }
};

View File

@@ -74,7 +74,6 @@ namespace X86ISA
allocationPolicy = p->allocationPolicy;
hasMemSidePort = false;
accessDistance = p->accessDistance;
clock = p->clk_domain->clockPeriod();
tlb.assign(size, TlbEntry());
@@ -624,8 +623,8 @@ namespace X86ISA
{
bool delayedResponse;
return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
latency);
return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse,
false, latency);
}
void
@@ -803,13 +802,13 @@ namespace X86ISA
}
/*
* We now know the TLB lookup outcome (if it's a hit or a miss), as well
* as the TLB access latency.
* We now know the TLB lookup outcome (if it's a hit or a miss), as
* well as the TLB access latency.
*
* We create and schedule a new TLBEvent which will help us take the
* appropriate actions (e.g., update TLB on a hit, send request to lower
* level TLB on a miss, or start a page walk if this was the last-level
* TLB)
* appropriate actions (e.g., update TLB on a hit, send request to
* lower level TLB on a miss, or start a page walk if this was the
* last-level TLB)
*/
TLBEvent *tlb_event =
new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
@@ -823,15 +822,15 @@ namespace X86ISA
assert(tlb_event);
DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
curTick() + this->ticks(hitLatency));
curTick() + cyclesToTicks(Cycles(hitLatency)));
schedule(tlb_event, curTick() + this->ticks(hitLatency));
schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency)));
}
GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
PacketPtr _pkt)
: Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
outcome(tlb_outcome), pkt(_pkt)
GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr,
tlbOutcome tlb_outcome, PacketPtr _pkt)
: Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
outcome(tlb_outcome), pkt(_pkt)
{
}
@@ -848,7 +847,8 @@ namespace X86ISA
bool storeCheck = flags & (StoreCheck << FlagShift);
// Do paging protection checks.
bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
bool inUser
= (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
@@ -874,10 +874,9 @@ namespace X86ISA
* The latter calls handelHit with TLB miss as tlbOutcome.
*/
void
GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
PacketPtr pkt)
GpuTLB::handleTranslationReturn(Addr virt_page_addr,
tlbOutcome tlb_outcome, PacketPtr pkt)
{
assert(pkt);
Addr vaddr = pkt->req->getVaddr();
@@ -890,15 +889,18 @@ namespace X86ISA
TlbEntry *local_entry, *new_entry;
if (tlb_outcome == TLB_HIT) {
DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
vaddr);
local_entry = sender_state->tlbEntry;
} else {
DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
vaddr);
// We are returning either from a page walk or from a hit at a lower
// TLB level. The senderState should be "carrying" a pointer to the
// correct TLBEntry.
/**
* We are returning either from a page walk or from a hit at a
* lower TLB level. The senderState should be "carrying" a pointer
* to the correct TLBEntry.
*/
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
@@ -1024,7 +1026,8 @@ namespace X86ISA
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
tlb_event->updateOutcome(PAGE_WALK);
schedule(tlb_event, curTick() + ticks(missLatency2));
schedule(tlb_event,
curTick() + cyclesToTicks(Cycles(missLatency2)));
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
@@ -1095,7 +1098,7 @@ namespace X86ISA
return virtPageAddr;
}
/*
/**
* recvTiming receives a coalesced timing request from a TLBCoalescer
* and it calls issueTLBLookup()
* It only rejects the packet if we have exceeded the max
@@ -1145,9 +1148,11 @@ namespace X86ISA
DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
"%#x\n", vaddr);
// We are returning either from a page walk or from a hit at a lower
// TLB level. The senderState should be "carrying" a pointer to the
// correct TLBEntry.
/**
* We are returning either from a page walk or from a hit at a
* lower TLB level. The senderState should be "carrying" a pointer
* to the correct TLBEntry.
*/
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
@@ -1267,8 +1272,8 @@ namespace X86ISA
} else {
// If this was a prefetch, then do the normal thing if it
// was a successful translation. Otherwise, send an empty
// TLB entry back so that it can be figured out as empty and
// handled accordingly.
// TLB entry back so that it can be figured out as empty
// and handled accordingly.
if (pte) {
DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
pte->paddr);
@@ -1343,7 +1348,7 @@ namespace X86ISA
assert(virt_page_addr == tlb_event->getTLBEventVaddr());
tlb_event->updateOutcome(MISS_RETURN);
tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
tlb->schedule(tlb_event, curTick()+tlb->clockPeriod());
return true;
}
@@ -1393,8 +1398,8 @@ namespace X86ISA
tmp_access_info.sumDistance = 0;
tmp_access_info.meanDistance = 0;
ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
tmp_access_info));
ret = TLBFootprint.insert(
AccessPatternTable::value_type(virt_page_addr, tmp_access_info));
bool first_page_access = ret.second;
@@ -1428,74 +1433,74 @@ namespace X86ISA
page_stat_file = simout.create(name().c_str())->stream();
// print header
*page_stat_file << "page,max_access_distance,mean_access_distance, "
<< "stddev_distance" << std::endl;
*page_stat_file
<< "page,max_access_distance,mean_access_distance, "
<< "stddev_distance" << std::endl;
}
// update avg. reuse distance footprint
AccessPatternTable::iterator iter, iter_begin, iter_end;
unsigned int sum_avg_reuse_distance_per_page = 0;
// iterate through all pages seen by this TLB
for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
iter->second.accessesPerPage;
for (auto &iter : TLBFootprint) {
sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance /
iter.second.accessesPerPage;
if (accessDistance) {
unsigned int tmp = iter->second.localTLBAccesses[0];
unsigned int tmp = iter.second.localTLBAccesses[0];
unsigned int prev = tmp;
for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
if (i) {
tmp = prev + 1;
}
prev = iter->second.localTLBAccesses[i];
prev = iter.second.localTLBAccesses[i];
// update the localTLBAccesses value
// with the actual differece
iter->second.localTLBAccesses[i] -= tmp;
iter.second.localTLBAccesses[i] -= tmp;
// compute the sum of AccessDistance per page
// used later for mean
iter->second.sumDistance +=
iter->second.localTLBAccesses[i];
iter.second.sumDistance +=
iter.second.localTLBAccesses[i];
}
iter->second.meanDistance =
iter->second.sumDistance / iter->second.accessesPerPage;
iter.second.meanDistance =
iter.second.sumDistance / iter.second.accessesPerPage;
// compute std_dev and max (we need a second round because we
// need to know the mean value
unsigned int max_distance = 0;
unsigned int stddev_distance = 0;
for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
unsigned int tmp_access_distance =
iter->second.localTLBAccesses[i];
iter.second.localTLBAccesses[i];
if (tmp_access_distance > max_distance) {
max_distance = tmp_access_distance;
}
unsigned int diff =
tmp_access_distance - iter->second.meanDistance;
tmp_access_distance - iter.second.meanDistance;
stddev_distance += pow(diff, 2);
}
stddev_distance =
sqrt(stddev_distance/iter->second.accessesPerPage);
sqrt(stddev_distance/iter.second.accessesPerPage);
if (page_stat_file) {
*page_stat_file << std::hex << iter->first << ",";
*page_stat_file << std::hex << iter.first << ",";
*page_stat_file << std::dec << max_distance << ",";
*page_stat_file << std::dec << iter->second.meanDistance
*page_stat_file << std::dec << iter.second.meanDistance
<< ",";
*page_stat_file << std::dec << stddev_distance;
*page_stat_file << std::endl;
}
// erase the localTLBAccesses array
iter->second.localTLBAccesses.clear();
iter.second.localTLBAccesses.clear();
}
}

View File

@@ -69,26 +69,7 @@ namespace X86ISA
uint32_t configAddress;
// TLB clock: will inherit clock from shader's clock period in terms
// of nuber of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the python
// config files.
int clock;
public:
// clock related functions ; maps to-and-from Simulation ticks and
// object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick
ticks(int numCycles) const
{
return (Tick)clock * numCycles;
}
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
typedef X86GPUTLBParams Params;
GpuTLB(const Params *p);
~GpuTLB();

View File

@@ -0,0 +1,467 @@
/*
* Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
/**
* @file
* HSAQueueEntry is the simulator's internal representation of an
* AQL queue entry (task). It encapsulates all of the relevant info
* about a task, which is gathered from various runtime data
* structures including: the AQL MQD, the AQL packet, and the code
* object.
*/
#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#include <array>
#include <bitset>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "gpu-compute/kernel_code.hh"
class HSAQueueEntry
{
  public:
    /**
     * Build a task from the raw AQL dispatch packet and the kernel's AMD
     * kernel code object.
     *
     * @param kernel_name   human-readable kernel name (for stats/debug)
     * @param queue_id      ID of the AQL queue this packet came from
     * @param dispatch_id   unique ID for this dispatch
     * @param disp_pkt      raw AQL packet; must point at an
     *                      _hsa_dispatch_packet_t, from which the grid and
     *                      workgroup sizes, LDS/private segment sizes,
     *                      kernarg address, and completion signal are read
     * @param akc           kernel code descriptor; supplies register counts
     *                      and the initial-register-state enable bits
     * @param host_pkt_addr host-side address of the dispatch packet
     * @param code_addr     base address of the kernel machine code
     */
    HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
                  int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
                  Addr host_pkt_addr, Addr code_addr)
        : kernName(kernel_name),
          _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
          _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
          numVgprs(akc->workitem_vgpr_count),
          numSgprs(akc->wavefront_sgpr_count),
          _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
          _hostDispPktAddr(host_pkt_addr),
          _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
                            ->completion_signal),
          codeAddress(code_addr),
          kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
          // -1 marks "invalidate not started yet"; see _outstandingInvs below
          _outstandingInvs(-1), _outstandingWbs(0),
          _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                   group_segment_size),
          _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                          private_segment_size),
          _contextId(0), _wgId{{ 0, 0, 0 }},
          _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
          _globalWgId(0), dispatchComplete(false)
    {
        initialVgprState.reset();
        initialSgprState.reset();

        // per-dimension WG count is the grid size over the WG size,
        // rounded up; the product gives the total WGs in the dispatch
        for (int i = 0; i < MAX_DIM; ++i) {
            _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
            _numWgTotal *= _numWg[i];
        }

        // extract the initial SGPR/VGPR enable bits from the kernel code
        parseKernelCode(akc);
    }

    const std::string&
    kernelName() const
    {
        return kernName;
    }

    /** workgroup size in the given dimension (0=x, 1=y, 2=z) */
    int
    wgSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgSize[dim];
    }

    /** grid size in the given dimension (0=x, 1=y, 2=z) */
    int
    gridSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _gridSize[dim];
    }

    /** number of VGPRs required per work-item */
    int
    numVectorRegs() const
    {
        return numVgprs;
    }

    /** number of SGPRs required per wavefront */
    int
    numScalarRegs() const
    {
        return numSgprs;
    }

    uint32_t
    queueId() const
    {
        return _queueId;
    }

    int
    dispatchId() const
    {
        return _dispatchId;
    }

    /** raw pointer to the AQL dispatch packet */
    void*
    dispPktPtr()
    {
        return dispPkt;
    }

    /** host-side address of the AQL dispatch packet */
    Addr
    hostDispPktAddr() const
    {
        return _hostDispPktAddr;
    }

    /** address of this dispatch's completion signal */
    Addr
    completionSignal() const
    {
        return _completionSignal;
    }

    /** base address of the kernel's machine code */
    Addr
    codeAddr() const
    {
        return codeAddress;
    }

    /** base address of the kernel argument segment */
    Addr
    kernargAddr() const
    {
        return kernargAddress;
    }

    /** LDS (group segment) size in bytes requested by the dispatch */
    int
    ldsSize() const
    {
        return _ldsSize;
    }

    /** private (scratch) segment size in bytes per work-item */
    int privMemPerItem() const { return _privMemPerItem; }

    int
    contextId() const
    {
        return _contextId;
    }

    /** true once every workgroup in the grid has been dispatched */
    bool
    dispComplete() const
    {
        return dispatchComplete;
    }

    /** ID (in WG units) of the next workgroup to dispatch, per dimension */
    int
    wgId(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgId[dim];
    }

    void
    wgId(int dim, int val)
    {
        assert(dim < MAX_DIM);
        _wgId[dim] = val;
    }

    /** flat (linearized) ID of the next workgroup to dispatch */
    int
    globalWgId() const
    {
        return _globalWgId;
    }

    void
    globalWgId(int val)
    {
        _globalWgId = val;
    }

    /** number of workgroups in the given dimension */
    int
    numWg(int dim) const
    {
        assert(dim < MAX_DIM);
        return _numWg[dim];
    }

    /** called when a workgroup of this kernel finishes execution */
    void
    notifyWgCompleted()
    {
        ++_numWgCompleted;
    }

    int
    numWgCompleted() const
    {
        return _numWgCompleted;
    }

    int
    numWgTotal() const
    {
        return _numWgTotal;
    }

    /**
     * Advance the workgroup ID to the next WG to be dispatched, in
     * x-major order (x fastest, then y, then z). Once the last WG in
     * the grid has been handed out, dispatchComplete is set.
     *
     * Note: _wgId[dim] is compared in work-item units
     * (wgId * wgSize vs. gridSize) when deciding to wrap a dimension.
     */
    void
    markWgDispatch()
    {
        ++_wgId[0];
        ++_globalWgId;

        if (wgId(0) * wgSize(0) >= gridSize(0)) {
            _wgId[0] = 0;
            ++_wgId[1];

            if (wgId(1) * wgSize(1) >= gridSize(1)) {
                _wgId[1] = 0;
                ++_wgId[2];

                if (wgId(2) * wgSize(2) >= gridSize(2)) {
                    dispatchComplete = true;
                }
            }
        }
    }

    int
    numWgAtBarrier() const
    {
        return numWgArrivedAtBarrier;
    }

    /** whether VGPR init field `bit` (see VectorRegInitFields) is enabled */
    bool vgprBitEnabled(int bit) const
    {
        return initialVgprState.test(bit);
    }

    /** whether SGPR init field `bit` (see ScalarRegInitFields) is enabled */
    bool sgprBitEnabled(int bit) const
    {
        return initialSgprState.test(bit);
    }

    /**
     * Host-side addr of the amd_queue_t on which
     * this task was queued.
     */
    Addr hostAMDQueueAddr;

    /**
     * Keep a copy of the AMD HSA queue because we
     * need info from some of its fields to initialize
     * register state.
     */
    _amd_queue_t amdQueue;

    // the maximum number of dimensions for a grid or workgroup
    const static int MAX_DIM = 3;

    /* getter */
    int
    outstandingInvs() {
        return _outstandingInvs;
    }

    /**
     * Whether invalidate has started or finished -1 is the
     * initial value indicating inv has not started for the
     * kernel.
     */
    bool
    isInvStarted()
    {
        return (_outstandingInvs != -1);
    }

    /**
     * update the number of pending invalidate requests
     *
     * val: negative to decrement, positive to increment
     */
    void
    updateOutstandingInvs(int val)
    {
        _outstandingInvs += val;
        assert(_outstandingInvs >= 0);
    }

    /**
     * Forcefully change the state to be inv done.
     */
    void
    markInvDone()
    {
        _outstandingInvs = 0;
    }

    /**
     * Is invalidate done?
     */
    bool
    isInvDone() const
    {
        assert(_outstandingInvs >= 0);
        return (_outstandingInvs == 0);
    }

    int
    outstandingWbs() const
    {
        return _outstandingWbs;
    }

    /**
     * Update the number of pending writeback requests.
     *
     * val: negative to decrement, positive to increment
     */
    void
    updateOutstandingWbs(int val)
    {
        _outstandingWbs += val;
        assert(_outstandingWbs >= 0);
    }

  private:
    /**
     * Record which pieces of initial register state the kernel code
     * object asks to be preloaded. The bit positions used here are
     * defined by ScalarRegInitFields/VectorRegInitFields and determine
     * the order in which state is placed into contiguous registers.
     */
    void
    parseKernelCode(AMDKernelCode *akc)
    {
        /** set the enable bits for the initial SGPR state */
        initialSgprState.set(PrivateSegBuf,
                             akc->enable_sgpr_private_segment_buffer);
        initialSgprState.set(DispatchPtr,
                             akc->enable_sgpr_dispatch_ptr);
        initialSgprState.set(QueuePtr,
                             akc->enable_sgpr_queue_ptr);
        initialSgprState.set(KernargSegPtr,
                             akc->enable_sgpr_kernarg_segment_ptr);
        initialSgprState.set(DispatchId,
                             akc->enable_sgpr_dispatch_id);
        initialSgprState.set(FlatScratchInit,
                             akc->enable_sgpr_flat_scratch_init);
        initialSgprState.set(PrivateSegSize,
                             akc->enable_sgpr_private_segment_size);
        initialSgprState.set(GridWorkgroupCountX,
                             akc->enable_sgpr_grid_workgroup_count_x);
        initialSgprState.set(GridWorkgroupCountY,
                             akc->enable_sgpr_grid_workgroup_count_y);
        initialSgprState.set(GridWorkgroupCountZ,
                             akc->enable_sgpr_grid_workgroup_count_z);
        initialSgprState.set(WorkgroupIdX,
                             akc->enable_sgpr_workgroup_id_x);
        initialSgprState.set(WorkgroupIdY,
                             akc->enable_sgpr_workgroup_id_y);
        initialSgprState.set(WorkgroupIdZ,
                             akc->enable_sgpr_workgroup_id_z);
        initialSgprState.set(WorkgroupInfo,
                             akc->enable_sgpr_workgroup_info);
        initialSgprState.set(PrivSegWaveByteOffset,
                             akc->enable_sgpr_private_segment_wave_byte_offset);

        /**
         * set the enable bits for the initial VGPR state. the
         * workitem Id in the X dimension is always initialized.
         */
        initialVgprState.set(WorkitemIdX, true);
        initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y);
        initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z);
    }

    // name of the kernel associated with the AQL entry
    std::string kernName;
    // workgroup Size (3 dimensions)
    std::array<int, MAX_DIM> _wgSize;
    // grid Size (3 dimensions)
    std::array<int, MAX_DIM> _gridSize;
    // total number of VGPRs per work-item
    int numVgprs;
    // total number of SGPRs per wavefront
    int numSgprs;
    // id of AQL queue in which this entry is placed
    uint32_t _queueId;
    int _dispatchId;
    // raw AQL packet pointer
    void *dispPkt;
    // host-side addr of the dispatch packet
    Addr _hostDispPktAddr;
    // address of the completion signal for this dispatch
    // (copied from the AQL packet's completion_signal field)
    Addr _completionSignal;
    // base address of the raw machine code
    Addr codeAddress;
    // base address of the kernel args
    Addr kernargAddress;
    /**
     * Number of outstanding invs for the kernel.
     * values:
     *  -1: initial value, invalidate has not started for the kernel
     *  0: 1)-1->0, about to start (a transient state, added in the same cycle)
     *     2)+1->0, all inv requests are finished, i.e., invalidate done
     *  ?: positive value, indicating the number of pending inv requests
     */
    int _outstandingInvs;
    /**
     * Number of outstanding wbs for the kernel
     * values:
     *  0: 1)initial value, flush has not started for the kernel
     *     2)+1->0: all wb requests are finished, i.e., flush done
     *  ?: positive value, indicating the number of pending wb requests
     */
    int _outstandingWbs;
    // LDS (group segment) bytes requested by the dispatch packet
    int _ldsSize;
    // private (scratch) segment bytes per work-item
    int _privMemPerItem;
    int _contextId;
    // ID of the next workgroup to dispatch, per dimension
    std::array<int, MAX_DIM> _wgId;
    // number of workgroups per dimension (grid size / WG size, rounded up)
    std::array<int, MAX_DIM> _numWg;
    int _numWgTotal;
    int numWgArrivedAtBarrier;
    // The number of completed work groups
    int _numWgCompleted;
    int _globalWgId;
    // set once markWgDispatch() has handed out the last WG in the grid
    bool dispatchComplete;

    // which initial-register-state fields the kernel code enables;
    // bit positions defined by VectorRegInitFields/ScalarRegInitFields
    std::bitset<NumVectorInitFields> initialVgprState;
    std::bitset<NumScalarInitFields> initialSgprState;
};
#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__

View File

@@ -0,0 +1,193 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
#ifndef __GPU_COMPUTE_KERNEL_CODE_HH__
#define __GPU_COMPUTE_KERNEL_CODE_HH__
#include <bitset>
#include <cstdint>
/**
* these enums represent the indices into the
* initialRegState bitfields in HsaKernelInfo.
* each bit specifies whether or not the
* particular piece of state that the bit
* corresponds to should be initialized into
* the VGPRs/SGPRs. the order in which the
* fields are placed matters, as all enabled
* pieces of state will be initialized into
* contiguous registers in the same order
* as their position in the bitfield - which
* is specified in the HSA ABI.
*/
/**
 * Bit positions of the scalar-register initialization fields. The
 * numeric order is significant: enabled fields are preloaded into
 * contiguous SGPRs in exactly this order, as specified by the HSA ABI.
 * Do not renumber.
 */
enum ScalarRegInitFields : int
{
    PrivateSegBuf = 0,
    DispatchPtr = 1,
    QueuePtr = 2,
    KernargSegPtr = 3,
    DispatchId = 4,
    FlatScratchInit = 5,
    PrivateSegSize = 6,
    GridWorkgroupCountX = 7,
    GridWorkgroupCountY = 8,
    GridWorkgroupCountZ = 9,
    WorkgroupIdX = 10,
    WorkgroupIdY = 11,
    WorkgroupIdZ = 12,
    WorkgroupInfo = 13,
    PrivSegWaveByteOffset = 14,
    NumScalarInitFields = 15 // count sentinel; sizes the SGPR bitset
};
/**
 * Bit positions of the vector-register initialization fields (work-item
 * IDs per dimension). Ordering is ABI-defined; do not renumber. The X
 * work-item ID is always enabled (see HSAQueueEntry::parseKernelCode).
 */
enum VectorRegInitFields : int
{
    WorkitemIdX = 0,
    WorkitemIdY = 1,
    WorkitemIdZ = 2,
    NumVectorInitFields = 3 // count sentinel; sizes the VGPR bitset
};
/**
 * In-memory mirror of the AMD kernel code object (amd_kernel_code_t)
 * that precedes a kernel's machine code in the code object. Field
 * order, widths, and bitfield packing define the binary layout this
 * struct is read from — do NOT reorder, resize, or insert members.
 * NOTE(review): assumes the host compiler lays out these uint32_t
 * bitfields exactly as the code-object format does — confirm on any
 * new toolchain/ABI.
 */
struct AMDKernelCode
{
    uint32_t amd_kernel_code_version_major;
    uint32_t amd_kernel_code_version_minor;
    uint16_t amd_machine_kind;
    uint16_t amd_machine_version_major;
    uint16_t amd_machine_version_minor;
    uint16_t amd_machine_version_stepping;
    // byte offset (can be negative) from this struct to the kernel entry
    int64_t kernel_code_entry_byte_offset;
    int64_t kernel_code_prefetch_byte_offset;
    uint64_t kernel_code_prefetch_byte_size;
    uint64_t max_scratch_backing_memory_byte_size;

    /**
     * The fields below are used to set program settings for
     * compute shaders. Here they are primarily used to setup
     * initial register state. See the following for full details
     * about kernel launch, state initialization, and the AMD kernel
     * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
     * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
     * #initial-kernel-register-state
     */

    // the 32b below here represent the fields of
    // the COMPUTE_PGM_RSRC1 register
    uint32_t granulated_workitem_vgpr_count : 6;
    uint32_t granulated_wavefront_sgpr_count : 4;
    uint32_t priority : 2;
    uint32_t float_mode_round_32 : 2;
    uint32_t float_mode_round_16_64 : 2;
    uint32_t float_mode_denorm_32 : 2;
    uint32_t float_mode_denorm_16_64 : 2;
    uint32_t priv : 1;
    uint32_t enable_dx10_clamp : 1;
    uint32_t debug_mode : 1;
    uint32_t enable_ieee_mode : 1;
    uint32_t bulky : 1;
    uint32_t cdbg_user : 1;
    uint32_t compute_pgm_rsrc1_reserved : 6;
    // end COMPUTE_PGM_RSRC1 register

    // the 32b below here represent the fields of
    // the COMPUTE_PGM_RSRC2 register
    uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
    uint32_t user_sgpr_count : 5;
    uint32_t enable_trap_handler : 1;
    uint32_t enable_sgpr_workgroup_id_x : 1;
    uint32_t enable_sgpr_workgroup_id_y : 1;
    uint32_t enable_sgpr_workgroup_id_z : 1;
    uint32_t enable_sgpr_workgroup_info : 1;
    uint32_t enable_vgpr_workitem_id_y : 1;
    uint32_t enable_vgpr_workitem_id_z : 1;
    uint32_t enable_exception_address_watch : 1;
    uint32_t enable_exception_memory_violation : 1;
    uint32_t granulated_lds_size : 9;
    uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
    uint32_t enable_exception_fp_denormal_source : 1;
    uint32_t enable_exception_ieee_754_fp_division_by_zero : 1;
    uint32_t enable_exception_ieee_754_fp_overflow : 1;
    uint32_t enable_exception_ieee_754_fp_underflow : 1;
    uint32_t enable_exception_ieee_754_fp_inexact : 1;
    uint32_t enable_exception_int_divide_by_zero : 1;
    uint32_t compute_pgm_rsrc2_reserved : 1;
    // end COMPUTE_PGM_RSRC2

    // the 32b below here represent the fields of
    // KERNEL_CODE_PROPERTIES; the enable_sgpr_* bits are consumed by
    // HSAQueueEntry::parseKernelCode to build the initial SGPR state
    uint32_t enable_sgpr_private_segment_buffer : 1;
    uint32_t enable_sgpr_dispatch_ptr : 1;
    uint32_t enable_sgpr_queue_ptr : 1;
    uint32_t enable_sgpr_kernarg_segment_ptr : 1;
    uint32_t enable_sgpr_dispatch_id : 1;
    uint32_t enable_sgpr_flat_scratch_init : 1;
    uint32_t enable_sgpr_private_segment_size : 1;
    uint32_t enable_sgpr_grid_workgroup_count_x : 1;
    uint32_t enable_sgpr_grid_workgroup_count_y : 1;
    uint32_t enable_sgpr_grid_workgroup_count_z : 1;
    uint32_t kernel_code_properties_reserved1 : 6;
    uint32_t enable_ordered_append_gds : 1;
    uint32_t private_element_size : 2;
    uint32_t is_ptr64 : 1;
    uint32_t is_dynamic_callstack : 1;
    uint32_t is_debug_enabled : 1;
    uint32_t is_xnack_enabled : 1;
    uint32_t kernel_code_properties_reserved2 : 9;
    // end KERNEL_CODE_PROPERTIES

    uint32_t workitem_private_segment_byte_size;
    uint32_t workgroup_group_segment_byte_size;
    uint32_t gds_segment_byte_size;
    uint64_t kernarg_segment_byte_size;
    uint32_t workgroup_fbarrier_count;
    // register counts read by HSAQueueEntry (numSgprs/numVgprs)
    uint16_t wavefront_sgpr_count;
    uint16_t workitem_vgpr_count;
    uint16_t reserved_vgpr_first;
    uint16_t reserved_vgpr_count;
    uint16_t reserved_sgpr_first;
    uint16_t reserved_sgpr_count;
    uint16_t debug_wavefront_private_segment_offset_sgpr;
    uint16_t debug_private_segment_buffer_sgpr;
    uint8_t kernarg_segment_alignment;
    uint8_t group_segment_alignment;
    uint8_t private_segment_alignment;
    uint8_t wavefront_size;
    int32_t call_convention;
    uint8_t reserved[12];
    uint64_t runtime_loader_kernel_symbol;
    uint64_t control_directives[16];
};
#endif // __GPU_COMPUTE_KERNEL_CODE_HH__

View File

@@ -210,8 +210,8 @@ LdsState::processPacket(PacketPtr packet)
parent->loadBusLength();
// delay for accessing the LDS
Tick processingTime =
parent->shader->ticks(bankConflicts * bankConflictPenalty) +
parent->shader->ticks(busLength);
parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
parent->cyclesToTicks(Cycles(busLength));
// choose (delay + last packet in queue) or (now + delay) as the time to
// return this
Tick doneAt = earliestReturnTime() + processingTime;

View File

@@ -41,7 +41,6 @@
#include <utility>
#include <vector>
#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
@@ -50,8 +49,8 @@
class ComputeUnit;
/**
* this represents a slice of the overall LDS, intended to be associated with an
* individual workgroup
* this represents a slice of the overall LDS, intended to be associated with
* an individual workgroup
*/
class LdsChunk
{
@@ -71,7 +70,8 @@ class LdsChunk
read(const uint32_t index)
{
fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
"chunk");
T *p0 = (T *) (&(chunk.at(index)));
return *p0;
}
@@ -84,7 +84,8 @@ class LdsChunk
write(const uint32_t index, const T value)
{
fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
"chunk");
T *p0 = (T *) (&(chunk.at(index)));
*p0 = value;
}
@@ -203,14 +204,16 @@ class LdsState: public ClockedObject
protected:
// the lds reference counter
// The key is the workgroup ID and dispatch ID
// The value is the number of wavefronts that reference this LDS, as
// wavefronts are launched, the counter goes up for that workgroup and when
// they return it decreases, once it reaches 0 then this chunk of the LDS is
// returned to the available pool. However,it is deallocated on the 1->0
// transition, not whenever the counter is 0 as it always starts with 0 when
// the workgroup asks for space
/**
* the lds reference counter
* The key is the workgroup ID and dispatch ID
* The value is the number of wavefronts that reference this LDS, as
* wavefronts are launched, the counter goes up for that workgroup and when
* they return it decreases, once it reaches 0 then this chunk of the LDS
* is returned to the available pool. However,it is deallocated on the 1->0
* transition, not whenever the counter is 0 as it always starts with 0
* when the workgroup asks for space
*/
std::unordered_map<uint32_t,
std::unordered_map<uint32_t, int32_t>> refCounter;
@@ -356,22 +359,41 @@ class LdsState: public ClockedObject
const uint32_t size)
{
if (chunkMap.find(dispatchId) != chunkMap.end()) {
fatal_if(
panic_if(
chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
"duplicate workgroup ID asking for space in the LDS "
"did[%d] wgid[%d]", dispatchId, wgId);
}
fatal_if(bytesAllocated + size > maximumSize,
"request would ask for more space than is available");
if (bytesAllocated + size > maximumSize) {
return nullptr;
} else {
bytesAllocated += size;
bytesAllocated += size;
auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
panic_if(!value.second, "was unable to allocate a new chunkMap");
chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
// make an entry for this workgroup
refCounter[dispatchId][wgId] = 0;
// make an entry for this workgroup
refCounter[dispatchId][wgId] = 0;
return &chunkMap[dispatchId][wgId];
return &chunkMap[dispatchId][wgId];
}
}
/*
* return pointer to lds chunk for wgid
*/
LdsChunk *
getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
{
fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
"fetch for unknown dispatch ID did[%d]", dispatchId);
fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
"fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
wgId, dispatchId);
return &chunkMap[dispatchId][wgId];
}
bool

View File

@@ -33,6 +33,7 @@
#include "gpu-compute/local_memory_pipeline.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
@@ -62,24 +63,31 @@ LocalMemPipeline::exec()
bool accessVrf = true;
Wavefront *w = nullptr;
if ((m) && (m->isLoad() || m->isAtomicRet())) {
if ((m) && m->latency.rdy() && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
accessVrf =
w->computeUnit->vrf[w->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m,
VrfAccessType::WRITE);
accessVrf = w->computeUnit->vrf[w->simdId]->
canScheduleWriteOperandsFromLoad(w, m);
}
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
|| computeUnit->wfWait.at(m->pipeId).rdy())) {
computeUnit->locMemToVrfBus.rdy()
&& (computeUnit->shader->coissue_return
|| computeUnit->vectorSharedMemUnit.rdy())) {
lmReturnedRequests.pop();
w = m->wavefront();
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
if (m->isLoad() || m->isAtomicRet()) {
w->computeUnit->vrf[w->simdId]->
scheduleWriteOperandsFromLoad(w, m);
}
// Decrement outstanding request count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@@ -96,7 +104,7 @@ LocalMemPipeline::exec()
// Mark write bus busy for appropriate amount of time
computeUnit->locMemToVrfBus.set(m->time);
if (computeUnit->shader->coissue_return == 0)
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
w->computeUnit->vectorSharedMemUnit.set(m->time);
}
// If pipeline has executed a local memory instruction
@@ -114,6 +122,13 @@ LocalMemPipeline::exec()
}
}
void
LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
gpuDynInst->setAccessTime(curTick());
lmIssuedRequests.push(gpuDynInst);
}
void
LocalMemPipeline::regStats()
{

View File

@@ -58,10 +58,11 @@ class LocalMemPipeline
LocalMemPipeline(const ComputeUnitParams *params);
void init(ComputeUnit *cu);
void exec();
std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
void issueRequest(GPUDynInstPtr gpuDynInst);
bool
isLMRespFIFOWrRdy() const
{

View File

@@ -39,34 +39,62 @@
#include <memory>
#include "base/logging.hh"
#include "sim/clocked_object.hh"
class GPUDynInst;
typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
typedef std::bitset<std::numeric_limits<unsigned long long>::digits>
VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
enum InstMemoryHop : int {
Initiate = 0,
CoalsrSend = 1,
CoalsrRecv = 2,
GMEnqueue = 3,
Complete = 4,
InstMemoryHopMax = 5
};
enum BlockMemoryHop : int {
BlockSend = 0,
BlockRecv = 1
};
class WaitClass
{
public:
WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
void init(uint64_t *_tcnt, uint32_t _numStages=0)
WaitClass() : nxtAvail(0), lookAheadAvail(0), clockedObject(nullptr) { }
WaitClass(ClockedObject *_clockedObject, uint64_t _numStages=0)
: nxtAvail(0), lookAheadAvail(0), clockedObject(_clockedObject),
numStages(_numStages) { }
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
{
tcnt = _tcnt;
clockedObject = _clockedObject;
numStages = _numStages;
}
void set(uint32_t i)
void set(uint64_t i)
{
fatal_if(nxtAvail > *tcnt,
fatal_if(nxtAvail > clockedObject->clockEdge(),
"Can't allocate resource because it is busy!!!");
nxtAvail = *tcnt + i;
nxtAvail = clockedObject->clockEdge() + i;
}
void preset(uint32_t delay)
void preset(uint64_t delay)
{
lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
lookAheadAvail = std::max(lookAheadAvail, delay +
(clockedObject->clockEdge()) - numStages);
}
bool rdy(Cycles cycles = Cycles(0)) const
{
return clockedObject->clockEdge(cycles) >= nxtAvail;
}
bool prerdy() const
{
return clockedObject->clockEdge() >= lookAheadAvail;
}
bool rdy() const { return *tcnt >= nxtAvail; }
bool prerdy() const { return *tcnt >= lookAheadAvail; }
private:
// timestamp indicating when resource will be available
@@ -75,11 +103,11 @@ class WaitClass
// pending uses of the resource (when there is a cycle gap between
// rdy() and set()
uint64_t lookAheadAvail;
// current timestamp
uint64_t *tcnt;
// clockedObject for current timestamp
ClockedObject *clockedObject;
// number of stages between checking if a resource is ready and
// setting the resource's utilization
uint32_t numStages;
uint64_t numStages;
};
class Float16
@@ -93,7 +121,7 @@ class Float16
Float16(float x)
{
uint32_t ai = *(uint32_t *)&x;
uint32_t ai = *(reinterpret_cast<uint32_t *>(&x));
uint32_t s = (ai >> 31) & 0x1;
uint32_t exp = (ai >> 23) & 0xff;
@@ -139,7 +167,7 @@ class Float16
val1 |= (exp << 23);
val1 |= (mant << 13);
return *(float*)&val1;
return *(reinterpret_cast<float *>(&val1));
}
};

View File

@@ -33,8 +33,8 @@
#include "gpu-compute/pool_manager.hh"
PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
: _minAllocation(minAlloc), _poolSize(poolSize)
PoolManager::PoolManager(const PoolManagerParams *p)
: SimObject(p), _minAllocation(p->min_alloc), _poolSize(p->pool_size)
{
assert(poolSize > 0);
assert(_poolSize > 0);
}

View File

@@ -38,11 +38,15 @@
#include <cstdint>
#include <string>
#include "params/PoolManager.hh"
#include "sim/sim_object.hh"
// Pool Manager Logic
class PoolManager
class PoolManager : public SimObject
{
public:
PoolManager(uint32_t minAlloc, uint32_t poolSize);
PoolManager(const PoolManagerParams *p);
virtual ~PoolManager() { _poolSize = 0; }
uint32_t minAllocation() { return _minAllocation; }
virtual std::string printRegion() = 0;
virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;

View File

@@ -0,0 +1,223 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#include "gpu-compute/register_file.hh"
#include <sstream>
#include <string>
#include "base/intmath.hh"
#include "base/logging.hh"
#include "debug/GPURF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterFile.hh"
/**
 * Construct a register file for one SIMD unit. The configured register
 * count must be even and the SIMD id non-negative; both are checked
 * fatally at construction time. All registers start out free.
 */
RegisterFile::RegisterFile(const RegisterFileParams *p)
    : SimObject(p), simdId(p->simd_id), _numRegs(p->num_regs)
{
    fatal_if(simdId < 0, "Illegal SIMD id for VRF");
    fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");

    // one busy flag per physical register, all initially clear
    busy.assign(_numRegs, 0);
}
// Nothing to release: all state is owned by value members.
RegisterFile::~RegisterFile()
{
}
// Attach this register file to the compute unit that owns it; done
// after construction because the CU is built separately from its RFs.
void
RegisterFile::setParent(ComputeUnit *_computeUnit)
{
    computeUnit = _computeUnit;
}
/**
 * Render the scoreboard as a debug string: "Busy: " followed by one
 * 0/1 digit per physical register, terminated by a newline.
 */
std::string
RegisterFile::dump() const
{
    std::stringstream out;
    out << "Busy: ";
    for (const auto &reg_state : busy) {
        out << (int)reg_state;
    }
    out << "\n";
    return out.str();
}
// Scoreboard functions

// Default scoreboard check: the base register file models no operand
// hazards, so operands are always reported ready.
bool
RegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
    return true;
}
// Whether physical register idx is currently marked busy in the
// scoreboard. at() bounds-checks the index.
bool
RegisterFile::regBusy(int idx) const
{
    return busy.at(idx);
}
// Set or clear the busy bit for physical register regIdx.
void
RegisterFile::markReg(int regIdx, bool value)
{
    DPRINTF(GPURF, "SIMD[%d] markReg(): physReg[%d] = %d\n",
            simdId, regIdx, (int)value);
    busy.at(regIdx) = value;
}
// Schedule an event `delay` ticks from now that clears the busy bit
// for physReg regIdx.
// NOTE(review): the event is heap-allocated with new and never stored;
// presumably it is constructed with an auto-delete flag so it frees
// itself after process() — confirm in the event's constructor.
void
RegisterFile::enqRegFreeEvent(uint32_t regIdx, uint64_t delay)
{
    DPRINTF(GPURF, "SIMD[%d] enqRegFreeEvent physReg[%d] at %llu\n",
            simdId, regIdx, curTick() + delay);
    schedule(new MarkRegFreeScbEvent(this, regIdx),
             curTick() + delay);
}
// Schedule an event `delay` ticks from now that sets the busy bit for
// physReg regIdx. Same allocation pattern as enqRegFreeEvent (see the
// auto-delete note there).
void
RegisterFile::enqRegBusyEvent(uint32_t regIdx, uint64_t delay)
{
    DPRINTF(GPURF, "SIMD[%d] enqRegBusyEvent physReg[%d] at %llu\n",
            simdId, regIdx, curTick() + delay);
    schedule(new MarkRegBusyScbEvent(this, regIdx),
             curTick() + delay);
}
// Schedule functions
//
// Base-class defaults for the SCH-stage hooks: always ready, and the
// schedule*/read-complete operations are no-ops. Derived register files
// override these to model operand read/write timing.
bool
RegisterFile::canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
void
RegisterFile::scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
{
}
bool
RegisterFile::canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
void
RegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
}
// Called by returning loads; default accepts the incoming writes.
bool
RegisterFile::canScheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
void
RegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
{
}
// Default: treat all operand reads as instantly complete.
bool
RegisterFile::operandReadComplete(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
// Exec functions

// Invoked every cycle by the compute unit; the base class models no
// detailed RF timing, so this is a no-op.
void
RegisterFile::exec()
{
}
// Notification that a wave is executing an instruction; base class has
// no writeback events to schedule.
void
RegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
}
// Params factory hook: instantiate the base RegisterFile SimObject.
RegisterFile*
RegisterFileParams::create()
{
    return new RegisterFile(this);
}
// Events

// Mark a register as free in the scoreboard/busy vector
void
RegisterFile::MarkRegFreeScbEvent::process()
{
    rf->markReg(regIdx, false);
}
// Mark a register as busy in the scoreboard/busy vector
void
RegisterFile::MarkRegBusyScbEvent::process()
{
    rf->markReg(regIdx, true);
}
// Debug hook invoked when an instruction is dispatched; base class does
// nothing.
void
RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
{
}
// Register the per-RF statistics with the stats framework. Reads/writes
// are counted in DWORDs; the SRAM stats count bank activations, which
// can differ from DWORD counts when the RF is banked.
void
RegisterFile::regStats()
{
    registerReads
        .name(name() + ".register_reads")
        .desc("Total number of DWORDs read from register file")
        ;
    registerWrites
        .name(name() + ".register_writes")
        .desc("Total number of DWORDS written to register file")
        ;
    sramReads
        .name(name() + ".sram_reads")
        .desc("Total number of register file bank SRAM activations for reads")
        ;
    sramWrites
        .name(name() + ".sram_writes")
        .desc("Total number of register file bank SRAM activations for writes")
        ;
}

View File

@@ -0,0 +1,171 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#ifndef __REGISTER_FILE_HH__
#define __REGISTER_FILE_HH__
#include <limits>
#include <vector>
#include "base/statistics.hh"
#include "base/types.hh"
#include "gpu-compute/misc.hh"
#include "sim/sim_object.hh"
class ComputeUnit;
class Shader;
class PoolManager;
class Wavefront;
struct RegisterFileParams;
// Abstract Register File
// This register file class can be inherited from to create both
// scalar and vector register files.
class RegisterFile : public SimObject
{
  public:
    RegisterFile(const RegisterFileParams *p);
    virtual ~RegisterFile();
    // late-bind the owning compute unit (set after construction)
    virtual void setParent(ComputeUnit *_computeUnit);
    // number of physical registers in this file
    int numRegs() const { return _numRegs; }
    virtual void regStats() override;
    // State functions
    // Scoreboard functions
    virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
    virtual bool regBusy(int idx) const;
    virtual void markReg(int regIdx, bool value);
    // Abstract Register Event: base for events that flip a single
    // register's scoreboard bit. AutoDelete, so process() is the last
    // thing that runs on an instance.
    class RegisterEvent : public Event
    {
      protected:
        RegisterFile *rf;
        int regIdx;
      public:
        RegisterEvent(RegisterFile *_rf, int _regIdx)
            : rf(_rf), regIdx(_regIdx) { setFlags(AutoDelete); }
    };
    // Register Event to mark a register as free in the scoreboard/busy vector
    class MarkRegFreeScbEvent : public RegisterEvent
    {
      public:
        MarkRegFreeScbEvent(RegisterFile *_rf, int _regIdx)
            : RegisterEvent(_rf, _regIdx) { }
        void process();
    };
    // Register Event to mark a register as busy in the scoreboard/busy vector
    class MarkRegBusyScbEvent : public RegisterEvent
    {
      public:
        MarkRegBusyScbEvent(RegisterFile *_rf, int _regIdx)
            : RegisterEvent(_rf, _regIdx) { }
        void process();
    };
    // Schedule an event to mark a register as free/busy in
    // the scoreboard/busy vector. Delay is already in Ticks
    virtual void enqRegFreeEvent(uint32_t regIdx, uint64_t delay);
    virtual void enqRegBusyEvent(uint32_t regIdx, uint64_t delay);
    // Schedule functions
    // The following functions are called by the SCH stage when attempting
    // to move a wave from the readyList to the schList.
    // canSchedule* checks if the RF is ready to provide operands for
    // the instruction, while schedule* requests the RF to begin reading
    // and writing of operands. Calling schedule* may only occur
    // immediately after canSchedule* was called and returned True
    virtual bool canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual bool canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual void scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual void scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
    // The following function is called to check if all operands
    // have been read for the given instruction
    virtual bool operandReadComplete(Wavefront *w, GPUDynInstPtr ii);
    // The following two functions are only called by returning loads to
    // check if the register file can support the incoming writes
    virtual bool canScheduleWriteOperandsFromLoad(Wavefront *w,
                                                  GPUDynInstPtr ii);
    // Queue the register writes. Assumes canScheduleWriteOperandsFromLoad
    // was called immediately prior and returned True
    virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
                                               GPUDynInstPtr ii);
    // ExecRF is invoked every cycle by the compute unit and may be
    // used to model detailed timing of the register file.
    virtual void exec();
    // Called to inform RF that an instruction is executing
    // to schedule events for writeback, etc., as needed
    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
    // Debug functions
    virtual std::string dump() const;
    virtual void dispatchInstruction(GPUDynInstPtr ii);
  protected:
    // owning compute unit; valid only after setParent() has been called
    ComputeUnit* computeUnit;
    // id of the SIMD unit this register file belongs to
    int simdId;
    // flag indicating if a register is busy
    std::vector<bool> busy;
    // number of registers in this register file
    int _numRegs;
    // Stats
    // Total number of register reads, incremented once per DWORD per thread
    Stats::Scalar registerReads;
    // Total number of register writes, incremented once per DWORD per thread
    Stats::Scalar registerWrites;
    // Number of register file SRAM activations for reads.
    // The register file may be implemented with multiple SRAMs. This stat
    // tracks how many times the SRAMs are accessed for reads.
    Stats::Scalar sramReads;
    // Number of register file SRAM activations for writes
    Stats::Scalar sramWrites;
};
#endif // __REGISTER_FILE_HH__

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Mark Wyse
*/
#include "gpu-compute/register_manager.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPURename.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/static_register_manager_policy.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterManager.hh"
// Construct the register manager with its per-SIMD SRF/VRF pool managers
// and instantiate the requested mapping/allocation policy. Only the
// "static" policy is implemented; any other setting is a fatal config
// error.
RegisterManager::RegisterManager(const RegisterManagerParams *p)
    : SimObject(p), srfPoolMgrs(p->srf_pool_managers),
      vrfPoolMgrs(p->vrf_pool_managers)
{
    if (p->policy == "static") {
        policy = new StaticRegisterManagerPolicy();
    } else {
        fatal("Unimplemented Register Manager Policy");
    }
}
// Destroy the pool managers owned by this register manager.
// NOTE(review): `policy` is allocated in the constructor but not deleted
// here — presumably freed elsewhere or leaked at simulator teardown;
// confirm before relying on it.
RegisterManager::~RegisterManager()
{
    for (auto mgr : srfPoolMgrs) {
        delete mgr;
    }
    for (auto mgr : vrfPoolMgrs) {
        delete mgr;
    }
}
// Per-cycle tick: delegate to the active policy.
void
RegisterManager::exec()
{
    policy->exec();
}
// Late-bind the owning compute unit and propagate it to the policy.
// Also sanity-checks that each register file's size is a multiple of
// its pool manager's minimum allocation granularity, so allocation can
// never leave an unusable partial chunk.
void
RegisterManager::setParent(ComputeUnit *cu)
{
    computeUnit = cu;
    policy->setParent(computeUnit);
    for (int i = 0; i < srfPoolMgrs.size(); i++) {
        // Fixed error text: this check is against the SRF, not the VRF.
        fatal_if(computeUnit->srf[i]->numRegs() %
                 srfPoolMgrs[i]->minAllocation(),
                 "Min SGPR allocation is not multiple of SRF size\n");
    }
    for (int i = 0; i < vrfPoolMgrs.size(); i++) {
        // Fixed typo in error text: "VGPG" -> "VGPR".
        fatal_if(computeUnit->vrf[i]->numRegs() %
                 vrfPoolMgrs[i]->minAllocation(),
                 "Min VGPR allocation is not multiple of VRF size\n");
    }
}
// compute mapping for vector register: thin delegate to the policy,
// which owns the virtual-to-physical translation scheme.
int
RegisterManager::mapVgpr(Wavefront* w, int vgprIndex)
{
    return policy->mapVgpr(w, vgprIndex);
}
// compute mapping for scalar register
int
RegisterManager::mapSgpr(Wavefront* w, int sgprIndex)
{
    return policy->mapSgpr(w, sgprIndex);
}
// check if we can allocate registers: nWfs waves each demanding
// demandPerWf registers on the given SIMD unit.
bool
RegisterManager::canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
{
    return policy->canAllocateVgprs(simdId, nWfs, demandPerWf);
}
bool
RegisterManager::canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
{
    return policy->canAllocateSgprs(simdId, nWfs, demandPerWf);
}
// allocate registers: reserve vectorDemand VGPRs and scalarDemand SGPRs
// for wave w (delegated to the policy).
void
RegisterManager::allocateRegisters(Wavefront *w, int vectorDemand,
                                   int scalarDemand)
{
    policy->allocateRegisters(w, vectorDemand, scalarDemand);
}
// release every register still held by the wave (delegated to the policy)
void
RegisterManager::freeRegisters(Wavefront* w)
{
    policy->freeRegisters(w);
}
// register the policy's statistics with the stats framework
void
RegisterManager::regStats()
{
    policy->regStats();
}
// Params factory hook for the RegisterManager SimObject.
RegisterManager*
RegisterManagerParams::create()
{
    return new RegisterManager(this);
}

View File

@@ -0,0 +1,94 @@
/*
* Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#ifndef __REGISTER_MANAGER_HH__
#define __REGISTER_MANAGER_HH__
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "gpu-compute/pool_manager.hh"
#include "gpu-compute/register_manager_policy.hh"
#include "sim/sim_object.hh"
#include "sim/stats.hh"
class ComputeUnit;
class Wavefront;
struct RegisterManagerParams;
/*
* Rename stage.
*/
class RegisterManager : public SimObject
{
  public:
    RegisterManager(const RegisterManagerParams* params);
    ~RegisterManager();
    // late-bind the owning compute unit and validate RF/pool sizing
    void setParent(ComputeUnit *cu);
    // per-cycle tick, delegated to the policy
    void exec();
    // Stats related variables and methods
    void regStats();
    // lookup virtual to physical register translation
    int mapVgpr(Wavefront* w, int vgprIndex);
    int mapSgpr(Wavefront* w, int sgprIndex);
    // check if we can allocate registers
    bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf);
    bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf);
    // allocate registers
    void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand);
    // free all registers used by the WF
    void freeRegisters(Wavefront *w);
    // per-SIMD pool managers for the scalar and vector register files;
    // owned (deleted) by this object
    std::vector<PoolManager*> srfPoolMgrs;
    std::vector<PoolManager*> vrfPoolMgrs;
  private:
    // active mapping/allocation policy (selected by the "policy" param)
    RegisterManagerPolicy *policy;
    ComputeUnit *computeUnit;
    // NOTE(review): _name appears unused in the visible code — confirm
    std::string _name;
};
#endif // __REGISTER_MANAGER_HH__

View File

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#ifndef __REGISTER_MANAGER_POLICY_HH__
#define __REGISTER_MANAGER_POLICY_HH__
#include <cstdint>
class ComputeUnit;
class HSAQueueEntry;
class Wavefront;
/**
* Register Manager Policy abstract class
*
* A Register Manager Policy implements all of the functionality
* of the Register Manager, including register mapping, allocation,
* and freeing. Different policies may be implemented that support
* different architectures or different methods of mapping and
* allocation.
*/
class RegisterManagerPolicy
{
  public:
    // bind the compute unit this policy manages registers for
    virtual void setParent(ComputeUnit *_cu) { cu = _cu; }
    // Execute: called by RenameStage::execute()
    virtual void exec() = 0;
    // provide virtual to physical register mapping
    virtual int mapVgpr(Wavefront* w, int vgprIndex) = 0;
    virtual int mapSgpr(Wavefront* w, int sgprIndex) = 0;
    // check if requested number of vector registers can be allocated
    virtual bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) = 0;
    // check if requested number of scalar registers can be allocated
    // machine ISA only
    virtual bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) = 0;
    // allocate vector registers and reserve from register pool
    virtual void allocateRegisters(Wavefront *w, int vectorDemand,
                                   int scalarDemand) = 0;
    // free all remaining registers held by specified WF
    virtual void freeRegisters(Wavefront *w) = 0;
    // stats
    virtual void regStats() = 0;
  protected:
    // compute unit this policy serves; set via setParent()
    ComputeUnit *cu;
};
#endif // __REGISTER_MANAGER_POLICY_HH__

View File

@@ -36,6 +36,7 @@
#include <vector>
#include "base/logging.hh"
#include "gpu-compute/scheduling_policy.hh"
#include "gpu-compute/wavefront.hh"

View File

@@ -0,0 +1,153 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos
*/
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
// Construct the scalar memory pipeline. queueSize bounds both the
// issued-request FIFO and the number of inflight loads/stores. The
// compute unit pointer is bound later via init().
ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), queueSize(p->scalar_mem_queue_size),
    inflightStores(0), inflightLoads(0)
{
}
// Late initialization: bind the owning compute unit and derive this
// pipeline's name from it (used for debug/stats output).
void
ScalarMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".ScalarMemPipeline";
}
// Per-cycle tick of the scalar memory pipeline. First retires (at most)
// one returned load/store whose data has arrived, then issues (at most)
// one new request toward the DTLB/scalar cache, subject to the inflight
// limits.
void
ScalarMemPipeline::exec()
{
    // find oldest scalar request whose data has arrived; loads are
    // preferred over stores. m is nullptr when both return FIFOs are
    // empty.
    GPUDynInstPtr m = !returnedLoads.empty() ? returnedLoads.front() :
        !returnedStores.empty() ? returnedStores.front() : nullptr;
    Wavefront *w = nullptr;
    bool accessSrf = true;
    // check the SRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();
        accessSrf =
            w->computeUnit->srf[w->simdId]->
                canScheduleWriteOperandsFromLoad(w, m);
    }
    // The queue-empty test guards the m-> dereferences below: if either
    // FIFO is non-empty, m is its front and therefore non-null.
    if ((!returnedStores.empty() || !returnedLoads.empty()) &&
        m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
        accessSrf &&
        (computeUnit->shader->coissue_return ||
         computeUnit->scalarMemUnit.rdy())) {
        w = m->wavefront();
        // queue the SRF writeback for loads and returning atomics
        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->srf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }
        m->completeAcc(m);
        // atomics are tracked in the load FIFO/counter
        if (m->isLoad() || m->isAtomic()) {
            returnedLoads.pop();
            assert(inflightLoads > 0);
            --inflightLoads;
        } else {
            returnedStores.pop();
            assert(inflightStores > 0);
            --inflightStores;
        }
        // Decrement outstanding register count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic()) {
            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
                                             m->time, -1);
        }
        if (m->isLoad() || m->isAtomic()) {
            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
                                             m->time, -1);
        }
        // Mark write bus busy for appropriate amount of time
        computeUnit->scalarMemToSrfBus.set(m->time);
        if (!computeUnit->shader->coissue_return)
            w->computeUnit->scalarMemUnit.set(m->time);
    }
    // If pipeline has executed a global memory instruction
    // execute global memory packets and issue global
    // memory packets to DTLB
    if (!issuedRequests.empty()) {
        GPUDynInstPtr mp = issuedRequests.front();
        // respect the inflight cap; leave the request queued and retry
        // next cycle if the corresponding counter is saturated
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= queueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else {
            if (inflightStores >= queueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }
        mp->initiateAcc(mp);
        issuedRequests.pop();
        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
    }
}
// No statistics are defined for this pipeline yet.
void
ScalarMemPipeline::regStats()
{
}

View File

@@ -0,0 +1,114 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos
*/
#ifndef __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
#define __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
#include <queue>
#include <string>
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
/*
* @file scalar_memory_pipeline.hh
*
* The scalar memory pipeline issues global memory packets
* from the scalar ALU to the DTLB and L1 Scalar Data Cache.
* The exec() method of the memory packet issues
* the packet to the DTLB if there is space available in the return fifo.
* This exec() method also retires previously issued loads and stores that have
* returned from the memory sub-system.
*/
class ComputeUnit;
class ScalarMemPipeline
{
  public:
    ScalarMemPipeline(const ComputeUnitParams *params);
    // bind the owning compute unit; must be called before exec()
    void init(ComputeUnit *cu);
    // per-cycle tick: retire returned requests, issue new ones
    void exec();
    // accessors for the request/response FIFOs (used by the CU and
    // cache-response path)
    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
    std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return returnedStores; }
    std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return returnedLoads; }
    // true if the load-response FIFO can accept another entry
    bool
    isGMLdRespFIFOWrRdy() const
    {
        return returnedLoads.size() < queueSize;
    }
    // true if the store-response FIFO can accept another entry
    bool
    isGMStRespFIFOWrRdy() const
    {
        return returnedStores.size() < queueSize;
    }
    // true if the request FIFO can accept pendReqs more entries
    bool
    isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
    {
        return (issuedRequests.size() + pendReqs) < queueSize;
    }
    const std::string &name() const { return _name; }
    void regStats();
  private:
    ComputeUnit *computeUnit;
    // derived from the CU's name in init()
    std::string _name;
    // capacity shared by all three FIFOs and the inflight counters
    int queueSize;
    // Counters to track and limit the inflight scalar loads and stores
    // generated by this memory pipeline.
    int inflightStores;
    int inflightLoads;
    // Scalar Memory Request FIFO: all global memory scalar requests
    // are issued to this FIFO from the scalar memory pipelines
    std::queue<GPUDynInstPtr> issuedRequests;
    // Scalar Store Response FIFO: all responses of global memory
    // scalar stores are sent to this FIFO from L1 Scalar Data Cache
    std::queue<GPUDynInstPtr> returnedStores;
    // Scalar Load Response FIFO: all responses of global memory
    // scalar loads are sent to this FIFO from L1 Scalar Data Cache
    std::queue<GPUDynInstPtr> returnedLoads;
};
#endif // __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__

View File

@@ -0,0 +1,164 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#include "gpu-compute/scalar_register_file.hh"
#include "base/logging.hh"
#include "debug/GPUSRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ScalarRegisterFile.hh"
// Construct the scalar register file; backing storage is one 32-bit
// DWORD per physical register, zero-initialized.
ScalarRegisterFile::ScalarRegisterFile(const ScalarRegisterFileParams *p)
    : RegisterFile(p)
{
    regFile.resize(numRegs(), 0);
}
// Check the scoreboard for every scalar source operand of ii. Operands
// wider than one DWORD occupy getOperandSize(i)/4 consecutive SGPRs,
// each of which must be free. Returns false (and bumps the wave's
// blocked-dependency counter) on the first busy register.
bool
ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
            int sgprIdx = ii->getRegisterIndex(i, ii);
            // number of consecutive DWORD registers the operand spans
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int j = 0; j < nRegs; ++j) {
                int pSgpr =
                    computeUnit->registerManager->mapSgpr(w, sgprIdx + j);
                if (regBusy(pSgpr)) {
                    // NOTE(review): the outer condition requires a src
                    // operand, so the dst branch fires only for operands
                    // flagged as both src and dst (read-modify-write) —
                    // confirm against GPUDynInst operand semantics.
                    if (ii->isDstOperand(i)) {
                        w->numTimesBlockedDueWAXDependencies++;
                    } else if (ii->isSrcOperand(i)) {
                        DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                                w->wfDynId, ii->disassemble(), pSgpr);
                        w->numTimesBlockedDueRAWDependencies++;
                    }
                    return false;
                }
            } // nRegs
        } // isScalar
    } // operand
    return true;
}
// Mark every physical SGPR written by ii as busy in the scoreboard.
// The busy bits are cleared later, either by waveExecuteInst() (ALU
// ops) or scheduleWriteOperandsFromLoad() (memory returns).
void
ScalarRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
    // iterate over all register destination operands
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
            int sgprIdx = ii->getRegisterIndex(i, ii);
            // number of consecutive DWORD registers the operand spans
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int j = 0; j < nRegs; ++j) {
                int physReg =
                    computeUnit->registerManager->mapSgpr(w, sgprIdx + j);
                // mark the destination scalar register as busy
                markReg(physReg, true);
            }
        }
    }
}
// Account for register traffic of an executing instruction and, for
// non-memory ops, schedule the events that free the destination SGPRs
// after the scalar pipeline latency. Memory ops (loads, atomics, mem
// syncs) are excluded here: their destinations are freed when the
// memory response returns (scheduleWriteOperandsFromLoad).
void
ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
    // count DWORDs read for every scalar source operand
    for (int i = 0; i < ii->getNumOperands(); i++) {
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
            int DWORDs = ii->getOperandSize(i) <= 4 ? 1
                : ii->getOperandSize(i) / 4;
            registerReads += DWORDs;
        }
    }
    if (!ii->isLoad() && !(ii->isAtomic() || ii->isMemSync())) {
        // destinations become available one scalar-pipe latency from now
        Cycles delay(computeUnit->scalarPipeLength());
        Tick tickDelay = computeUnit->cyclesToTicks(delay);
        for (int i = 0; i < ii->getNumOperands(); i++) {
            if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
                int sgprIdx = ii->getRegisterIndex(i, ii);
                int nRegs = ii->getOperandSize(i) <= 4 ? 1
                    : ii->getOperandSize(i) / 4;
                for (int j = 0; j < nRegs; j++) {
                    int physReg = computeUnit->registerManager->
                        mapSgpr(w, sgprIdx + j);
                    enqRegFreeEvent(physReg, tickDelay);
                }
                registerWrites += nRegs;
            }
        }
    }
}
// Writeback path for returning loads/atomics-with-return: schedule the
// destination SGPRs to be freed one clock period from now and count
// the written DWORDs.
void
ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
                                                  GPUDynInstPtr ii)
{
    // only memory ops that produce a register result come through here
    assert(ii->isLoad() || ii->isAtomicRet());
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
            int sgprIdx = ii->getRegisterIndex(i, ii);
            // number of consecutive DWORD registers the operand spans
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int j = 0; j < nRegs; ++j) {
                int physReg = computeUnit->registerManager->
                    mapSgpr(w, sgprIdx + j);
                enqRegFreeEvent(physReg, computeUnit->clockPeriod());
            }
            registerWrites += nRegs;
        }
    }
}
// Params factory hook: instantiate the scalar register file SimObject.
ScalarRegisterFile*
ScalarRegisterFileParams::create()
{
    return new ScalarRegisterFile(this);
}

View File

@@ -0,0 +1,104 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#ifndef __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
#define __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
#include "arch/gpu_isa.hh"
#include "base/statistics.hh"
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/GPUSRF.hh"
#include "gpu-compute/register_file.hh"
#include "gpu-compute/wavefront.hh"
struct ScalarRegisterFileParams;
// Scalar Register File
//
// Holds scalar (SGPR) state as a flat array of 32-bit registers and
// implements the operand scheduling interface inherited from
// RegisterFile. Presumably one instance exists per SIMD unit (indexed
// via ComputeUnit::srf elsewhere in this change) — confirm against
// ComputeUnit.
class ScalarRegisterFile : public RegisterFile
{
  public:
    using ScalarRegU32 = TheGpuISA::ScalarRegU32;

    ScalarRegisterFile(const ScalarRegisterFileParams *p);
    ~ScalarRegisterFile() { }

    // True when all of ii's scalar source operands for wave w are
    // available in the register file.
    virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
    // Reserve/record writes for ii's destination operands.
    virtual void scheduleWriteOperands(Wavefront *w,
                                       GPUDynInstPtr ii) override;
    // Schedule destination writes for data returning from a load.
    virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
                                               GPUDynInstPtr ii) override;
    // Bookkeeping performed when wave w executes instruction ii.
    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;

    void
    setParent(ComputeUnit *_computeUnit) override
    {
        RegisterFile::setParent(_computeUnit);
    }

    // Read a register that is writeable (e.g., a DST operand)
    ScalarRegU32&
    readWriteable(int regIdx)
    {
        return regFile[regIdx];
    }

    // Read a register that is not writeable (e.g., src operand)
    ScalarRegU32
    read(int regIdx) const
    {
        return regFile[regIdx];
    }

    // Write a register
    void
    write(int regIdx, ScalarRegU32 value)
    {
        regFile[regIdx] = value;
    }

    // Dump the contents of scalar register regIdx for wave wf under the
    // GPUSRF debug flag.
    void
    printReg(Wavefront *wf, int regIdx) const
    {
        DPRINTF(GPUSRF, "WF[%d][%d]: Id%d s[%d] = %#x\n", wf->simdId,
                wf->wfSlotId, wf->wfDynId, regIdx, regFile[regIdx]);
    }

  private:
    // Backing storage: one 32-bit entry per physical scalar register.
    std::vector<ScalarRegU32> regFile;
};
#endif // __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__

View File

@@ -33,24 +33,36 @@
#include "gpu-compute/schedule_stage.hh"
#include <unordered_set>
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
// Construct the schedule (SCH) stage for compute unit cu. The rendered
// diff left both the pre- and post-change constructors interleaved here
// (two signatures, two loop headers); this is the post-change version.
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
    : vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false)
{
    // One scheduler instance per execution resource on the CU.
    for (int j = 0; j < cu->numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    // One ordered queue of in-flight SCH waves per execution resource.
    schList.resize(cu->numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}
// Tear down the stage; drop all per-resource scheduler and schedule
// state. (The stale waveStatusList.clear() from the pre-change code is
// removed — that member no longer exists after this commit.)
ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}
void
@@ -59,56 +71,597 @@ ScheduleStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".ScheduleStage";
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
fatal_if(scheduler.size() != computeUnit->readyList.size(),
"Scheduler should have same number of entries as CU's readyList");
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
scheduler[j].bindList(&computeUnit->readyList[j]);
}
for (int j = 0; j < numSIMDs; ++j) {
waveStatusList.push_back(&computeUnit->waveStatusList[j]);
}
dispatchList = &computeUnit->dispatchList;
assert(computeUnit->numVectorGlobalMemUnits == 1);
assert(computeUnit->numVectorSharedMemUnits == 1);
}
void
ScheduleStage::arbitrate()
ScheduleStage::exec()
{
// iterate over all Memory pipelines
for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
if (dispatchList->at(j).first) {
Wavefront *waveToMemPipe = dispatchList->at(j).first;
// iterate over all execution pipelines
for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
if ((i != j) && (dispatchList->at(i).first)) {
Wavefront *waveToExePipe = dispatchList->at(i).first;
// if the two selected wavefronts are mapped to the same
// SIMD unit then they share the VRF
if (waveToMemPipe->simdId == waveToExePipe->simdId) {
int simdId = waveToMemPipe->simdId;
// Read VRF port arbitration:
// If there are read VRF port conflicts between the
// a memory and another instruction we drop the other
// instruction. We don't need to check for write VRF
// port conflicts because the memory instruction either
// does not need to write to the VRF (store) or will
// write to the VRF when the data comes back (load) in
// which case the arbiter of the memory pipes will
// resolve any conflicts
if (computeUnit->vrf[simdId]->
isReadConflict(waveToMemPipe->wfSlotId,
waveToExePipe->wfSlotId)) {
// FIXME: The "second" member variable is never
// used in the model. I am setting it to READY
// simply to follow the protocol of setting it
// when the WF has an instruction ready to issue
waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
.second = READY;
// Update readyList
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
// delete all ready wavefronts whose instruction buffers are now
// empty because the last instruction was executed
computeUnit->updateReadyList(j);
/**
* Remove any wave that already has an instruction present in SCH
* waiting for RF reads to complete. This prevents out of order
* execution within a wave.
*/
for (auto wIt = computeUnit->readyList.at(j).begin();
wIt != computeUnit->readyList.at(j).end();) {
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
*wIt = nullptr;
wIt = computeUnit->readyList.at(j).erase(wIt);
} else {
wIt++;
}
}
}
dispatchList->at(i).first = nullptr;
dispatchList->at(i).second = EMPTY;
break;
}
// Attempt to add another wave for each EXE type to schList queues
// VMEM resources are iterated first, effectively giving priority
// to VMEM over VALU for scheduling read of operands to the RFs.
// Scalar Memory are iterated after VMEM
// Iterate VMEM and SMEM
int firstMemUnit = computeUnit->firstMemUnit();
int lastMemUnit = computeUnit->lastMemUnit();
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
int readyListSize = computeUnit->readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListEmpty[j]++;
continue;
}
rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *w = scheduler[j].chooseWave();
if (!addToSchList(j, w)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
w->schCycles++;
addToSchListStalls[j]++;
}
}
// Iterate everything else
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
// skip the VMEM resources
if (j >= firstMemUnit && j <= lastMemUnit) {
continue;
}
int readyListSize = computeUnit->readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListEmpty[j]++;
continue;
}
rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *w = scheduler[j].chooseWave();
if (!addToSchList(j, w)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
w->schCycles++;
addToSchListStalls[j]++;
}
}
// At this point, the schList queue per EXE type may contain
// multiple waves, in order of age (oldest to youngest).
// Wave may be in RFBUSY, indicating they are waiting for registers
// to be read, or in RFREADY, indicating they are candidates for
// the dispatchList and execution
// Iterate schList queues and check if any of the waves have finished
// reading their operands, moving those waves to RFREADY status
checkRfOperandReadComplete();
// Fill the dispatch list with the oldest wave of each EXE type that
// is ready to execute
// Wave is picked if status in schList is RFREADY and it passes resource
// ready checks similar to those currently in SCB
fillDispatchList();
// Resource arbitration on waves in dispatchList
// Losing waves are re-inserted to the schList at a location determined
// by wave age
// Arbitrate access to the VRF->LDS bus
arbitrateVrfToLdsBus();
// Schedule write operations to the register files
scheduleRfDestOperands();
// Lastly, reserve resources for waves that are ready to execute.
reserveResources();
}
// Install wave w (possibly nullptr) and status s into the dispatchList
// entry of execution resource unitId.
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        Wavefront *w)
{
    auto &dispatchEntry = dispatchList->at(unitId);
    dispatchEntry.first = w;
    dispatchEntry.second = s;
}
// Attempt to reserve destination-operand write resources in the RFs for
// wave w's oldest instruction on execution resource exeType. On success
// the writes are scheduled and true is returned; on failure the denial
// is recorded in the RF-access stall statistics and false is returned.
bool
ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
{
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);

    // Scalar instructions never touch the VRF, so the VRF check is
    // vacuously true for them.
    bool vrfWrOk = ii->isScalar() ||
        computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
    bool srfWrOk =
        computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);

    if (vrfWrOk && srfWrOk) {
        if (!ii->isScalar()) {
            computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
        }
        computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
        return true;
    }

    // Denied: count the aggregate denial plus one bucket per RF that
    // refused the request.
    rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
    if (!srfWrOk) {
        rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
    }
    if (!vrfWrOk) {
        rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
    }

    // Increment stall counts for WF
    w->schStalls++;
    w->schRfAccessStalls++;
    return false;
}
// For every execution resource with a candidate wave on the
// dispatchList, try to reserve destination-operand write slots in the
// RFs. A wave that cannot schedule its writes is bounced back onto the
// schList (age-ordered) and its dispatchList slot is cleared.
void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
        if (!dispatchList->at(j).first) {
            continue;
        }
        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        Wavefront *w = dispatchList->at(j).first;
        if (!schedRfWrites(j, w)) {
            reinsertToSchList(j, w);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (w->instructionBuffer.front()->isFlat()) {
                assert(dispatchList->at(w->localMem).second == SKIP);
                doDispatchListTransition(w->localMem, EMPTY);
            }
        }
    }
}
// Attempt to admit wave w into the schList queue for execution resource
// exeType. Admission requires that the VRF (for vector instructions)
// and the SRF can both support reads of the instruction's source
// operands; on success the reads are scheduled immediately and the wave
// enters the queue in RFBUSY state. Returns false (and records stall
// statistics) if either RF refuses.
bool
ScheduleStage::addToSchList(int exeType, Wavefront *w)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrf = true;
    // Scalar instructions do not read the VRF, so only consult it for
    // vector instructions.
    if (!ii->isScalar()) {
        accessVrf =
            computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
    }
    bool accessSrf =
        computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());

        computeUnit->insertInPipeMap(w);
        wavesInSch.emplace(w->wfDynId);
        schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
        // A wave whose oldest instruction is a waitcnt sleeps until the
        // counts are satisfied.
        if (w->isOldestInstWaitcnt()) {
            w->setStatus(Wavefront::S_WAITCNT);
        }
        if (!ii->isScalar()) {
            computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
        }
        computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
    }
    return false;
}
// Re-insert wave w into the schList queue for exeType in RFREADY state,
// preserving age order (oldest wave, i.e. lowest wfDynId, at the front):
// w goes immediately before the first wave younger than it.
void
ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
{
    auto &queue = schList.at(exeType);
    auto pos = queue.begin();
    while (pos != queue.end() && pos->first->wfDynId < w->wfDynId) {
        ++pos;
    }
    queue.insert(pos, std::make_pair(w, RFREADY));
}
// Snapshot, for the next cycle, the readiness of the memory execution
// resources and the RF-to-memory buses. Results are cached in the
// stage's boolean flags and consumed by dispatchReady().
void
ScheduleStage::checkMemResources()
{
    // SRF->Global-Memory bus and the scalar memory pipe
    scalarMemBusRdy = computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1));
    scalarMemIssueRdy = computeUnit->scalarMemUnit.rdy(Cycles(1));

    // VRF->Global-Memory bus and the vector global memory pipe
    glbMemBusRdy = computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1));
    glbMemIssueRdy = computeUnit->vectorGlobalMemUnit.rdy(Cycles(1));

    // VRF->LDS bus and the vector shared (local) memory pipe
    locMemBusRdy = computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1));
    locMemIssueRdy = computeUnit->vectorSharedMemUnit.rdy(Cycles(1));
}
// Determine whether the execution resources required by wave w's oldest
// instruction will be available next cycle. Every unavailable resource
// increments its matching dispNrdyStalls bucket; true is returned only
// when all required resources are ready (counted under SCH_RDY).
bool
ScheduleStage::dispatchReady(Wavefront *w)
{
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }
    GPUDynInstPtr ii = w->instructionBuffer.front();

    if (ii->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isEndOfKernel()) {
        // EndPgm instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!ii->isScalar() && ii->isGlobalMem()) {
        // Vector Global Memory instruction
        // Requires issue slot, VRF->GM bus, coalescer entry, and room
        // in the outstanding-requests tracking; all misses are counted.
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (ii->isScalar() && ii->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->scalarMemoryPipe.
                isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
                                 w->scalarWrGmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isFlat()) {
        // Vector Flat memory instruction
        // FLAT needs BOTH the global and local memory resources to be
        // available, since the address may resolve to either space.
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit->localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        // panic() does not return; the return statement below it only
        // silences compiler warnings.
        panic("%s: unknown instr checked for readiness", ii->disassemble());
        return false;
    }
    dispNrdyStalls[SCH_RDY]++;
    return true;
}
// For each execution resource, promote the oldest RFREADY wave on its
// schList that also passes dispatchReady() to the dispatchList in
// EXREADY state (at most one per resource per cycle). Global memory
// operations acquire a coalescer token on promotion. Waves left behind
// accrue stall statistics.
void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit->numExeUnits(); j++) {
        assert(dispatchList->at(j).second == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first->
                        instructionBuffer.front();
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            schListToDispListStalls[j]++;
        } else {
            schListToDispList[j]++;
        }
    }
}
// Resolve FLAT-instruction contention for the VRF->LDS bus: when a GM
// pipe holds an EXREADY FLAT instruction, any wave occupying its
// associated LM pipe is evicted back to the schList, and the LM pipe is
// marked SKIP so EX knows it is consumed by the FLAT next cycle.
void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
    // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
    // and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit->firstMemUnit() + i;
        // get the wave in the dispatchList
        Wavefront *w = dispatchList->at(gm_exe_unit).first;
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
            w->instructionBuffer.front()->isFlat()) {
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (dispatchList->at(w->localMem).second == EXREADY) {
                reinsertToSchList(w->localMem,
                                  dispatchList->at(w->localMem).first);
                // Increment stall stats for LDS-VRF arbitration
                ldsBusArbStalls++;
                dispatchList->at(w->localMem).first->schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
            // that a Flat instruction is executing next cycle
            doDispatchListTransition(w->localMem, SKIP, w);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", w->localMem);
        }
    }
}
void
ScheduleStage::checkRfOperandReadComplete()
{
// Iterate the schList queues and check if operand reads
// have completed in the RFs. If so, mark the wave as ready for
// selection for dispatchList
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
for (auto &p : schList.at(j)) {
Wavefront *w = p.first;
assert(w);
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
w->schCycles++;
GPUDynInstPtr ii = w->instructionBuffer.front();
bool vrfRdy = true;
if (!ii->isScalar()) {
vrfRdy =
computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
}
bool srfRdy =
computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
bool operandsReady = vrfRdy && srfRdy;
if (operandsReady) {
DPRINTF(GPUSched,
"schList[%d]: WV[%d] operands ready for: %d: %s\n",
j, w->wfDynId, ii->seqNum(), ii->disassemble());
DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
j, w->wfDynId);
p.second = RFREADY;
} else {
DPRINTF(GPUSched,
"schList[%d]: WV[%d] operands not ready for: %d: %s\n",
j, w->wfDynId, ii->seqNum(), ii->disassemble());
// operands not ready yet, increment SCH stage stats
// aggregate to all wavefronts on the CU
p.second = RFBUSY;
// Increment stall stats
w->schStalls++;
w->schOpdNrdyStalls++;
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
}
if (!srfRdy) {
opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
}
}
}
@@ -116,33 +669,177 @@ ScheduleStage::arbitrate()
}
void
ScheduleStage::exec()
ScheduleStage::reserveResources()
{
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
uint32_t readyListSize = computeUnit->readyList[j].size();
std::vector<bool> exeUnitReservations;
exeUnitReservations.resize(computeUnit->numExeUnits(), false);
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
continue;
}
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
Wavefront *dispatchedWave = dispatchList->at(j).first;
if (dispatchedWave) {
DISPATCH_STATUS s = dispatchList->at(j).second;
if (s == EMPTY) {
continue;
} else if (s == EXREADY) {
// Wave is ready for execution
std::vector<int> execUnitIds =
dispatchedWave->reserveResources();
GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
dispatchList->at(j).first = waveToBeDispatched;
waveToBeDispatched->updateResources();
dispatchList->at(j).second = FILLED;
if (!ii->isScalar()) {
computeUnit->vrf[dispatchedWave->simdId]->
dispatchInstruction(ii);
}
computeUnit->srf[dispatchedWave->simdId]->
dispatchInstruction(ii);
waveStatusList[waveToBeDispatched->simdId]->at(
waveToBeDispatched->wfSlotId).second = BLOCKED;
std::stringstream ss;
for (auto id : execUnitIds) {
ss << id << " ";
}
DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
" Reserving ExeRes[ %s]\n",
j, dispatchedWave->simdId, dispatchedWave->wfDynId,
ii->seqNum(), ii->disassemble(), ss.str());
// mark the resources as reserved for this cycle
for (auto execUnitId : execUnitIds) {
panic_if(exeUnitReservations.at(execUnitId),
"Execution unit %d is reserved!!!\n"
"SIMD[%d] WV[%d]: %d: %s",
execUnitId, dispatchedWave->simdId,
dispatchedWave->wfDynId,
ii->seqNum(), ii->disassemble());
exeUnitReservations.at(execUnitId) = true;
}
assert(computeUnit->readyList[j].size() == readyListSize - 1);
// If wavefront::reserveResources reserved multiple resources,
// then we're executing a flat memory instruction. This means
// that we've reserved a global and local memory unit. Thus,
// we need to mark the latter execution unit as not available.
if (execUnitIds.size() > 1) {
int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
assert(dispatchList->at(lm_exec_unit).second == SKIP);
}
} else if (s == SKIP) {
// Shared Memory pipe reserved for FLAT instruction.
// Verify the GM pipe for this wave is ready to execute
// and the wave in the GM pipe is the same as the wave
// in the LM pipe
int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
dispatchedWave->wfDynId);
assert(dispatchList->at(gm_exec_unit).second == EXREADY);
}
}
}
// arbitrate over all shared resources among instructions being issued
// simultaneously
arbitrate();
}
// Called by ExecStage once wave w's instruction has executed, so the
// wave may re-enter SCH with its next instruction (see the wavesInSch
// filtering in exec()).
void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}
// Register all SCH-stage statistics (per-resource vectors, stall-reason
// buckets with human-readable subnames, and the LDS arbitration
// counter) with the stats framework.
void
ScheduleStage::regStats()
{
    rdyListNotEmpty
        .init(computeUnit->numExeUnits())
        .name(name() + ".rdy_list_not_empty")
        .desc("number of cycles one or more wave on ready list per "
              "execution resource")
        ;

    rdyListEmpty
        .init(computeUnit->numExeUnits())
        .name(name() + ".rdy_list_empty")
        .desc("number of cycles no wave on ready list per "
              "execution resource")
        ;

    addToSchListStalls
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_add_stalls")
        .desc("number of cycles a wave is not added to schList per "
              "execution resource when ready list is not empty")
        ;

    schListToDispList
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_to_disp_list")
        .desc("number of cycles a wave is added to dispatchList per "
              "execution resource")
        ;

    schListToDispListStalls
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_to_disp_list_stalls")
        .desc("number of cycles no wave is added to dispatchList per "
              "execution resource")
        ;

    // Operand Readiness Stall Cycles
    opdNrdyStalls
        .init(SCH_RF_OPD_NRDY_CONDITIONS)
        .name(name() + ".opd_nrdy_stalls")
        .desc("number of stalls in SCH due to operands not ready")
        ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    // dispatchReady Stall Cycles
    dispNrdyStalls
        .init(SCH_NRDY_CONDITIONS)
        .name(name() + ".disp_nrdy_stalls")
        .desc("number of stalls in SCH due to resource not ready")
        ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    // RF Access Stall Cycles
    rfAccessStalls
        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
        .name(name() + ".rf_access_stalls")
        .desc("number of stalls due to RF access denied")
        ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));

    // Stall cycles due to wave losing LDS bus arbitration
    ldsBusArbStalls
        .name(name() + ".lds_bus_arb_stalls")
        .desc("number of stalls due to VRF->LDS bus conflicts")
        ;
}

View File

@@ -34,6 +34,9 @@
#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__
#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -54,40 +57,169 @@ struct ComputeUnitParams;
// Schedule (SCH) stage of the GPU compute-unit pipeline: selects waves
// from the per-resource readyLists, schedules their register-file
// operand reads/writes, arbitrates shared buses, and fills the
// dispatchList consumed by the EX stage.
//
// The rendered diff interleaved the pre-change class members with the
// new ones; this is the post-change declaration with the removed
// members (old constructor, arbitrate(), numSIMDs, numMemUnits,
// waveStatusList) stripped out.
class ScheduleStage
{
  public:
    ScheduleStage(const ComputeUnitParams *params, ComputeUnit *cu);
    ~ScheduleStage();
    void init(ComputeUnit *cu);
    void exec();

    // Stats related variables and methods
    std::string name() { return _name; }

    // Resource-not-ready reasons counted by dispatchReady()
    enum SchNonRdyType {
        SCH_SCALAR_ALU_NRDY,
        SCH_VECTOR_ALU_NRDY,
        SCH_VECTOR_MEM_ISSUE_NRDY,
        SCH_VECTOR_MEM_BUS_BUSY_NRDY,
        SCH_VECTOR_MEM_COALESCER_NRDY,
        SCH_VECTOR_MEM_REQS_NRDY,
        SCH_CEDE_SIMD_NRDY,
        SCH_SCALAR_MEM_ISSUE_NRDY,
        SCH_SCALAR_MEM_BUS_BUSY_NRDY,
        SCH_SCALAR_MEM_FIFO_NRDY,
        SCH_LOCAL_MEM_ISSUE_NRDY,
        SCH_LOCAL_MEM_BUS_BUSY_NRDY,
        SCH_LOCAL_MEM_FIFO_NRDY,
        SCH_FLAT_MEM_ISSUE_NRDY,
        SCH_FLAT_MEM_BUS_BUSY_NRDY,
        SCH_FLAT_MEM_COALESCER_NRDY,
        SCH_FLAT_MEM_REQS_NRDY,
        SCH_FLAT_MEM_FIFO_NRDY,
        SCH_RDY,
        SCH_NRDY_CONDITIONS
    };
    // Operand-read-not-complete reasons counted by
    // checkRfOperandReadComplete()
    enum schopdnonrdytype_e {
        SCH_VRF_OPD_NRDY,
        SCH_SRF_OPD_NRDY,
        SCH_RF_OPD_NRDY,
        SCH_RF_OPD_NRDY_CONDITIONS
    };
    // RF access-denied reasons counted by addToSchList()/schedRfWrites()
    enum schrfaccessnonrdytype_e {
        SCH_VRF_RD_ACCESS_NRDY,
        SCH_VRF_WR_ACCESS_NRDY,
        SCH_SRF_RD_ACCESS_NRDY,
        SCH_SRF_WR_ACCESS_NRDY,
        SCH_RF_ACCESS_NRDY,
        SCH_RF_ACCESS_NRDY_CONDITIONS
    };

    void regStats();

    // Called by ExecStage to inform SCH of instruction execution
    void deleteFromSch(Wavefront *w);

    // Schedule List status
    enum SCH_STATUS
    {
        RFBUSY = 0, // RF busy reading operands
        RFREADY, // ready for exec
    };

  private:
    ComputeUnit *computeUnit;
    // Each execution resource will have its own
    // scheduler and a dispatch list
    std::vector<Scheduler> scheduler;
    // List of waves which will be dispatched to
    // each execution resource.
    // Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;

    // Stats

    // Number of cycles with empty (or not empty) readyList, per execution
    // resource, when the CU is active (not sleeping)
    Stats::Vector rdyListEmpty;
    Stats::Vector rdyListNotEmpty;
    // Number of cycles, per execution resource, when at least one wave
    // was on the readyList and picked by scheduler, but was unable to be
    // added to the schList, when the CU is active (not sleeping)
    Stats::Vector addToSchListStalls;
    // Number of cycles, per execution resource, when a wave is selected
    // as candidate for dispatchList from schList
    // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
    Stats::Vector schListToDispList;
    // Per execution resource stat, incremented once per cycle if no wave
    // was selected as candidate for dispatch and moved to dispatchList
    Stats::Vector schListToDispListStalls;
    // Number of times a wave is selected by the scheduler but cannot
    // be added to the schList due to register files not being able to
    // support reads or writes of operands. RF_ACCESS_NRDY condition is always
    // incremented if at least one read/write not supported, other
    // conditions are incremented independently from each other.
    Stats::Vector rfAccessStalls;
    // Number of times a wave is executing FLAT instruction and
    // forces another wave occupying its required local memory resource
    // to be deselected for execution, and placed back on schList
    Stats::Scalar ldsBusArbStalls;
    // Count of times VRF and/or SRF blocks waves on schList from
    // performing RFBUSY->RFREADY transition
    Stats::Vector opdNrdyStalls;
    // Count of times resource required for dispatch is not ready and
    // blocks wave in RFREADY state on schList from potentially moving
    // to dispatchList
    Stats::Vector dispNrdyStalls;

    std::string _name;

    // called by exec() to add a wave to schList if the RFs can support it
    bool addToSchList(int exeType, Wavefront *w);
    // re-insert a wave to schList if wave lost arbitration
    // wave is inserted such that age order (oldest to youngest) is preserved
    void reinsertToSchList(int exeType, Wavefront *w);
    // check waves in schList to see if RF reads complete
    void checkRfOperandReadComplete();

    // Cached execution-resource readiness, refreshed each cycle by
    // dispatchReady()/checkMemResources()
    bool vectorAluRdy;
    bool scalarAluRdy;
    bool scalarMemBusRdy;
    bool scalarMemIssueRdy;
    bool glbMemBusRdy;
    bool glbMemIssueRdy;
    bool locMemBusRdy;
    bool locMemIssueRdy;

    // check status of memory pipes and RF to Mem buses
    void checkMemResources();
    // resource ready check called by fillDispatchList
    bool dispatchReady(Wavefront *w);
    // pick waves from schList and populate dispatchList with one wave
    // per EXE resource type
    void fillDispatchList();
    // arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList
    void arbitrateVrfToLdsBus();
    // schedule destination operand writes to register files for waves in
    // dispatchList
    void scheduleRfDestOperands();
    // invoked by scheduleRfDestOperands to schedule RF writes for a wave
    bool schedRfWrites(int exeType, Wavefront *w);
    // reserve resources for waves surviving arbitration in dispatchList
    void reserveResources();

    void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                  Wavefront *w = nullptr);

    // Set tracking wfDynId for each wave present in schedule stage
    // Used to allow only one instruction per wave in schedule
    std::unordered_set<uint64_t> wavesInSch;

    // List of waves (one list per exe resource) that are in schedule
    // stage. Waves are added to this list after selected by scheduler
    // from readyList. Waves are removed from this list and placed on
    // dispatchList when status reaches SCHREADY.
    // Waves are kept ordered by age for each resource, always favoring
    // forward progress for the oldest wave.
    // The maximum number of waves per resource can be determined by either
    // the VRF/SRF availability or limits imposed by paremeters (to be added)
    // of the SCH stage or CU.
    std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
};
#endif // __SCHEDULE_STAGE_HH__

View File

@@ -33,29 +33,23 @@
#include "gpu-compute/scoreboard_check_stage.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ComputeUnit.hh"
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
: numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
numShrMemPipes(p->num_shared_mem_pipes),
vectorAluInstAvail(nullptr),
lastGlbMemSimd(-1),
lastShrMemSimd(-1), glbMemInstAvail(nullptr),
shrMemInstAvail(nullptr)
{
}
ScoreboardCheckStage::~ScoreboardCheckStage()
{
readyList.clear();
waveStatusList.clear();
shrMemInstAvail = nullptr;
glbMemInstAvail = nullptr;
}
void
@@ -64,102 +58,212 @@ ScoreboardCheckStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".ScoreboardCheckStage";
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
readyList.push_back(&computeUnit->readyList[unitId]);
}
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
}
vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
glbMemInstAvail= &computeUnit->glbMemInstAvail;
shrMemInstAvail= &computeUnit->shrMemInstAvail;
}
void
ScoreboardCheckStage::initStatistics()
ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
{
lastGlbMemSimd = -1;
lastShrMemSimd = -1;
*glbMemInstAvail = 0;
*shrMemInstAvail = 0;
for (int unitId = 0; unitId < numSIMDs; ++unitId)
vectorAluInstAvail->at(unitId) = false;
panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
"Instruction ready status %d is illegal!!!", rdyStatus);
stallCycles[rdyStatus]++;
}
void
ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
// It also returns the reason (in rdyStatus) if the instruction is not
// ready. Finally it sets the execution resource type (in exesResType)
// of the instruction, only if it ready.
bool
ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
int *exeResType, int wfSlot)
{
if (curWave->instructionBuffer.empty())
return;
/**
* The waitCnt checks have to be done BEFORE checking for Instruction
* buffer empty condition. Otherwise, it will result into a deadlock if
* the last instruction in the Instruction buffer is a waitCnt: after
* executing the waitCnt, the Instruction buffer would be empty and the
* ready check logic will exit BEFORE checking for wait counters being
* satisfied.
*/
// track which vector SIMD unit has at least one WV with a vector
// ALU as the oldest instruction in its Instruction buffer
vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
curWave->isOldestInstALU();
// track how many vector SIMD units have at least one WV with a
// vector Global memory instruction as the oldest instruction
// in its Instruction buffer
if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
*glbMemInstAvail <= 1) {
(*glbMemInstAvail)++;
lastGlbMemSimd = unitId;
// waitCnt instruction has been dispatched or executed: next
// instruction should be blocked until waitCnts are satisfied.
if (w->getStatus() == Wavefront::S_WAITCNT) {
if (!w->waitCntsSatisfied()) {
*rdyStatus = NRDY_WAIT_CNT;
return false;
}
}
// track how many vector SIMD units have at least one WV with a
// vector shared memory (LDS) instruction as the oldest instruction
// in its Instruction buffer
// TODO: parametrize the limit of the LDS units
if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
lastShrMemSimd != unitId) {
(*shrMemInstAvail)++;
lastShrMemSimd = unitId;
// Is the wave waiting at a barrier. Check this condition BEFORE checking
// for instruction buffer occupancy to avoid a deadlock when the barrier is
// the last instruction in the instruction buffer.
if (w->stalledAtBarrier) {
if (!computeUnit->AllAtBarrier(w->barrierId,w->barrierCnt,
computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
// Are all threads at barrier?
*rdyStatus = NRDY_BARRIER_WAIT;
return false;
}
w->oldBarrierCnt = w->barrierCnt;
w->stalledAtBarrier = false;
}
// Check WF status: it has to be running
if (w->getStatus() == Wavefront::S_STOPPED ||
w->getStatus() == Wavefront::S_RETURNING ||
w->getStatus() == Wavefront::S_STALLED) {
*rdyStatus = NRDY_WF_STOP;
return false;
}
// is the Instruction buffer empty
if ( w->instructionBuffer.empty()) {
*rdyStatus = NRDY_IB_EMPTY;
return false;
}
// Check next instruction from instruction buffer
GPUDynInstPtr ii = w->nextInstr();
// Only instruction in the instruction buffer has been dispatched.
// No need to check it again for readiness
if (!ii) {
*rdyStatus = NRDY_IB_EMPTY;
return false;
}
// The following code is very error prone and the entire process for
// checking readiness will be fixed eventually. In the meantime, let's
// make sure that we do not silently let an instruction type slip
// through this logic and always return not ready.
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());
// Non-scalar (i.e., vector) instructions may use VGPRs
if (!ii->isScalar()) {
if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
*rdyStatus = NRDY_VGPR_NRDY;
return false;
}
}
// Scalar and non-scalar instructions may use SGPR
if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
*rdyStatus = NRDY_SGPR_NRDY;
return false;
}
// The hardware implicitly executes S_WAITCNT 0 before executing
// the S_ENDPGM instruction. Implementing this implicit S_WAITCNT.
// isEndOfKernel() is used to identify the S_ENDPGM instruction
// On identifying it, we do the following:
// 1. Wait for all older instruction to execute
// 2. Once all the older instruction are executed, we add a wait
// count for the executed instruction(s) to complete.
if (ii->isEndOfKernel()) {
// Waiting for older instruction to execute
if (w->instructionBuffer.front()->seqNum() != ii->seqNum()) {
*rdyStatus = NRDY_WAIT_CNT;
return false;
}
// Older instructions have executed, adding implicit wait count
w->setStatus(Wavefront::S_WAITCNT);
w->setWaitCnts(0, 0, 0);
if (!w->waitCntsSatisfied()) {
*rdyStatus = NRDY_WAIT_CNT;
return false;
}
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
w->simdId, w->wfSlotId, ii->disassemble());
*exeResType = mapWaveToExeUnit(w);
*rdyStatus = INST_RDY;
return true;
}
int
ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
{
GPUDynInstPtr ii = w->nextInstr();
assert(ii);
if (ii->isFlat()) {
/**
* NOTE: Flat memory ops requires both GM and LM resources.
* The simulator models consumption of both GM and LM
* resources in the schedule stage. At instruction execution time,
* after the aperture check is performed, only the GM or LM pipe
* is actually reserved by the timing model. The GM unit is returned
* here since Flat ops occupy the GM slot in the ready and dispatch
* lists. They also consume the LM slot in the dispatch list.
*/
return w->globalMem;
} else if (ii->isLocalMem()) {
return w->localMem;
} else if (ii->isGlobalMem()) {
if (!ii->isScalar()) {
return w->globalMem;
} else {
return w->scalarMem;
}
} else if (ii->isBranch() ||
ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() ||
ii->isReturn() ||
ii->isEndOfKernel() ||
ii->isNop() ||
ii->isBarrier()) {
if (!ii->isScalar()) {
return w->simdId;
} else {
return w->scalarAluGlobalIdx;
}
}
panic("%s: unmapped to an execution resource", ii->disassemble());
return computeUnit->numExeUnits();
}
void
ScoreboardCheckStage::exec()
{
initStatistics();
// reset the ready list for all execution units; it will be
// constructed every cycle since resource availability may change
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
// Reset wavefront pointers to nullptr so clear() on the vector
// does not accidentally destruct the wavefront object
for (int i = 0; i < readyList[unitId]->size(); i++) {
readyList[unitId]->at(i) = nullptr;
}
readyList[unitId]->clear();
}
// iterate over the Wavefronts of all SIMD units
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
// iterate over all WF slots across all vector ALUs
for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
// reset the ready status of each wavefront
waveStatusList[unitId]->at(wvId).second = BLOCKED;
Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
collectStatistics(curWave, unitId);
if (curWave->ready(Wavefront::I_ALU)) {
readyList[unitId]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_GLOBAL)) {
if (computeUnit->cedeSIMD(unitId, wvId)) {
continue;
}
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_SHARED)) {
readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_FLAT)) {
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_PRIVATE)) {
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
nonrdytype_e rdyStatus = NRDY_ILLEGAL;
int exeResType = -1;
// check WF readiness: If the WF's oldest
// instruction is ready to issue then add the WF to the ready list
if (ready(curWave, &rdyStatus, &exeResType, wfSlot)) {
assert(curWave->simdId == simdId);
DPRINTF(GPUSched,
"Adding to readyList[%d]: SIMD[%d] WV[%d]: %d: %s\n",
exeResType,
curWave->simdId, curWave->wfDynId,
curWave->nextInstr()->seqNum(),
curWave->nextInstr()->disassemble());
readyList.at(exeResType)->push_back(curWave);
}
collectStatistics(rdyStatus);
}
}
}
@@ -167,4 +271,16 @@ ScoreboardCheckStage::exec()
void
ScoreboardCheckStage::regStats()
{
stallCycles
.init(NRDY_CONDITIONS)
.name(name() + ".stall_cycles")
.desc("number of cycles wave stalled in SCB")
;
stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
stallCycles.subname(NRDY_BARRIER_WAIT, csprintf("BarrierWait"));
stallCycles.subname(NRDY_VGPR_NRDY, csprintf("VgprBusy"));
stallCycles.subname(NRDY_SGPR_NRDY, csprintf("SgprBusy"));
stallCycles.subname(INST_RDY, csprintf("InstrReady"));
}

View File

@@ -36,20 +36,17 @@
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "sim/stats.hh"
class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;
enum WAVE_STATUS
{
BLOCKED = 0,
READY
};
/*
* Scoreboard check stage.
* All wavefronts are analyzed to see if they are ready
@@ -61,6 +58,18 @@ enum WAVE_STATUS
class ScoreboardCheckStage
{
public:
enum nonrdytype_e {
NRDY_ILLEGAL,
NRDY_WF_STOP,
NRDY_IB_EMPTY,
NRDY_WAIT_CNT,
NRDY_BARRIER_WAIT,
NRDY_VGPR_NRDY,
NRDY_SGPR_NRDY,
INST_RDY,
NRDY_CONDITIONS
};
ScoreboardCheckStage(const ComputeUnitParams* params);
~ScoreboardCheckStage();
void init(ComputeUnit *cu);
@@ -71,31 +80,18 @@ class ScoreboardCheckStage
void regStats();
private:
void collectStatistics(Wavefront *curWave, int unitId);
void initStatistics();
void collectStatistics(nonrdytype_e rdyStatus);
int mapWaveToExeUnit(Wavefront *w);
bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
int *exeResType, int wfSlot);
ComputeUnit *computeUnit;
uint32_t numSIMDs;
uint32_t numMemUnits;
uint32_t numShrMemPipes;
// flag per vector SIMD unit that is set when there is at least one
// WF that has a vector ALU instruction as the oldest in its
// Instruction Buffer
std::vector<bool> *vectorAluInstAvail;
int lastGlbMemSimd;
int lastShrMemSimd;
int *glbMemInstAvail;
int *shrMemInstAvail;
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list
std::vector<std::vector<Wavefront*>*> readyList;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
waveStatusList;
// Stats
Stats::Vector stallCycles;
std::string _name;
};

View File

@@ -39,37 +39,63 @@
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"
Shader::Shader(const Params *p)
: ClockedObject(p), clock(p->clk_domain->clockPeriod()),
cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
tickEvent([this]{ processTick(); }, "Shader tick",
false, Event::CPU_Tick_Pri),
timingSim(p->timing), hsail_mode(SIMT),
impl_kern_boundary_sync(p->impl_kern_boundary_sync),
separate_acquire_release(p->separate_acquire_release), coissue_return(1),
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
box_tick_cnt(0), start_tick_cnt(0)
Shader::Shader(const Params *p) : ClockedObject(p),
_activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
gpuTc(nullptr), cpuPointer(p->cpu_pointer),
tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
false, Event::CPU_Tick_Pri),
timingSim(p->timing), hsail_mode(SIMT),
impl_kern_boundary_sync(p->impl_kern_boundary_sync),
coissue_return(1),
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
globalMemSize(p->globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc),
_dispatcher(*p->dispatcher),
max_valu_insts(p->max_valu_insts), total_valu_insts(0)
{
gpuCmdProc.setShader(this);
_dispatcher.setShader(this);
_gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
_gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
_ldsApe.base = ((Addr)1 << 61) + 0x0;
_ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
_scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
_scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
shHiddenPrivateBaseVmid = 0;
cuList.resize(n_cu);
panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
for (int i = 0; i < n_cu; ++i) {
cuList[i] = p->CUs[i];
assert(i == cuList[i]->cu_id);
cuList[i]->shader = this;
cuList[i]->idleCUTimeout = p->idlecu_timeout;
}
}
GPUDispatcher&
Shader::dispatcher()
{
return _dispatcher;
}
Addr
Shader::mmap(int length)
{
@@ -83,11 +109,11 @@ Shader::mmap(int length)
auto mem_state = proc->memState;
if (proc->mmapGrowsDown()) {
DPRINTF(HSAIL, "GROWS DOWN");
DPRINTF(GPUShader, "GROWS DOWN");
start = mem_state->getMmapEnd() - length;
mem_state->setMmapEnd(start);
} else {
DPRINTF(HSAIL, "GROWS UP");
DPRINTF(GPUShader, "GROWS UP");
start = mem_state->getMmapEnd();
mem_state->setMmapEnd(start + length);
@@ -96,7 +122,7 @@ Shader::mmap(int length)
mem_state->getMmapEnd());
}
DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
proc->allocateMem(start, length);
@@ -146,15 +172,15 @@ ShaderParams::create()
}
void
Shader::exec()
Shader::execScheduledAdds()
{
tick_cnt = curTick();
box_tick_cnt = curTick() - start_tick_cnt;
assert(!sa_when.empty());
// apply any scheduled adds
for (int i = 0; i < sa_n; ++i) {
if (sa_when[i] <= tick_cnt) {
if (sa_when[i] <= curTick()) {
*sa_val[i] += sa_x[i];
panic_if(*sa_val[i] < 0, "Negative counter value\n");
sa_val.erase(sa_val.begin() + i);
sa_x.erase(sa_x.begin() + i);
sa_when.erase(sa_when.begin() + i);
@@ -162,14 +188,62 @@ Shader::exec()
--i;
}
}
if (!sa_when.empty()) {
Tick shader_wakeup = *std::max_element(sa_when.begin(),
sa_when.end());
DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
schedule(tickEvent, shader_wakeup);
} else {
DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
}
}
// clock all of the cu's
for (int i = 0; i < n_cu; ++i)
cuList[i]->exec();
/*
* dispatcher/shader arranges invalidate requests to the CUs
*/
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
// if invalidate has already started/finished, then do nothing
if (task->isInvStarted()) return;
// invalidate has never started; it can only perform once at kernel launch
assert(task->outstandingInvs() == -1);
int kernId = task->dispatchId();
// counter value is 0 now, indicating the inv is about to start
_dispatcher.updateInvCounter(kernId, +1);
// iterate all cus managed by the shader, to perform invalidate.
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
// create a request to hold INV info; the request's fields will
// be updated in cu before use
auto req = std::make_shared<Request>(0, 0, 0,
cuList[i_cu]->masterId(),
0, -1);
_dispatcher.updateInvCounter(kernId, +1);
// all necessary INV flags are all set now, call cu to execute
cuList[i_cu]->doInvalidate(req, task->dispatchId());
}
}
/**
* dispatcher/shader arranges flush requests to the CUs
*/
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
int kernId = gpuDynInst->kern_id;
// flush has never been started, performed only once at kernel end
assert(_dispatcher.getOutstandingWbs(kernId) == 0);
// iterate all cus, managed by the shader, to perform flush.
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
_dispatcher.updateWbCounter(kernId, +1);
cuList[i_cu]->doFlush(gpuDynInst);
}
}
bool
Shader::dispatch_workgroups(NDRange *ndr)
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
bool scheduledSomething = false;
int cuCount = 0;
@@ -182,32 +256,24 @@ Shader::dispatch_workgroups(NDRange *ndr)
// dispatch workgroup iff the following two conditions are met:
// (a) wg_rem is true - there are unassigned workgroups in the grid
// (b) there are enough free slots in cu cuList[i] for this wg
if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) {
scheduledSomething = true;
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
curCu, task->globalWgId());
DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
curTick(), task->globalWgId(), curCu);
// ticks() member function translates cycles to simulation ticks.
if (!tickEvent.scheduled()) {
schedule(tickEvent, curTick() + this->ticks(1));
if (!cuList[curCu]->tickEvent.scheduled()) {
if (!_activeCus)
_lastInactiveTick = curTick();
_activeCus++;
}
cuList[curCu]->StartWorkgroup(ndr);
ndr->wgId[0]++;
ndr->globalWgId++;
if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
ndr->wgId[0] = 0;
ndr->wgId[1]++;
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
"Invalid activeCu size\n");
cuList[curCu]->dispWorkgroup(task);
if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
ndr->wgId[1] = 0;
ndr->wgId[2]++;
if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
ndr->wg_disp_rem = false;
break;
}
}
}
task->markWgDispatch();
}
++cuCount;
@@ -218,9 +284,83 @@ Shader::dispatch_workgroups(NDRange *ndr)
}
void
Shader::handshake(GpuDispatcher *_dispatcher)
Shader::regStats()
{
dispatcher = _dispatcher;
ClockedObject::regStats();
shaderActiveTicks
.name(name() + ".shader_active_ticks")
.desc("Total ticks that any CU attached to this shader is active")
;
allLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".allLatencyDist")
.desc("delay distribution for all")
.flags(Stats::pdf | Stats::oneline);
loadLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".loadLatencyDist")
.desc("delay distribution for loads")
.flags(Stats::pdf | Stats::oneline);
storeLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".storeLatencyDist")
.desc("delay distribution for stores")
.flags(Stats::pdf | Stats::oneline);
vectorInstSrcOperand
.init(4)
.name(name() + ".vec_inst_src_operand")
.desc("vector instruction source operand distribution");
vectorInstDstOperand
.init(4)
.name(name() + ".vec_inst_dst_operand")
.desc("vector instruction destination operand distribution");
initToCoalesceLatency
.init(0, 1600000, 10000)
.name(name() + ".initToCoalesceLatency")
.desc("Ticks from vmem inst initiateAcc to coalescer issue")
.flags(Stats::pdf | Stats::oneline);
rubyNetworkLatency
.init(0, 1600000, 10000)
.name(name() + ".rubyNetworkLatency")
.desc("Ticks from coalescer issue to coalescer hit callback")
.flags(Stats::pdf | Stats::oneline);
gmEnqueueLatency
.init(0, 1600000, 10000)
.name(name() + ".gmEnqueueLatency")
.desc("Ticks from coalescer hit callback to GM pipe enqueue")
.flags(Stats::pdf | Stats::oneline);
gmToCompleteLatency
.init(0, 1600000, 10000)
.name(name() + ".gmToCompleteLatency")
.desc("Ticks queued in GM pipes ordered response buffer")
.flags(Stats::pdf | Stats::oneline);
coalsrLineAddresses
.init(0, 20, 1)
.name(name() + ".coalsrLineAddresses")
.desc("Number of cache lines for coalesced request")
.flags(Stats::pdf | Stats::oneline);
int wfSize = cuList[0]->wfSize();
cacheBlockRoundTrip = new Stats::Distribution[wfSize];
for (int idx = 0; idx < wfSize; ++idx) {
std::stringstream namestr;
ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
cacheBlockRoundTrip[idx]
.init(0, 1600000, 10000)
.name(namestr.str())
.desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
.flags(Stats::pdf | Stats::oneline);
}
}
void
@@ -251,7 +391,6 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
RequestPtr req1, req2;
req->splitOnVaddr(split_addr, req1, req2);
PacketPtr pkt1 = new Packet(req2, cmd);
PacketPtr pkt2 = new Packet(req1, cmd);
@@ -297,34 +436,22 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
}
}
bool
Shader::busy()
{
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
if (!cuList[i_cu]->isDone()) {
return true;
}
}
return false;
}
void
Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
Shader::ScheduleAdd(int *val,Tick when,int x)
{
sa_val.push_back(val);
sa_when.push_back(tick_cnt + when);
when += curTick();
sa_when.push_back(when);
sa_x.push_back(x);
++sa_n;
}
void
Shader::processTick()
{
if (busy()) {
exec();
schedule(tickEvent, curTick() + ticks(1));
if (!tickEvent.scheduled() || (when < tickEvent.when())) {
DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
"%lu\n", when);
reschedule(tickEvent, when, true);
} else {
assert(tickEvent.scheduled());
DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
"%lu\n", when);
}
}
@@ -356,7 +483,8 @@ void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
bool suppress_func_errors)
{
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
suppress_func_errors);
}
void
@@ -385,15 +513,11 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
pkt->senderState =
new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
if (cu_id == n_cu) {
dispatcher->tlbPort->sendFunctional(pkt);
} else {
// even when the perLaneTLB flag is turned on
// it's ok tp send all accesses through lane 0
// since the lane # is not known here,
// This isn't important since these are functional accesses.
cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
}
// even when the perLaneTLB flag is turned on
// it's ok tp send all accesses through lane 0
// since the lane # is not known here,
// This isn't important since these are functional accesses.
cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
/* safe_cast the senderState */
TheISA::GpuTLB::TranslationState *sender_state =
@@ -402,3 +526,82 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
delete sender_state->tlbEntry;
delete pkt->senderState;
}
/*
* allow the shader to sample stats from constituent devices
*/
void
Shader::sampleStore(const Tick accessTime)
{
storeLatencyDist.sample(accessTime);
allLatencyDist.sample(accessTime);
}
/*
* allow the shader to sample stats from constituent devices
*/
void
Shader::sampleLoad(const Tick accessTime)
{
loadLatencyDist.sample(accessTime);
allLatencyDist.sample(accessTime);
}
void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
// Only sample instructions that go all the way to main memory
if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
return;
}
Tick t1 = roundTripTime[0];
Tick t2 = roundTripTime[1];
Tick t3 = roundTripTime[2];
Tick t4 = roundTripTime[3];
Tick t5 = roundTripTime[4];
initToCoalesceLatency.sample(t2-t1);
rubyNetworkLatency.sample(t3-t2);
gmEnqueueLatency.sample(t4-t3);
gmToCompleteLatency.sample(t5-t4);
}
void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
{
coalsrLineAddresses.sample(lineMap.size());
std::vector<Tick> netTimes;
// For each cache block address generated by a vmem inst, calculate
// the round-trip time for that cache block.
for (auto& it : lineMap) {
const std::vector<Tick>& timeVec = it.second;
if (timeVec.size() == 2) {
netTimes.push_back(timeVec[1] - timeVec[0]);
}
}
// Sort the cache block round trip times so that the first
// distrubtion is always measuring the fastests and the last
// distrubtion is always measuring the slowest cache block.
std::sort(netTimes.begin(), netTimes.end());
// Sample the round trip time for each N cache blocks into the
// Nth distribution.
int idx = 0;
for (auto& time : netTimes) {
cacheBlockRoundTrip[idx].sample(time);
++idx;
}
}
void
Shader::notifyCuSleep() {
// If all CUs attached to his shader are asleep, update shaderActiveTicks
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
"Invalid activeCu size\n");
_activeCus--;
if (!_activeCus)
shaderActiveTicks += curTick() - _lastInactiveTick;
}

View File

@@ -14,9 +14,9 @@
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -30,7 +30,7 @@
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
* Authors: Steve Reinhardt
*/
#ifndef __SHADER_HH__
@@ -47,11 +47,11 @@
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "enums/MemType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
@@ -61,7 +61,8 @@
#include "sim/sim_object.hh"
class BaseTLB;
class GpuDispatcher;
class GPUCommandProcessor;
class GPUDispatcher;
namespace TheISA
{
@@ -70,36 +71,144 @@ namespace TheISA
static const int LDS_SIZE = 65536;
// aperture (APE) registers define the base/limit
// pair for the ATC mapped memory space. currently
// the only APEs we consider are for GPUVM/LDS/scratch.
// the APEs are registered with unique values based
// on a per-device basis
struct ApertureRegister
{
Addr base;
Addr limit;
};
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
protected:
// Shader's clock period in terms of number of ticks of curTime,
// aka global simulation clock
Tick clock;
private:
ApertureRegister _gpuVmApe;
ApertureRegister _ldsApe;
ApertureRegister _scratchApe;
Addr shHiddenPrivateBaseVmid;
// Number of active Cus attached to this shader
int _activeCus;
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
// some stats for measuring latency
Stats::Distribution allLatencyDist;
Stats::Distribution loadLatencyDist;
Stats::Distribution storeLatencyDist;
// average ticks from vmem inst initiateAcc to coalescer issue,
// average ticks from coalescer issue to coalescer hit callback,
// average ticks from coalescer hit callback to GM pipe enqueue,
// and average ticks spent in GM pipe's ordered resp buffer.
Stats::Distribution initToCoalesceLatency;
Stats::Distribution rubyNetworkLatency;
Stats::Distribution gmEnqueueLatency;
Stats::Distribution gmToCompleteLatency;
// average number of cache blocks requested by vmem inst, and
// average ticks for cache blocks to main memory for the Nth
// cache block generated by a vmem inst.
Stats::Distribution coalsrLineAddresses;
Stats::Distribution *cacheBlockRoundTrip;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
// clock related functions ; maps to-and-from
// Simulation ticks and shader clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
Tick getClock() const { return clock; }
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
GPUDispatcher &dispatcher();
void sampleLoad(const Tick accessTime);
void sampleStore(const Tick accessTime);
void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
void sampleLineRoundTrip(const std::map<Addr,
std::vector<Tick>> &roundTripTime);
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
void processTick();
const ApertureRegister&
gpuVmApe() const
{
return _gpuVmApe;
}
const ApertureRegister&
ldsApe() const
{
return _ldsApe;
}
const ApertureRegister&
scratchApe() const
{
return _scratchApe;
}
bool
isGpuVmApe(Addr addr) const
{
bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
return is_gpu_vm;
}
bool
isLdsApe(Addr addr) const
{
bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
return is_lds;
}
bool
isScratchApe(Addr addr) const
{
bool is_scratch
= addr >= _scratchApe.base && addr <= _scratchApe.limit;
return is_scratch;
}
Addr
getScratchBase()
{
return _scratchApe.base;
}
Addr
getHiddenPrivateBase()
{
return shHiddenPrivateBaseVmid;
}
void
initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
{
Addr sh_hidden_base_new = queueBase - offset;
// We are initializing sh_hidden_private_base_vmid from the
// amd queue descriptor from the first queue.
// The sh_hidden_private_base_vmid is supposed to be same for
// all the queues from the same process
if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
// Do not panic if shHiddenPrivateBaseVmid == 0,
// that is if it is uninitialized. Panic only
// if the value is initialized and we get
// a different base later.
panic_if(shHiddenPrivateBaseVmid != 0,
"Currently we support only single process\n");
}
shHiddenPrivateBaseVmid = sh_hidden_base_new;
}
EventFunctionWrapper tickEvent;
// is this simulation going to be timing mode in the memory?
@@ -108,30 +217,18 @@ class Shader : public ClockedObject
// If set, issue acq packet @ kernel launch
int impl_kern_boundary_sync;
// If set, generate a separate packet for acquire/release on
// ld_acquire/st_release/atomic operations
int separate_acquire_release;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace
int trace_vgpr_all;
// Number of cu units in the shader
int n_cu;
// Number of wavefront slots per cu
// Number of wavefront slots per SIMD per CU
int n_wf;
// The size of global memory
int globalMemSize;
/*
* Bytes/work-item for call instruction
* The number of arguments for an hsail function will
* vary. We simply determine the maximum # of arguments
* required by any hsail function up front before the
* simulation (during parsing of the Brig) and record
* that number here.
*/
int funcargs_size;
// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;
@@ -139,7 +236,7 @@ class Shader : public ClockedObject
uint32_t sa_n;
// Pointer to value to be incremented
std::vector<uint32_t*> sa_val;
std::vector<int*> sa_val;
// When to do the increment
std::vector<uint64_t> sa_when;
// Amount to increment by
@@ -148,24 +245,29 @@ class Shader : public ClockedObject
// List of Compute Units (CU's)
std::vector<ComputeUnit*> cuList;
uint64_t tick_cnt;
uint64_t box_tick_cnt;
uint64_t start_tick_cnt;
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
GpuDispatcher *dispatcher;
/**
* Statistics
*/
Stats::Scalar shaderActiveTicks;
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
void regStats();
int max_valu_insts;
int total_valu_insts;
Shader(const Params *p);
~Shader();
virtual void init();
// Run shader
void exec();
// Check to see if shader is busy
bool busy();
// Run shader scheduled adds
void execScheduledAdds();
// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(uint32_t *val, Tick when, int x);
void ScheduleAdd(int *val, Tick when, int x);
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
@@ -190,12 +292,15 @@ class Shader : public ClockedObject
cuList[cu_id] = compute_unit;
}
void handshake(GpuDispatcher *dispatcher);
bool dispatch_workgroups(NDRange *ndr);
void prepareInvalidate(HSAQueueEntry *task);
void prepareFlush(GPUDynInstPtr gpuDynInst);
bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void hostWakeUp(BaseCPU *cpu);
void notifyCuSleep();
};
#endif // __SHADER_HH__

View File

@@ -35,6 +35,12 @@
#include "base/logging.hh"
// Factory method invoked by the python configuration system to
// instantiate a SimplePoolManager from its generated Params object.
SimplePoolManager *
SimplePoolManagerParams::create()
{
    return new SimplePoolManager(this);
}
// return the min number of elements that the manager can reserve given
// a request for "size" elements
uint32_t
@@ -64,8 +70,6 @@ SimplePoolManager::printRegion()
// Return true if the pool can satisfy a request of numRegions regions of
// "size" elements each. The simple manager supports only one reservation
// at a time, so allocation is possible only when no group currently holds
// a region. The assert guards against requests that could never fit in
// the pool at all.
bool
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
{
    assert(numRegions * minAllocatedElements(size) <= poolSize());

    return _reservedGroups == 0;
}

View File

@@ -38,14 +38,15 @@
#include <cstdint>
#include "gpu-compute/pool_manager.hh"
#include "params/SimplePoolManager.hh"
// Simple Pool Manager: allows one region per pool. No region merging is
// supported.
class SimplePoolManager : public PoolManager
{
public:
SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
: PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
SimplePoolManager(const PoolManagerParams *p)
: PoolManager(p), _regionSize(0), _nxtFreeIdx(0),
_reservedGroups(0)
{
}
@@ -62,7 +63,7 @@ class SimplePoolManager : public PoolManager
// be reserved)
uint32_t _regionSize;
// next index to allocate a region
uint8_t _nxtFreeIdx;
int _nxtFreeIdx;
// number of groups that reserve a region
uint32_t _reservedGroups;
};

View File

@@ -0,0 +1,188 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#include "gpu-compute/static_register_manager_policy.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPURename.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/pool_manager.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
// The static policy keeps no state of its own: all bookkeeping lives in
// the wavefront and the per-SIMD pool managers, so the ctor is empty.
StaticRegisterManagerPolicy::StaticRegisterManagerPolicy()
{
}
// Per-cycle hook from the register manager. The static policy performs
// all of its work at allocate/free time, so there is nothing to do here.
void
StaticRegisterManagerPolicy::exec()
{
}
/**
 * Map an architected VGPR index of wavefront w to a physical VGPR index.
 *
 * The wavefront's VGPRs occupy a contiguous region of the physical VRF
 * starting at w->startVgprIndex; the region may wrap around the end of
 * the register file, hence the final modulo.
 *
 * @param w          wavefront owning the register
 * @param vgprIndex  architected VGPR index, must be in
 *                   [0, w->reservedVectorRegs)
 * @return physical VGPR index into the SIMD unit's VRF
 */
int
StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex)
{
    // fix: terminate the message with '\n' like every other panic/fatal
    // message in this file (mapSgpr, allocateRegisters, freeRegisters)
    panic_if((vgprIndex >= w->reservedVectorRegs)
        || (w->reservedVectorRegs < 0),
        "VGPR index %d is out of range: VGPR range=[0,%d]\n",
        vgprIndex, w->reservedVectorRegs);

    // add the offset from where the VGPRs of the wavefront have been assigned
    int physicalVgprIndex = w->startVgprIndex + vgprIndex;

    // the pre-wrap physical index must fall inside the wavefront's region
    panic_if(!((w->startVgprIndex <= physicalVgprIndex) &&
        (w->startVgprIndex + w->reservedVectorRegs - 1)
        >= physicalVgprIndex),
        "Invalid VGPR index %d\n", physicalVgprIndex);

    // calculate physical VGPR index, wrapping around the end of the VRF
    return physicalVgprIndex % w->computeUnit->vrf[w->simdId]->numRegs();
}
/**
 * Map an architected SGPR index of wavefront w to a physical SGPR index.
 *
 * The wavefront's SGPRs occupy a contiguous region of the physical SRF
 * starting at w->startSgprIndex; the region may wrap around the end of
 * the register file, hence the final modulo.
 *
 * @param w          wavefront owning the register
 * @param sgprIndex  architected SGPR index, must be in
 *                   [0, w->reservedScalarRegs)
 * @return physical SGPR index into the SIMD unit's SRF
 */
int
StaticRegisterManagerPolicy::mapSgpr(Wavefront* w, int sgprIndex)
{
    // reject indices outside the reserved region (direct form of the
    // negated range check)
    panic_if(sgprIndex >= w->reservedScalarRegs
        || w->reservedScalarRegs <= 0,
        "SGPR index %d is out of range: SGPR range=[0,%d]\n",
        sgprIndex, w->reservedScalarRegs);

    // offset into the physical SRF where this wavefront's SGPRs start
    const int physSgprIdx = w->startSgprIndex + sgprIndex;

    // the pre-wrap physical index must fall inside the wavefront's region
    panic_if(physSgprIdx < w->startSgprIndex
        || physSgprIdx > w->startSgprIndex + w->reservedScalarRegs - 1,
        "Invalid SGPR index %d\n", physSgprIdx);

    // wrap around the end of the SRF if the region does
    return physSgprIdx % w->computeUnit->srf[w->simdId]->numRegs();
}
// Return true if the VRF pool manager of the given SIMD unit can reserve
// a region for nWfs wavefronts, each demanding demandPerWf VGPRs.
// Delegates entirely to the per-SIMD pool manager.
bool
StaticRegisterManagerPolicy::canAllocateVgprs(int simdId, int nWfs,
                                              int demandPerWf)
{
    return cu->registerManager->vrfPoolMgrs[simdId]->
        canAllocate(nWfs, demandPerWf);
}
// Return true if the SRF pool manager of the given SIMD unit can reserve
// a region for nWfs wavefronts, each demanding demandPerWf SGPRs.
// Delegates entirely to the per-SIMD pool manager.
bool
StaticRegisterManagerPolicy::canAllocateSgprs(int simdId, int nWfs,
                                              int demandPerWf)
{
    return cu->registerManager->srfPoolMgrs[simdId]->
        canAllocate(nWfs, demandPerWf);
}
/**
 * Reserve vector (and, if requested, scalar) register regions for
 * wavefront w. The pool manager chooses the region's start index and
 * reports the size it actually reserved through allocatedSize; both are
 * recorded on the wavefront. CU-wide reservation counters are updated
 * and checked against the physical register file capacity.
 *
 * @param w             wavefront receiving the registers
 * @param vectorDemand  number of VGPRs requested
 * @param scalarDemand  number of SGPRs requested (0 means no SGPRs)
 */
void
StaticRegisterManagerPolicy::allocateRegisters(Wavefront *w, int vectorDemand,
                                               int scalarDemand)
{
    uint32_t allocatedSize = 0;
    w->startVgprIndex = cu->registerManager->vrfPoolMgrs[w->simdId]->
        allocateRegion(vectorDemand, &allocatedSize);
    // record what was actually reserved, which may exceed the demand
    // depending on the pool manager's minimum allocation granularity
    w->reservedVectorRegs = allocatedSize;
    cu->vectorRegsReserved[w->simdId] += w->reservedVectorRegs;
    panic_if(cu->vectorRegsReserved[w->simdId] > cu->numVecRegsPerSimd,
             "VRF[%d] has been overallocated %d > %d\n",
             w->simdId, cu->vectorRegsReserved[w->simdId],
             cu->numVecRegsPerSimd);

    // SGPRs are only reserved when the kernel actually uses them
    if (scalarDemand) {
        w->startSgprIndex = cu->registerManager->srfPoolMgrs[w->simdId]->
            allocateRegion(scalarDemand, &allocatedSize);
        w->reservedScalarRegs = allocatedSize;
        cu->scalarRegsReserved[w->simdId] += w->reservedScalarRegs;
        panic_if(cu->scalarRegsReserved[w->simdId] > cu->numScalarRegsPerSimd,
                 "SRF[%d] has been overallocated %d > %d\n",
                 w->simdId, cu->scalarRegsReserved[w->simdId],
                 cu->numScalarRegsPerSimd);
    }
}
/**
 * Release the vector and scalar register regions held by a completed
 * wavefront: decrement the CU-wide reservation counters, return the
 * regions to the pool managers, and mark every freed physical register
 * as not busy. Finally reset the wavefront's reservation bookkeeping.
 *
 * @param w  the completed wavefront whose registers are released
 */
void
StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
{
    // free the vector registers of the completed wavefront
    w->computeUnit->vectorRegsReserved[w->simdId] -= w->reservedVectorRegs;
    // free the scalar registers of the completed wavefront
    w->computeUnit->scalarRegsReserved[w->simdId] -= w->reservedScalarRegs;

    // sanity check: the CU-wide counters must never go negative
    panic_if(w->computeUnit->vectorRegsReserved[w->simdId] < 0,
             "Freeing VRF[%d] registers left %d registers reserved\n",
             w->simdId,
             w->computeUnit->vectorRegsReserved[w->simdId]);
    panic_if(w->computeUnit->scalarRegsReserved[w->simdId] < 0,
             "Freeing SRF[%d] registers left %d registers reserved\n",
             w->simdId,
             w->computeUnit->scalarRegsReserved[w->simdId]);

    // the region may wrap around the end of the VRF, hence the modulo
    // when computing its end index
    int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
        w->computeUnit->vrf[w->simdId]->numRegs();

    w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
        freeRegion(w->startVgprIndex, endIndex);

    // mark/pre-mark all registers as not busy
    for (int i = 0; i < w->reservedVectorRegs; i++) {
        uint32_t physVgprIdx = mapVgpr(w, i);
        w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
    }

    // reset VGPR bookkeeping only after the mapVgpr loop above, which
    // still needs reservedVectorRegs/startVgprIndex to be valid
    w->reservedVectorRegs = 0;
    w->startVgprIndex = 0;

    // repeat for the scalar register file
    endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
        w->computeUnit->srf[w->simdId]->numRegs();

    w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
        freeRegion(w->startSgprIndex, endIndex);

    // mark/pre-mark all registers as not busy
    for (int i = 0; i < w->reservedScalarRegs; i++) {
        uint32_t physSgprIdx = mapSgpr(w, i);
        w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);
    }

    w->reservedScalarRegs = 0;
    w->startSgprIndex = 0;
}
// The static policy defines no statistics of its own; register file and
// pool-manager stats are registered by their owning objects.
void
StaticRegisterManagerPolicy::regStats()
{
}

View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#ifndef __STATIC_REGISTER_MANAGER_POLICY_HH__
#define __STATIC_REGISTER_MANAGER_POLICY_HH__

#include "gpu-compute/register_manager_policy.hh"

class HSAQueueEntry;

/**
 * A register manager policy that statically assigns each wavefront a
 * contiguous region of the physical vector/scalar register files for
 * the wavefront's entire lifetime. Architected register indices are
 * translated to physical ones by adding the region's start offset
 * (mapVgpr/mapSgpr); regions are reserved at dispatch
 * (allocateRegisters) and returned to the per-SIMD pool managers when
 * the wavefront completes (freeRegisters).
 */
class StaticRegisterManagerPolicy : public RegisterManagerPolicy
{
  public:
    StaticRegisterManagerPolicy();

    // per-cycle hook; unused by the static policy
    void exec() override;

    // architected-to-physical register index translation
    int mapVgpr(Wavefront* w, int vgprIndex) override;
    int mapSgpr(Wavefront* w, int sgprIndex) override;

    // capacity queries, forwarded to the per-SIMD pool managers
    bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override;
    bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override;

    // reserve/release a wavefront's register regions
    void allocateRegisters(Wavefront *w, int vectorDemand,
                           int scalarDemand) override;

    void freeRegisters(Wavefront *w) override;

    void regStats() override;
};

#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__

View File

@@ -41,7 +41,6 @@
TLBCoalescer::TLBCoalescer(const Params *p)
: ClockedObject(p),
clock(p->clk_domain->clockPeriod()),
TLBProbesPerCycle(p->probesPerCycle),
coalescingWindow(p->coalescingWindow),
disableCoalescing(p->disableCoalescing),
@@ -317,7 +316,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
//coalesced requests to the TLB
if (!coalescer->probeTLBEvent.scheduled()) {
coalescer->schedule(coalescer->probeTLBEvent,
curTick() + coalescer->ticks(1));
curTick() + coalescer->clockPeriod());
}
return true;
@@ -380,7 +379,7 @@ TLBCoalescer::MemSidePort::recvReqRetry()
// we've received a retry. Schedule a probeTLBEvent
if (!coalescer->probeTLBEvent.scheduled())
coalescer->schedule(coalescer->probeTLBEvent,
curTick() + coalescer->ticks(1));
curTick() + coalescer->clockPeriod());
}
void
@@ -448,7 +447,7 @@ TLBCoalescer::processProbeTLBEvent()
// send the coalesced request for virt_page_addr
if (!memSidePort[0]->sendTimingReq(first_packet)) {
DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
virt_page_addr);
// No need for a retries queue since we are already buffering

View File

@@ -65,13 +65,6 @@ class ThreadContext;
*/
class TLBCoalescer : public ClockedObject
{
protected:
// TLB clock: will inherit clock from shader's clock period in terms
// of nuber of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the
// python config files.
int clock;
public:
typedef TLBCoalescerParams Params;
TLBCoalescer(const Params *p);
@@ -105,7 +98,8 @@ class TLBCoalescer : public ClockedObject
* option is to change it to curTick(), so we coalesce based
* on the receive time.
*/
typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
CoalescingFIFO;
CoalescingFIFO coalescerFIFO;
@@ -143,13 +137,6 @@ class TLBCoalescer : public ClockedObject
void updatePhysAddresses(PacketPtr pkt);
void regStats() override;
// Clock related functions. Maps to-and-from
// Simulation ticks and object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
class CpuSidePort : public SlavePort
{
public:
@@ -171,7 +158,8 @@ class TLBCoalescer : public ClockedObject
virtual void
recvRespRetry()
{
fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
fatal("recvRespRetry() is not implemented in the TLB "
"coalescer.\n");
}
virtual AddrRangeList getAddrRanges() const;

View File

@@ -36,81 +36,21 @@
#include <string>
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/wavefront.hh"
#include "params/VectorRegisterFile.hh"
VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
: SimObject(p),
manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
vgprState(new VecRegisterState())
: RegisterFile(p)
{
fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
regFile.resize(numRegs(), VecRegContainer());
fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
"multiple of VRF size\n");
busy.clear();
busy.resize(numRegsPerSimd, 0);
nxtBusy.clear();
nxtBusy.resize(numRegsPerSimd, 0);
vgprState->init(numRegsPerSimd, p->wfSize);
}
void
VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
{
computeUnit = _computeUnit;
vgprState->setParent(computeUnit);
}
uint8_t
VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
{
uint8_t status = nxtBusy.at(idx);
if (operandSize > 4) {
status = status | (nxtBusy.at((idx + 1) % numRegs()));
}
return status;
}
uint8_t
VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
{
uint8_t status = busy.at(idx);
if (operandSize > 4) {
status = status | (busy.at((idx + 1) % numRegs()));
}
return status;
}
void
VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
{
nxtBusy.at(regIdx) = value;
if (operandSize > 4) {
nxtBusy.at((regIdx + 1) % numRegs()) = value;
}
}
void
VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
{
busy.at(regIdx) = value;
if (operandSize > 4) {
busy.at((regIdx + 1) % numRegs()) = value;
for (auto &reg : regFile) {
reg.zero();
}
}
@@ -118,127 +58,154 @@ bool
VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i)) {
uint32_t vgprIdx = ii->getRegisterIndex(i, ii);
uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
if (ii->isVectorRegister(i) && ii->isSrcOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
w->numTimesBlockedDueRAWDependencies++;
// determine number of registers
int nRegs =
ii->getOperandSize(i) <= 4 ? 1 : ii->getOperandSize(i) / 4;
for (int j = 0; j < nRegs; j++) {
int pVgpr = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
if (regBusy(pVgpr)) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pVgpr);
w->numTimesBlockedDueRAWDependencies++;
}
return false;
}
return false;
}
if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
w->numTimesBlockedDueRAWDependencies++;
}
return false;
}
}
}
return true;
}
void
VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
VectorRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
bool loadInstr = ii->isLoad();
bool atomicInstr = ii->isAtomic() || ii->isMemFence();
bool loadNoArgInstr = loadInstr && !ii->isArgLoad();
// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
ii->getOperandSize(i), 1);
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
ii->getOperandSize(i) / 4;
// mark the destination vector register as busy
markReg(physReg, ii->getOperandSize(i), 1);
// clear the in-flight status of the destination vector register
preMarkReg(physReg, ii->getOperandSize(i), 0);
for (int j = 0; j < nRegs; ++j) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
// FIXME: if we ever model correct timing behavior
// for load argument instructions then we should not
// set the destination register as busy now but when
// the data returns. Loads and Atomics should free
// their destination registers when the data returns,
// not now
if (!atomicInstr && !loadNoArgInstr) {
uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
computeUnit->spBypassLength() :
computeUnit->dpBypassLength();
// schedule an event for marking the register as ready
computeUnit->registerEvent(w->simdId, physReg,
ii->getOperandSize(i),
computeUnit->shader->tick_cnt +
computeUnit->shader->ticks(pipeLen),
0);
// If instruction is atomic instruction and
// the atomics do not return value, then
// do not mark this reg as busy.
if (!(ii->isAtomic() && !ii->isAtomicRet())) {
/**
* if the instruction is a load with EXEC = 0, then
* we do not mark the reg. we do this to avoid a
* deadlock that can occur because a load reserves
* its destination regs before checking its exec mask,
* and in the case it is 0, it will not send/recv any
* packets, and therefore it will never free its dest
* reg(s).
*/
if (!ii->isLoad() || (ii->isLoad()
&& ii->exec_mask.any())) {
markReg(physReg, true);
}
}
}
}
}
}
int
VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
std::vector<uint32_t> &regVec, uint32_t operandSize,
uint64_t timestamp)
void
VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
int delay = 0;
// increment count of number of DWORDs read from VRF
int DWORDs = ii->numSrcVecDWORDs();
registerReads += (DWORDs * w->execMask().count());
panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
regVec.size());
for (int i = 0; i < regVec.size(); ++i) {
// mark the destination VGPR as free when the timestamp expires
computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
computeUnit->shader->tick_cnt + timestamp +
computeUnit->shader->ticks(delay), 0);
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramReads += DWORDs;
}
mask = mask >> 4;
}
return delay;
}
if (!ii->isLoad()
&& !(ii->isAtomic() || ii->isMemSync())) {
int opSize = 4;
for (int i = 0; i < ii->getNumOperands(); i++) {
if (ii->getOperandSize(i) > opSize) {
opSize = ii->getOperandSize(i);
}
}
Cycles delay(opSize <= 4 ? computeUnit->spBypassLength()
: computeUnit->dpBypassLength());
Tick tickDelay = computeUnit->cyclesToTicks(delay);
void
VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
{
// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
ii->getOperandSize(i), 1);
// set the in-flight status of the destination vector register
preMarkReg(physReg, ii->getOperandSize(i), 1);
for (int i = 0; i < ii->getNumOperands(); i++) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1
: ii->getOperandSize(i) / 4;
for (int j = 0; j < nRegs; j++) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
enqRegFreeEvent(physReg, tickDelay);
}
}
}
// increment count of number of DWORDs written to VRF
DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * w->execMask().count());
mask = w->execMask().to_ullong();
srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
}
bool
VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
GPUDynInstPtr ii,
VrfAccessType accessType)
void
VectorRegisterFile::scheduleWriteOperandsFromLoad(
Wavefront *w, GPUDynInstPtr ii)
{
bool ready = true;
assert(ii->isLoad() || ii->isAtomicRet());
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
ii->getOperandSize(i) / 4;
return ready;
}
for (int j = 0; j < nRegs; ++j) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
}
}
}
// increment count of number of DWORDs written to VRF
int DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * ii->exec_mask.count());
bool
VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
VrfAccessType accessType)
{
bool ready = true;
return ready;
uint64_t mask = ii->exec_mask.to_ullong();
int srams = ii->exec_mask.size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
VectorRegisterFile*

View File

@@ -34,111 +34,76 @@
#ifndef __VECTOR_REGISTER_FILE_HH__
#define __VECTOR_REGISTER_FILE_HH__
#include <list>
#include "base/statistics.hh"
#include "base/trace.hh"
#include "base/types.hh"
#include "arch/gpu_isa.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/vector_register_state.hh"
#include "sim/sim_object.hh"
class ComputeUnit;
class Shader;
class SimplePoolManager;
class Wavefront;
#include "gpu-compute/register_file.hh"
#include "gpu-compute/wavefront.hh"
struct VectorRegisterFileParams;
enum class VrfAccessType : uint8_t
{
READ = 0x01,
WRITE = 0x02,
RD_WR = READ | WRITE
};
// Vector Register File
class VectorRegisterFile : public SimObject
class VectorRegisterFile : public RegisterFile
{
public:
using VecRegContainer = TheGpuISA::VecRegContainerU32;
VectorRegisterFile(const VectorRegisterFileParams *p);
~VectorRegisterFile() { }
void setParent(ComputeUnit *_computeUnit);
virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
virtual void scheduleWriteOperands(Wavefront *w,
GPUDynInstPtr ii) override;
virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
GPUDynInstPtr ii) override;
virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;
// Read a register
template<typename T>
T
read(int regIdx, int threadId=0)
void
setParent(ComputeUnit *_computeUnit) override
{
T p0 = vgprState->read<T>(regIdx, threadId);
DPRINTF(GPUVRF, "reading vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)p0);
RegisterFile::setParent(_computeUnit);
}
return p0;
// Read a register that is writeable (e.g., a DST operand)
VecRegContainer&
readWriteable(int regIdx)
{
return regFile[regIdx];
}
// Read a register that is not writeable (e.g., src operand)
const VecRegContainer&
read(int regIdx) const
{
return regFile[regIdx];
}
// Write a register
template<typename T>
void
write(int regIdx, T value, int threadId=0)
write(int regIdx, const VecRegContainer &value)
{
DPRINTF(GPUVRF, "writing vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)value);
vgprState->write<T>(regIdx, value, threadId);
regFile[regIdx] = value;
}
uint8_t regBusy(int idx, uint32_t operandSize) const;
uint8_t regNxtBusy(int idx, uint32_t operandSize) const;
int numRegs() const { return numRegsPerSimd; }
void markReg(int regIdx, uint32_t operandSize, uint8_t value);
void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);
virtual void exec(GPUDynInstPtr ii, Wavefront *w);
virtual int exec(uint64_t dynamic_id, Wavefront *w,
std::vector<uint32_t> &regVec, uint32_t operandSize,
uint64_t timestamp);
bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
virtual void updateEvents() { }
virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);
virtual bool
isReadConflict(int memWfId, int exeWfId) const
void
printReg(Wavefront *wf, int regIdx) const
{
return false;
#ifndef NDEBUG
const auto &vec_reg_cont = regFile[regIdx];
auto vgpr = vec_reg_cont.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
DPRINTF(GPUVRF, "WF[%d][%d]: WV[%d] v[%d][%d] = %#x\n",
wf->simdId, wf->wfSlotId, wf->wfDynId, regIdx, lane,
vgpr[lane]);
}
}
#endif
}
virtual bool
isWriteConflict(int memWfId, int exeWfId) const
{
return false;
}
virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
GPUDynInstPtr ii,
VrfAccessType accessType);
virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
VrfAccessType accessType);
SimplePoolManager *manager;
protected:
ComputeUnit* computeUnit;
int simdId;
// flag indicating if a register is busy
std::vector<uint8_t> busy;
// flag indicating if a register will be busy (by instructions
// in the SIMD pipeline)
std::vector<uint8_t> nxtBusy;
// numer of registers (bank size) per simd unit (bank)
int numRegsPerSimd;
// vector register state
VecRegisterState *vgprState;
private:
std::vector<VecRegContainer> regFile;
};
#endif // __VECTOR_REGISTER_FILE_HH__

File diff suppressed because it is too large Load Diff

View File

@@ -31,161 +31,116 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__
#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <stack>
#include <unordered_map>
#include <vector>
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"
static const int MAX_NUM_INSTS_PER_WF = 12;
/**
* A reconvergence stack entry conveys the necessary state to implement
* control flow divergence.
*/
struct ReconvergenceStackEntry {
/**
* PC of current instruction.
*/
uint32_t pc;
/**
* PC of the immediate post-dominator instruction, i.e., the value of
* @a pc for the first instruction that will be executed by the wavefront
* when a reconvergence point is reached.
*/
uint32_t rpc;
/**
* Execution mask.
*/
VectorMask execMask;
};
/*
* Arguments for the hsail opcode call, are user defined and variable length.
* The hardware/finalizer can support arguments in hardware or use memory to
* pass arguments. For now, let's assume that an unlimited number of arguments
* are supported in hardware (the compiler inlines functions whenver it can
* anyways, so unless someone is interested in the implications of linking/
* library functions, I think this is a reasonable assumption given the typical
* size of an OpenCL kernel).
*
* Note that call args are different than kernel arguments:
* * All work-items in a kernel refer the same set of kernel arguments
* * Each work-item has it's on set of call args. So a call argument at
* address 0x4 is different for work-item 0 and work-item 1.
*
* Ok, the table below shows an example of how we organize the call arguments in
* the CallArgMem class.
*
* int foo(int arg1, double arg2)
* ___________________________________________________
* | 0: return.0 | 4: return.1 | ... | 252: return.63 |
* |---------------------------------------------------|
* | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
* |---------------------------------------------------|
* | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
* ___________________________________________________
*/
/**
 * CallArgMem is a per-wavefront scratch buffer holding the arguments
 * (and return values) of an HSAIL function call. Storage is laid out
 * argument-major: all lanes' copies of the value at offset 0, then all
 * lanes' copies of the next value, and so on (see the layout table in
 * the comment above this class).
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments
    uint8_t *mem;
    // number of lanes (work-items) in the wavefront
    int wfSize;
    // size of function args (per work-item, in bytes)
    int funcArgsSizePerItem;

    /**
     * Byte offset, within @a mem, of @p lane's copy of the call-arg
     * value whose lane-0 offset is @p addr.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // use new[] rather than malloc() so an allocation failure
        // throws instead of silently yielding a null pointer
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // this class uniquely owns its buffer; an implicit copy would
    // lead to a double free in the destructor, so forbid copying
    CallArgMem(const CallArgMem&) = delete;
    CallArgMem& operator=(const CallArgMem&) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /** Address of @p lane's copy of the value at lane-0 offset @p addr. */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /** Store @p val as @p lane's copy of the value at lane-0 offset @p addr. */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
class Wavefront : public SimObject
{
public:
enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
// Base pointer for array of instruction pointers
uint64_t basePtr;
enum status_e {
// wavefront is stalled
S_STOPPED,
// wavefront is returning from a kernel
S_RETURNING,
// wavefront is running normally
S_RUNNING,
// wavefront is stalled
S_STALLED,
/**
* wavefront has unsatisfied wait counts
*
* while in this state the WF will only execute if
* the oldest instruction is the waitcnt. while in
* S_WAITCNT, the wavefront will not be ready until
* all of its waitcnts have been satisfied. the
* scoreboard ready() function will check the status
* of the waitcnts whenever the WF is in S_WAITCNT,
* and once they are satisfied, it will resume normal
* operation.
*/
S_WAITCNT
};
uint32_t oldBarrierCnt;
uint32_t barrierCnt;
uint32_t barrierId;
uint32_t barrierSlots;
status_e status;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
const int wfSlotId;
int kernId;
// SIMD unit where the WV has been scheduled
int simdId;
const int simdId;
// id of the execution unit (or pipeline) where the oldest instruction
// of the WF is scheduled
int execUnitId;
int flatLmUnitId;
int flatGmUnitId;
// pointer to parent CU
ComputeUnit *computeUnit;
int maxIbSize;
std::deque<GPUDynInstPtr> instructionBuffer;
bool pendingFetch;
bool dropFetch;
// last tick during which all WFs in the CU are not idle
Tick lastNonIdleTick;
// Condition Register State (for HSAIL simulations only)
class ConditionRegisterState *condRegState;
// number of single precision VGPRs required by WF
uint32_t maxSpVgprs;
// number of double precision VGPRs required by WF
uint32_t maxDpVgprs;
// map virtual to physical vector register
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Execution unit resource ID's associated with this WF
// These are static mappings set at WF slot construction and
// based off of the simdId and wfSlotId.
// Index to scalarALUs resource vector in CU
int scalarAlu;
// Indices into readyList/dispatchList of resources used by this
// wavefront
int scalarAluGlobalIdx;
int globalMem;
int localMem;
int scalarMem;
// number of VGPRs required by WF
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);
status_e getStatus() { return status; }
void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
bool isOldestInstALU();
bool isOldestInstVectorALU();
bool isOldestInstScalarALU();
bool isOldestInstScalarMem();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
std::vector<uint32_t> workItemId[3];
@@ -199,36 +154,44 @@ class Wavefront : public SimObject
/* the actual WG size can differ than the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
void computeActualWgSz(NDRange *ndr);
void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
// outstanding global+local memory requests
uint32_t outstandingReqs;
// memory requests between scoreboard
// and execute stage not yet executed
uint32_t memReqsInPipe;
// vector and scalar memory requests pending in memory system
int outstandingReqs;
// outstanding global memory write requests
uint32_t outstandingReqsWrGm;
int outstandingReqsWrGm;
// outstanding local memory write requests
uint32_t outstandingReqsWrLm;
int outstandingReqsWrLm;
// outstanding global memory read requests
uint32_t outstandingReqsRdGm;
int outstandingReqsRdGm;
// outstanding local memory read requests
uint32_t outstandingReqsRdLm;
uint32_t rdLmReqsInPipe;
uint32_t rdGmReqsInPipe;
uint32_t wrLmReqsInPipe;
uint32_t wrGmReqsInPipe;
int outstandingReqsRdLm;
// outstanding scalar memory read requests
int scalarOutstandingReqsRdGm;
// outstanding scalar memory write requests
int scalarOutstandingReqsWrGm;
int rdLmReqsInPipe;
int rdGmReqsInPipe;
int wrLmReqsInPipe;
int wrGmReqsInPipe;
int scalarRdGmReqsInPipe;
int scalarWrGmReqsInPipe;
int memTraceBusy;
uint64_t lastTrace;
// number of vector registers reserved by WF
// number of virtual vector registers reserved by WF
int reservedVectorRegs;
// number of virtual scalar registers reserved by WF
int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
// Index into the Scalar Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startSgprIndex;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
@@ -257,64 +220,63 @@ class Wavefront : public SimObject
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
// A pointer to the spill area
Addr spillBase;
// The size of the spill area
uint32_t spillSizePerItem;
// The vector width of the spill area
uint32_t spillWidth;
// A pointer to the private memory area
Addr privBase;
// The size of the private memory area
uint32_t privSizePerItem;
// A pointer ot the read-only memory area
Addr roBase;
// size of the read-only memory area
uint32_t roSize;
// pointer to buffer for storing kernel arguments
uint8_t *kernelArgs;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
// number of times instruction issue for this wavefront is blocked
// due to VRF port availability
Stats::Scalar numTimesBlockedDueVrfPortAvail;
// Wavefront slot stats
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encounterd by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// distribution of executed instructions based on their register
// operands; this is used to highlight the load on the VRF
Stats::Distribution srcRegOpDist;
Stats::Distribution dstRegOpDist;
// Functions to operate on call argument memory
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
initCallArgMem(int func_args_size_per_item, int wf_size)
{
callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
template<typename CType>
CType
readCallArgMem(int lane, int addr)
{
return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
}
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
template<typename CType>
void
writeCallArgMem(int lane, int addr, CType val)
{
callArgMem->setLaneAddr<CType>(lane, addr, val);
}
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
// context for save/restore
uint8_t *context;
typedef WavefrontParams Params;
Wavefront(const Params *p);
@@ -327,50 +289,31 @@ class Wavefront : public SimObject
computeUnit = cu;
}
void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
void updateResources();
int ready(itype_e type);
bool instructionBufferHasBranch();
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
VectorMask getPred() { return execMask() & initMask; }
bool waitingAtBarrier(int lane);
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
const VectorMask& exec_mask);
void popFromReconvergenceStack();
uint32_t pc() const;
uint32_t rpc() const;
VectorMask execMask() const;
Addr pc() const;
void pc(Addr new_pc);
VectorMask& execMask();
bool execMask(int lane) const;
void pc(uint32_t new_pc);
void discardFetch();
/**
* Returns the size of the static hardware context of a particular wavefront
 * This should be updated every time the context is changed
*/
uint32_t getStaticContextSize() const;
bool waitCntsSatisfied();
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();
/**
* Returns the hardware context as a stream of bytes
* This method is designed for HSAIL execution
*/
void getContext(const void *out);
/**
 * Sets the hardware context from a stream of bytes
* This method is designed for HSAIL execution
*/
void setContext(const void *in);
/** Freeing VRF space */
void freeRegisterFile();
TheGpuISA::GPUISA&
gpuISA()
@@ -380,14 +323,32 @@ class Wavefront : public SimObject
private:
TheGpuISA::GPUISA _gpuISA;
void reserveGmResource(GPUDynInstPtr ii);
void reserveLmResource(GPUDynInstPtr ii);
/**
* Stack containing Control Flow Graph nodes (i.e., kernel instructions)
* to be visited by the wavefront, and the associated execution masks. The
* reconvergence stack grows every time the wavefront reaches a divergence
* point (branch instruction), and shrinks every time the wavefront
* reaches a reconvergence point (immediate post-dominator instruction).
* the following are used for waitcnt instructions
* vmWaitCnt: once set, we wait for the oustanding
* number of vector mem instructions to be
* at, or below vmWaitCnt.
*
* expWaitCnt: once set, we wait for the outstanding
* number outstanding VM writes or EXP
* insts to be at, or below expWaitCnt.
*
* lgkmWaitCnt: once set, we wait for the oustanding
* number of LDS, GDS, scalar memory,
* and message instructions to be at, or
* below lgkmCount. we currently do not
* support GDS/message ops.
*/
std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
status_e status;
Addr _pc;
VectorMask _execMask;
};
#endif // __WAVEFRONT_HH__
#endif // __GPU_COMPUTE_WAVEFRONT_HH__

View File

@@ -86,6 +86,14 @@ MemCmd::commandInfo[] =
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
/* WriteCompleteResp - The WriteCompleteResp command is needed
* because in the GPU memory model we use a WriteResp to indicate
* that a write has reached the cache controller so we can free
 * resources at the coalescer. Later, when the write successfully
* completes we send a WriteCompleteResp to the CU so its wait
* counters can be updated. Wait counters in the CU is how memory
* dependences are handled in the GPU ISA. */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },

View File

@@ -83,6 +83,7 @@ class MemCmd
ReadRespWithInvalidate,
WriteReq,
WriteResp,
WriteCompleteResp,
WritebackDirty,
WritebackClean,
WriteClean, // writes dirty data below without evicting

View File

@@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (in_msg.segment == HSASegment:SPILL) {
trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
} else if (WB) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);

View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
// External declaration of the C++ GPUCoalescer object so SLICC-generated
// cache controllers can call back into it. The overloads with three
// Cycles parameters carry timing information; the trailing bool
// presumably flags a region access — confirm against GPUCoalescer.hh.
structure (GPUCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void evictionCallback(Addr);
  void recordCPReadCallBack(MachineID, MachineID);
  void recordCPWriteCallBack(MachineID, MachineID);
}
// External declaration of the C++ VIPERCoalescer, the GPU_VIPER-protocol
// specialization of GPUCoalescer. In addition to the read/write callback
// overloads, it exposes invCallback and wbCallback — presumably invoked
// on cache-invalidate and writeback completion respectively; confirm
// against VIPERCoalescer.hh.
structure (VIPERCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void invCallback(Addr);
  void wbCallback(Addr);
  void evictionCallback(Addr);
}

View File

@@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc";
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";

View File

@@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") {
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";

View File

@@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
//HSA scopes
enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
UNSPECIFIED, desc="Unspecified scope";
NOSCOPE, desc="Explictly unscoped";
WAVEFRONT, desc="Wavefront scope";
WORKGROUP, desc="Workgroup scope";
DEVICE, desc="Device scope";
SYSTEM, desc="System scope";
}
// HSA segment types
enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
GLOBAL, desc="Global segment";
GROUP, desc="Group segment";
PRIVATE, desc="Private segment";
KERNARG, desc="Kernarg segment";
READONLY, desc="Readonly segment";
SPILL, desc="Spill segment";
ARG, desc="Arg segment";
}
// TesterStatus
enumeration(TesterStatus, desc="...") {

View File

@@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") {
bool checkResourceAvailable(CacheResourceType, Addr);
}
structure (GPUCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void evictionCallback(Addr);
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}
structure (VIPERCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void evictionCallback(Addr);
}
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
HSAScope scope, desc="HSA scope";
HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}

View File

@@ -43,7 +43,6 @@
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"

View File

@@ -35,8 +35,6 @@
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"

View File

@@ -61,58 +61,6 @@
using namespace std;
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
return new GPUCoalescer(this);
}
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
HSAScope accessScope = HSAScope_UNSPECIFIED;
if (req->isScoped()) {
if (req->isWavefrontScope()) {
accessScope = HSAScope_WAVEFRONT;
} else if (req->isWorkgroupScope()) {
accessScope = HSAScope_WORKGROUP;
} else if (req->isDeviceScope()) {
accessScope = HSAScope_DEVICE;
} else if (req->isSystemScope()) {
accessScope = HSAScope_SYSTEM;
} else {
fatal("Bad scope type");
}
}
return accessScope;
}
HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
HSASegment accessSegment = HSASegment_GLOBAL;
if (req->isGlobalSegment()) {
accessSegment = HSASegment_GLOBAL;
} else if (req->isGroupSegment()) {
accessSegment = HSASegment_GROUP;
} else if (req->isPrivateSegment()) {
accessSegment = HSASegment_PRIVATE;
} else if (req->isKernargSegment()) {
accessSegment = HSASegment_KERNARG;
} else if (req->isReadonlySegment()) {
accessSegment = HSASegment_READONLY;
} else if (req->isSpillSegment()) {
accessSegment = HSASegment_SPILL;
} else if (req->isArgSegment()) {
accessSegment = HSASegment_ARG;
} else {
fatal("Bad segment type");
}
return accessSegment;
}
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
@@ -152,6 +100,7 @@ UncoalescedTable::updateResources()
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
instMap.erase(iter++);
coalescer->getGMTokenPort().sendTokens(1);
} else {
@@ -160,15 +109,27 @@ UncoalescedTable::updateResources()
}
}
bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
// iterate the instructions held in UncoalescedTable to see whether there
// are more requests to issue; if yes, not yet done; otherwise, done
for (auto& inst : instMap) {
DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
,inst.first, inst.second.size());
if (inst.first == instSeqNum) { return false; }
}
return true;
}
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
ss << "UncoalescedTable contains " << instMap.size()
<< " address entries." << std::endl;
ss << "Listing pending packets from " << instMap.size() << " instructions";
for (auto& inst : instMap) {
ss << "Addr 0x" << std::hex << inst.first << std::dec
<< " with " << inst.second.size() << " packets"
<< std::endl;
ss << "\tAddr: " << printAddress(inst.first) << " with "
<< inst.second.size() << " pending packets" << std::endl;
}
}
@@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p)
assert(m_dataCache_ptr);
m_runningGarnetStandalone = p->garnet_standalone;
assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
@@ -254,18 +214,9 @@ GPUCoalescer::wakeup()
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
std::stringstream ss;
printRequestTable(ss);
ss << "Outstanding requests: " << m_outstanding_count
<< std::endl;
panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x coalescedTable: %d "
"current time: %u issue_time: %d difference: %d\n"
"Request Tables:\n %s", m_version,
req->getFirstPkt()->getAddr(),
coalescedTable.size(), cyclesToTicks(current_time),
cyclesToTicks(req->getIssueTime()),
cyclesToTicks(current_time - req->getIssueTime()),
ss.str());
warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
m_version, ss.str());
panic("Aborting due to deadlock!\n");
}
}
}
@@ -283,21 +234,27 @@ GPUCoalescer::wakeup()
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
uncoalescedTable.printRequestTable(ss);
ss << "Printing out " << coalescedTable.size()
<< " outstanding requests in the coalesced table\n";
ss << "CoalescedTable contains " << coalescedTable.size()
<< " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
ss << "Addr 0x" << std::hex << requestList.first << std::dec
<< ": type-";
for (auto& request : requestList.second) {
ss << RubyRequestType_to_string(request->getRubyType())
<< " pkts-" << request->getPackets().size()
<< " issued-" << request->getIssueTime() << " seqNum-"
<< request->getSeqNum() << "; ";
ss << "\tAddr: " << printAddress(requestList.first) << "\n"
<< "\tInstruction sequence number: "
<< request->getSeqNum() << "\n"
<< "\t\tType: "
<< RubyRequestType_to_string(request->getRubyType()) << "\n"
<< "\t\tNumber of associated packets: "
<< request->getPackets().size() << "\n"
<< "\t\tIssue time: "
<< request->getIssueTime() * clockPeriod() << "\n"
<< "\t\tDifference from current tick: "
<< (curCycle() - request->getIssueTime()) * clockPeriod();
}
ss << std::endl;
}
// print out packets waiting to be issued in uncoalesced table
uncoalescedTable.printRequestTable(ss);
}
void
@@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address,
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);
// remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();
@@ -398,6 +356,36 @@ GPUCoalescer::writeCallback(Addr address,
}
}
/**
 * Called by the cache system when one Ruby request belonging to the
 * store instruction @p instSeqNum has fully completed in memory. Once
 * every issued request of the instruction has acked and no further
 * requests remain to be issued, the instruction's write-completion
 * response is sent back to the requesting CU.
 */
void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    // BUGFIX: the first format argument must be instSeqNum; previously
    // reqsAllIssued was passed here (and thus printed twice) while the
    // sequence number was never printed at all.
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores() - 1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write
        // completion callbacks for its issued Ruby requests, we can now
        // respond to the requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
@@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(request_address);
Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
RubyRequestType type = crequest->getRubyType();
@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
"%s\n",
RubyRequestType_to_string(type));
}
// If using the RubyTester, update the RubyTester sender state's
// subBlock with the recieved data. The tester will later access
// this state.
// Note: RubyPort will access it's sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>
(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
}
@@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
// Acquire and release packets will have been issued by
// makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}
@@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// Check for GPU Barrier Kernel End or Kernel Begin
// Leave these to be handled by the child class
// Kernel End/Barrier = isFlush + isRelease
// Kernel Begin = isFlush + isAcquire
if (pkt->req->isKernel()) {
if (pkt->req->isAcquire()){
// This is a Kernel Begin leave handling to
// virtual xCoalescer::makeRequest
return RequestStatus_Issued;
}else if (pkt->req->isRelease()) {
// This is a Kernel End leave handling to
// virtual xCoalescer::makeRequest
// If we are here then we didn't call
// a virtual version of this function
// so we will also schedule the callback
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
}
if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
!pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
(pkt->req->isRelease() || pkt->req->isAcquire())) {
if (assumingRfOCoherence) {
// If we reached here, this request must be a memFence
// and the protocol implements RfO, the coalescer can
// assume sequentially consistency and schedule the callback
// immediately.
// Currently the code implements fence callbacks
// by reusing the mechanism for kernel completions.
// This should be fixed.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
} else {
// If not RfO, return issued here and let the child coalescer
// take care of it.
return RequestStatus_Issued;
}
}
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
if (!issueEvent.scheduled())
schedule(issueEvent, curTick());
// TODO: issue hardware prefetches here
// we always return RequestStatus_Issued in this coalescer
// b/c the coalescer's resource was checked earlier and the coalescer is
// queueing up aliased requests in its coalesced table
return RequestStatus_Issued;
}
/**
* TODO: Figure out what do with this code. This code may go away
* and/or be merged into the VIPER coalescer once the VIPER
* protocol is re-integrated with GCN3 codes.
*/
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
@@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest)
}
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/
template <class KEY, class VALUE>
std::ostream &
@@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const
}
void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
DPRINTF(RubyStats, "Recorded statistic: %s\n",
SequencerRequestType_to_string(requestType));
}
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
// be counted as outstanding requests.
m_outstanding_count++;
// We track all issued or to-be-issued Ruby requests associated with
// write instructions. An instruction may have multiple Ruby
// requests.
if (pkt->cmd == MemCmd::WriteReq) {
DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
" the pending write instruction list\n", seqNum,
line_addr);
RubyPort::SenderState* ss =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
// we need to save this port because it will be used to call
// back the requesting CU when we receive write
// complete callbacks for all issued Ruby requests of this
// instruction.
RubyPort::MemSlavePort* mem_slave_port = ss->port;
GPUDynInstPtr gpuDynInst = nullptr;
if (!m_usingRubyTester) {
// If this coalescer is connected to a real CU, we need
// to save the corresponding gpu dynamic instruction.
// CU will use that instruction to decrement wait counters
// in the issuing wavefront.
// For Ruby tester, gpuDynInst == nullptr
ComputeUnit::DataPort::SenderState* cu_state =
safe_cast<ComputeUnit::DataPort::SenderState*>
(ss->predecessor);
gpuDynInst = cu_state->_gpuDynInst;
}
PendingWriteInst& inst = pendingWriteInsts[seqNum];
inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
}
return true;
}
@@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address,
}
}
// Classify where a command-processor (CP) load was serviced and bump the
// matching stat counter: own TCP hit, peer-TCP transfer, TCC hit, or miss.
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        // Serviced by this machine's own TCP.
        CP_TCPLdHits++;
        return;
    }

    const MachineType sender_type = machineIDToMachineType(senderMachID);
    if (sender_type == MachineType_TCP) {
        // Supplied by a peer TCP (cache-to-cache transfer).
        CP_TCPLdTransfers++;
    } else if (sender_type == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        // Anything else counts as a miss in the GPU cache hierarchy.
        CP_LdMiss++;
    }
}
// Classify where a command-processor (CP) store was serviced and bump the
// matching stat counter: own TCP hit, peer-TCP transfer, TCC hit, or miss.
void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        // Serviced by this machine's own TCP.
        CP_TCPStHits++;
        return;
    }

    const MachineType sender_type = machineIDToMachineType(senderMachID);
    if (sender_type == MachineType_TCP) {
        // Supplied by a peer TCP (cache-to-cache transfer).
        CP_TCPStTransfers++;
    } else if (sender_type == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        // Anything else counts as a miss in the GPU cache hierarchy.
        CP_StMiss++;
    }
}
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
@@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
Cycles firstResponseTime,
bool success, bool isRegion)
{
RubyRequestType type = crequest->getRubyType();
Cycles issued_time = crequest->getIssueTime();
Cycles completion_time = curCycle();
assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;
// cache stats (valid for RfO protocol only)
if (mach == MachineType_TCP) {
if (type == RubyRequestType_LD) {
GPU_TCPLdHits++;
} else {
GPU_TCPStHits++;
}
} else if (mach == MachineType_L1Cache_wCC) {
if (type == RubyRequestType_LD) {
GPU_TCPLdTransfers++;
} else {
GPU_TCPStTransfers++;
}
} else if (mach == MachineType_TCC) {
if (type == RubyRequestType_LD) {
GPU_TCCLdHits++;
} else {
GPU_TCCStHits++;
}
} else {
if (type == RubyRequestType_LD) {
GPU_LdMiss++;
} else {
GPU_StMiss++;
}
}
// Profile all access latency, even zero latency accesses
m_latencyHist.sample(total_lat);
m_typeLatencyHist[type]->sample(total_lat);
// Profile the miss latency for all non-zero demand misses
if (total_lat != Cycles(0)) {
m_missLatencyHist.sample(total_lat);
m_missTypeLatencyHist[type]->sample(total_lat);
if (mach != MachineType_NUM) {
m_missMachLatencyHist[mach]->sample(total_lat);
m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
if ((issued_time <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completion_time)) {
m_IssueToInitialDelayHist[mach]->sample(
initialRequestTime - issued_time);
m_InitialToForwardDelayHist[mach]->sample(
forwardRequestTime - initialRequestTime);
m_ForwardToFirstResponseDelayHist[mach]->sample(
firstResponseTime - forwardRequestTime);
m_FirstResponseToCompletionDelayHist[mach]->sample(
completion_time - firstResponseTime);
}
}
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
curTick(), m_version, "Coal",
success ? "Done" : "SC_Failed", "", "",
printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
void
@@ -1085,74 +962,4 @@ GPUCoalescer::regStats()
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
// GPU cache stats
GPU_TCPLdHits
.name(name() + ".gpu_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
GPU_TCPLdTransfers
.name(name() + ".gpu_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
GPU_TCCLdHits
.name(name() + ".gpu_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
GPU_LdMiss
.name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU")
;
GPU_TCPStHits
.name(name() + ".gpu_tcp_st_hits")
.desc("stores that hit in the TCP")
;
GPU_TCPStTransfers
.name(name() + ".gpu_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
GPU_TCCStHits
.name(name() + ".gpu_tcc_st_hits")
.desc("stores that hit in the TCC")
;
GPU_StMiss
.name(name() + ".gpu_st_misses")
.desc("stores that miss in the GPU")
;
// CP cache stats
CP_TCPLdHits
.name(name() + ".cp_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
CP_TCPLdTransfers
.name(name() + ".cp_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
CP_TCCLdHits
.name(name() + ".cp_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
CP_LdMiss
.name(name() + ".cp_ld_misses")
.desc("loads that miss in the GPU")
;
CP_TCPStHits
.name(name() + ".cp_tcp_st_hits")
.desc("stores that hit in the TCP")
;
CP_TCPStTransfers
.name(name() + ".cp_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
CP_TCCStHits
.name(name() + ".cp_tcc_st_hits")
.desc("stores that hit in the TCC")
;
CP_StMiss
.name(name() + ".cp_st_misses")
.desc("stores that miss in the GPU")
;
}

View File

@@ -38,11 +38,11 @@
#include <unordered_map>
#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -57,9 +57,6 @@ class CacheMemory;
class RubyGPUCoalescerParams;
HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
@@ -78,6 +75,7 @@ class UncoalescedTable
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
bool areRequestsDone(const uint64_t instSeqNum);
// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
@@ -120,6 +118,86 @@ class CoalescedRequest
std::vector<PacketPtr> pkts;
};
// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    // Register one more outstanding Ruby request for this instruction.
    // Note: originalPort (and, outside the Ruby tester, gpuDynInstPtr)
    // is overwritten on every call, so the values kept are those of the
    // most recent caller; only one port/inst pair is needed to ack the CU.
    void
    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }
        numPendingStores++;
    }

    // return true if no more ack is expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        // directly return the comparison rather than "? true : false"
        return numPendingStores == 0;
    }

    // ack the original requester that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        // NOTE(review): the packet (and its SenderState, when attached) is
        // heap-allocated here; ownership is presumed to transfer to the
        // receiver of sendTimingResp -- confirm the CU/tester frees it.
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requester
        originalPort->sendTimingResp(pkt);
    }

    // number of stores still awaiting a writeCompleteCallback
    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need
    // only 1 of the ports to call back the CU. Therefore, here we keep
    // track the port that sent the first packet of this instruction.
    RubyPort::MemSlavePort* originalPort;
    // similar to the originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};
class GPUCoalescer : public RubyPort
{
public:
@@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort
void collateStats();
void regStats() override;
// each store request needs two callbacks:
// (1) writeCallback is called when the store is received and processed
// by TCP. This writeCallback does not guarantee the store is actually
// completed at its destination cache or memory. writeCallback helps
// release hardware resources (e.g., its entry in coalescedTable)
// allocated for the store so that subsequent requests will not be
// blocked unnecessarily due to hardware resource constraints.
// (2) writeCompleteCallback is called when the store is fully completed
// at its destination cache or memory. writeCompleteCallback
// guarantees that the store is fully completed. This callback
// will decrement hardware counters in CU
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
@@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime);
void writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach);
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
@@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
/* atomics need their own callback because the data
might be const coming from SLICC */
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
// Alternate implementations in VIPER Coalescer
virtual RequestStatus makeRequest(PacketPtr pkt) override;
RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
bool
@@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Alternate implementations in VIPER Coalescer
virtual void issueRequest(CoalescedRequest* crequest);
void kernelCallback(int wavfront_id);
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
@@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper issueEvent;
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
Cycles m_deadlock_threshold;
@@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort
// an address, the are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
// a map btw an instruction sequence number and PendingWriteInst
// this is used to do a final call back for each write when it is
// completely done in the memory system
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
// m5 style stats for TCP hit/miss counts
Stats::Scalar GPU_TCPLdHits;
Stats::Scalar GPU_TCPLdTransfers;
Stats::Scalar GPU_TCCLdHits;
Stats::Scalar GPU_LdMiss;
Stats::Scalar GPU_TCPStHits;
Stats::Scalar GPU_TCPStTransfers;
Stats::Scalar GPU_TCCStHits;
Stats::Scalar GPU_StMiss;
Stats::Scalar CP_TCPLdHits;
Stats::Scalar CP_TCPLdTransfers;
Stats::Scalar CP_TCCLdHits;
Stats::Scalar CP_LdMiss;
Stats::Scalar CP_TCPStHits;
Stats::Scalar CP_TCPStTransfers;
Stats::Scalar CP_TCCStHits;
Stats::Scalar CP_StMiss;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// // m5 style stats for TCP hit/miss counts
// Stats::Scalar GPU_TCPLdHits;
// Stats::Scalar GPU_TCPLdTransfers;
// Stats::Scalar GPU_TCCLdHits;
// Stats::Scalar GPU_LdMiss;
//
// Stats::Scalar GPU_TCPStHits;
// Stats::Scalar GPU_TCPStTransfers;
// Stats::Scalar GPU_TCCStHits;
// Stats::Scalar GPU_StMiss;
//
// Stats::Scalar CP_TCPLdHits;
// Stats::Scalar CP_TCPLdTransfers;
// Stats::Scalar CP_TCCLdHits;
// Stats::Scalar CP_LdMiss;
//
// Stats::Scalar CP_TCPStHits;
// Stats::Scalar CP_TCPStTransfers;
// Stats::Scalar CP_TCCStHits;
// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// Stats::Distribution numHopDelays;
// Stats::Distribution tcpToTccDelay;
// Stats::Distribution tccToSdDelay;
// Stats::Distribution sdToSdDelay;
// Stats::Distribution sdToTccDelay;
// Stats::Distribution tccToTcpDelay;
//
// Stats::Average avgTcpToTcc;
// Stats::Average avgTccToSd;
// Stats::Average avgSdToSd;
// Stats::Average avgSdToTcc;
// Stats::Average avgTccToTcp;
private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data

View File

@@ -36,6 +36,7 @@ from m5.objects.Sequencer import *
class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"
@@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort):
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
"Ownership coherence");
icache = Param.RubyCache("")
dcache = Param.RubyCache("")

View File

@@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer
VIPERCoalescer(const Params *);
~VIPERCoalescer();
void issueMemSyncRequest(PacketPtr pkt);
void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);

View File

@@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer):
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
assume_rfo = False