Architected flat scratch is added in MI300, which stores the scratch base address in dedicated registers rather than in SGPRs. These registers are used by scratch_ instructions, which are flat instructions that explicitly target the private memory aperture. These instructions have a different address calculation than global_ instructions. This change implements architected flat scratch support, fixes the address calculation of scratch_ instructions, and implements decodings for some scratch_ instructions. Previous flat_ instructions that happen to access the private memory aperture see no change in address calculation. Since scratch_ instructions are identical to flat_ instructions except for address calculation, the decodings simply reuse existing flat_ instruction definitions. Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
1525 lines
53 KiB
C++
1525 lines
53 KiB
C++
/*
|
|
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpu-compute/wavefront.hh"
|
|
|
|
#include "base/bitfield.hh"
|
|
#include "debug/GPUExec.hh"
|
|
#include "debug/GPUInitAbi.hh"
|
|
#include "debug/WavefrontStack.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/register_file_cache.hh"
|
|
#include "gpu-compute/scalar_register_file.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "gpu-compute/simple_pool_manager.hh"
|
|
#include "gpu-compute/vector_register_file.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
/**
 * Construct a wavefront in the S_STOPPED state. All outstanding/in-pipe
 * request counters start at zero and all per-lane bookkeeping vectors are
 * sized to the configured wavefront width (p.wf_size).
 */
Wavefront::Wavefront(const Params &p)
  : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
    maxIbSize(p.max_ib_size), _gpuISA(*this),
    vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
    vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
    sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    lastTrace = 0;
    // No execution unit assigned until an instruction reserves one.
    execUnitId = -1;
    // Waves are created stopped; dispatch transitions them to running.
    status = S_STOPPED;
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    // Counters for memory requests currently outstanding in the memory
    // system, split by read/write and global/local memory.
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    // Counters for requests still in the CU pipeline (not yet issued).
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;
    lastNonIdleTick = 0;
    // LDS space is allocated at workgroup dispatch time.
    ldsChunk = nullptr;

    memTraceBusy = 0;
    // Sentinel "never written" timestamps for old register snapshots.
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p.wf_size);

    pendingFetch = false;
    dropFetch = false;
    maxVgprs = 0;
    maxSgprs = 0;

    // Per-lane state sized to the wavefront width.
    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    // One work-item ID vector per dimension (X, Y, Z).
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p.wf_size);
    }

    // All lanes start active.
    _execMask.set();
    rawDist.clear();
    lastInstExec = 0;
    vecReads.clear();
}
|
|
|
|
/**
 * Late initialization, run once the wavefront's parent compute unit is
 * known: reset register-reservation bookkeeping and look up the
 * execution-unit IDs this wave will use for each instruction class.
 */
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;

    // Cache the unit IDs assigned by the CU's static wave-to-unit mapping.
    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
}
|
|
|
|
/**
 * Initialize the wavefront's architectural register state according to
 * the AMDGPU kernel ABI. SGPRs are seeded from the HSA task's enabled
 * init fields (private segment buffer, dispatch/queue/kernarg pointers,
 * flat scratch, workgroup IDs, ...), and VGPRs are seeded with the
 * per-lane work-item IDs (packed into one VGPR on gfx90a/gfx942,
 * one VGPR per dimension otherwise).
 *
 * @param task the HSA queue entry describing the dispatched kernel
 * @param wgSizeInWorkItems total work-items in this workgroup; used to
 *        derive the number of waves for the WorkgroupInfo field
 */
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // Next virtual SGPR (later VGPR) to map; incremented once per
    // 32-bit register consumed by an init field.
    int regInitIdx = 0;
    gfxVersion = task->gfxVersion();

    // Iterate over all the init fields and check which
    // bits are enabled. Useful information can be found here:
    // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
    // blob/master/AMDGPU-ABI.md
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {

        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
              case PrivateSegBuf:
                // Four consecutive SGPRs hold the 128-bit scratch
                // resource (buffer) descriptor, one dword at a time.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
              case DispatchPtr:
                // 64-bit host dispatch packet address, low dword first.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));

                ++regInitIdx;
                break;
              case QueuePtr:
                // 64-bit host address of the HSA queue (AQL queue struct).
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));

                ++regInitIdx;
                break;
              case KernargSegPtr:
                // 64-bit pointer to the kernel argument segment.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));

                ++regInitIdx;
                break;
              case DispatchId:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                task->dispatchId());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchId: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->dispatchId());

                // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx, 0);
                ++regInitIdx;
                break;
              case FlatScratchInit:
                // First SGPR: low 32 bits of the scratch backing memory
                // address; second SGPR: per-work-item scratch size.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                    (TheGpuISA::ScalarRegU32)(task->amdQueue
                        .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value should be sizeof(DWORD) aligned, that is
                // 4 byte aligned
                computeUnit->srf[simdId]->write(physSgprIdx,
                    task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                /**
                 * Since flat scratch init is needed for this kernel, this
                 * kernel is going to have flat memory instructions and we
                 * need to initialize the hidden private base for this queue.
                 * scratch_resource_descriptor[0] has this queue's scratch
                 * base address. scratch_backing_memory_location has the
                 * offset to this queue's scratch base address from the
                 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
                 * queue's scratch base address for address calculation
                 * (stored in scratch_resource_descriptor[0]). But that
                 * address calculation should be done by first finding the
                 * queue's scratch base address using the calculation
                 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
                 * SH_HIDDEN_PRIVATE_BASE_VMID.
                 *
                 * For more details see:
                 * http://rocm-documentation.readthedocs.io/en/latest/
                 * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
                 *
                 * https://github.com/ROCm-Developer-Tools/
                 * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
                 * #flat-addressing
                 */
                // Base address is the low dword of descriptor word 0 plus
                // the low 16 bits of word 1 (bits 47:32 of the address).
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                    & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                       hidden_priv_base,
                       task->amdQueue.scratch_backing_memory_location);
                break;
              case PrivateSegSize:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                task->privMemPerItem());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting private segment size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->privMemPerItem());
                break;
              case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[0]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
              case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[1]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
              case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[2]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:

                // For architected flat scratch, this enable is reused to set
                // the FLAT_SCRATCH register pair to the scratch backing
                // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
                if (task->gfxVersion() == GfxVersion::gfx942) {
                    Addr arch_flat_scratch =
                        task->amdQueue.scratch_backing_memory_location;
                    computeUnit->srf[simdId]->write(
                        VegaISA::REG_FLAT_SCRATCH_HI,
                        bits(arch_flat_scratch, 63, 32));
                    computeUnit->srf[simdId]->write(
                        VegaISA::REG_FLAT_SCRATCH_LO,
                        bits(arch_flat_scratch, 31, 0));

                    // NOTE: no user SGPR is consumed in this mode, so
                    // regInitIdx is deliberately not incremented.
                    break;
                }

                // Not architected flat scratch. Write the scratch wavefront
                // offset: https://llvm.org/docs/AMDGPUUsage.html
                // #amdgpu-amdhsa-initial-kernel-execution-state
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);

                /**
                 * the compute_tmpring_size_wavesize specifies the number of
                 * kB allocated per wavefront, hence the multiplication by
                 * 1024.
                 *
                 * to get the per wavefront offset into the scratch
                 * memory, we also multiply this by the wfId. the wfId stored
                 * in the Wavefront class, however, is the wave ID within the
                 * WG, whereas here we need the global WFID because the
                 * scratch space will be divided amongst all waves in the
                 * kernel. to get the global ID we multiply the WGID by
                 * the WG size, then add the WFID of the wave within its WG.
                 */
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
              case WorkgroupInfo:
                // Packed word: bit 31 = first wave in WG, bits 6+ =
                // ordered append term, low bits = wave count in WG.
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
              default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    // Save the offset to the first accumulation VGPR number from HSA task.
    accumOffset = task->accumOffset();

    regInitIdx = 0;

    // VGPRs are initialized to the work item IDs for a given thread. There
    // are two ways to initialize the IDs based on number of dimensions. ISAs
    // will either have packed work-item IDs or not. LLVM lists them here:
    // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
    // Default to false and set to true for gem5 supported ISAs.
    bool packed_work_item_id = false;

    if (task->gfxVersion() == GfxVersion::gfx90a ||
        task->gfxVersion() == GfxVersion::gfx942) {
        packed_work_item_id = true;
    }

    // For ISAs with packed work item IDs, only one VGPR is used and the
    // (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10-bits
    // for each dimension
    if (packed_work_item_id) {
        TheGpuISA::VecRegContainerU32 raw_vgpr;
        TheGpuISA::VecElemU32 *packed_vgpr
            = raw_vgpr.as<TheGpuISA::VecElemU32>();

        uint32_t physVgprIdx = computeUnit->registerManager
            ->mapVgpr(this, regInitIdx);
        // X always occupies bits 9:0; Y (19:10) and Z (29:20) only if
        // the corresponding enable bits are set.
        for (int lane = 0; lane < workItemId[0].size(); ++lane) {
            packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
        }
        if (task->vgprBitEnabled(1)) {
            for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
            }
        }
        if (task->vgprBitEnabled(2)) {
            for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
            }
        }
        computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);

        return;
    }

    // For ISAs with non-packed work item IDs, map and initialize one VGPR
    // per dimensions. Do this by iterating over all the init fields and
    // checking which bits are enabled.
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
              case WorkitemIdX:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_x
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                        vgpr_x[lane] = workItemId[0][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdY:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_y
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                        vgpr_y[lane] = workItemId[1][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdZ:
                {
                    physVgprIdx = computeUnit->registerManager->
                        mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_z
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                        vgpr_z[lane] = workItemId[2][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
            }
        }
    }
}
|
|
|
|
/**
 * Record the number of vector and scalar registers this wavefront may
 * use; consumed later (e.g. when sizing vecReads in start()).
 */
void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    maxVgprs = num_vregs;
    maxSgprs = num_sregs;
}
|
|
|
|
// Nothing to release explicitly; members clean themselves up.
Wavefront::~Wavefront() = default;
|
|
|
|
/**
 * Transition this wavefront to a new status, maintaining the compute
 * unit's idle-wave count when the idle-CU timeout is enabled. When the
 * last wave on the CU goes idle, record the tick; when a wave leaves an
 * idle state after the whole CU was idle, panic if the idle period
 * reached the configured timeout (used to detect deadlocks).
 */
void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            // All waves on this CU are now idle; start the idle clock.
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, computeUnit->idleCUTimeout,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}
|
|
|
|
/**
 * Launch the wavefront: assign its dynamic ID and starting PC, mark it
 * running, and size the per-VGPR read counters used for RAW-distance
 * statistics (requires resizeRegFiles() to have set maxVgprs).
 */
void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    wfDynId = _wf_dyn_id;
    _pc = init_pc;

    status = S_RUNNING;

    vecReads.resize(maxVgprs, 0);
}
|
|
|
|
bool
|
|
Wavefront::isGmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isGlobalMem() ||
|
|
(ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isLmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isLocalMem() ||
|
|
(ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstSleep()
|
|
{
|
|
if (instructionBuffer.empty())
|
|
return false;
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (ii->isSleep()) {
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstWaitcnt()
|
|
{
|
|
if (instructionBuffer.empty())
|
|
return false;
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (ii->isWaitcnt()) {
|
|
// waitcnt is a scalar
|
|
assert(ii->isScalar());
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstScalarALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|
|
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
|
|
(ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstVectorALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
|
|
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|
|
|| (ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstBarrier()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isBarrier()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstGMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstScalarMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstLMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isLocalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstPrivMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isPrivateSeg()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstFlatMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isFlat()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::stopFetch()
|
|
{
|
|
for (auto it : instructionBuffer) {
|
|
GPUDynInstPtr ii = it;
|
|
if (ii->isReturn() || ii->isBranch() ||
|
|
ii->isEndOfKernel()) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
 * Release the execution unit reserved for the current instruction by
 * clearing the unit ID (-1 means no unit is held).
 */
void
Wavefront::freeResources()
{
    execUnitId = -1;
}
|
|
|
|
/**
 * Sanity check: none of the in-pipe or outstanding memory request
 * counters may ever go negative; panic with a full dump if one does.
 */
void Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
|
|
|
|
void
|
|
Wavefront::reserveGmResource(GPUDynInstPtr ii)
|
|
{
|
|
if (!ii->isScalar()) {
|
|
if (ii->isLoad()) {
|
|
rdGmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
wrGmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
rdGmReqsInPipe++;
|
|
wrGmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = globalMem;
|
|
} else {
|
|
if (ii->isLoad()) {
|
|
scalarRdGmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
scalarWrGmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
scalarWrGmReqsInPipe++;
|
|
scalarRdGmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = scalarMem;
|
|
}
|
|
}
|
|
|
|
void
|
|
Wavefront::reserveLmResource(GPUDynInstPtr ii)
|
|
{
|
|
fatal_if(ii->isScalar(),
|
|
"Scalar instructions can not access Shared memory!!!");
|
|
if (ii->isLoad()) {
|
|
rdLmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
wrLmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
wrLmReqsInPipe++;
|
|
rdLmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = localMem;
|
|
}
|
|
|
|
/**
 * Reserve the execution unit(s) needed by the oldest buffered
 * instruction and bump the matching request-in-pipe counters. Flat
 * instructions reserve BOTH the local and global memory pipes (the
 * target segment is not known until execution). Returns the list of
 * reserved unit IDs; callers use it for debugging/assertions only.
 */
std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to schedule stage
    // this return is only used for debugging and an assertion...
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add execUnitId, reserved by reserveLmResource, to the list before
        // it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        // Both flat unit IDs are recorded above; clear execUnitId so the
        // generic push below does not add a duplicate.
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}
|
|
|
|
void
|
|
Wavefront::exec()
|
|
{
|
|
// ---- Exit if wavefront is inactive ----------------------------- //
|
|
|
|
if (status == S_STOPPED || status == S_RETURNING ||
|
|
status==S_STALLED || instructionBuffer.empty()) {
|
|
return;
|
|
}
|
|
|
|
if (status == S_WAITCNT) {
|
|
/**
|
|
* if this wave is in S_WAITCNT state, then
|
|
* it should enter exec() precisely one time
|
|
* before the waitcnts are satisfied, in order
|
|
* to execute the waitcnt instruction itself
|
|
* thus we assert that the waitcnt is the
|
|
* oldest instruction. if we enter exec() with
|
|
* active waitcnts, and we're not executing
|
|
* the waitcnt instruction, something must be
|
|
* wrong
|
|
*/
|
|
assert(isOldestInstWaitcnt());
|
|
}
|
|
|
|
// Get current instruction
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
const Addr old_pc = pc();
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
|
|
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
|
|
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
|
|
|
|
ii->execute(ii);
|
|
// delete the dynamic instruction from the pipeline map
|
|
computeUnit->deleteFromPipeMap(this);
|
|
// update the instruction stats in the CU
|
|
computeUnit->updateInstStats(ii);
|
|
|
|
// inform VRF of instruction execution to schedule write-back
|
|
// and scoreboard ready for registers
|
|
if (!ii->isScalar()) {
|
|
computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
|
|
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
|
|
}
|
|
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
|
|
|
|
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
|
|
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
|
|
computeUnit->stats.numInstrExecuted++;
|
|
stats.numInstrExecuted++;
|
|
computeUnit->instExecPerSimd[simdId]++;
|
|
computeUnit->stats.execRateDist.sample(
|
|
computeUnit->stats.totalCycles.value() -
|
|
computeUnit->lastExecCycle[simdId]);
|
|
computeUnit->lastExecCycle[simdId] =
|
|
computeUnit->stats.totalCycles.value();
|
|
|
|
if (lastInstExec) {
|
|
computeUnit->stats.instInterleave[simdId].
|
|
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
|
|
}
|
|
lastInstExec = computeUnit->instExecPerSimd[simdId];
|
|
|
|
// want to track:
|
|
// number of reads that occur per value written
|
|
|
|
// vector RAW dependency tracking
|
|
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
|
|
for (const auto& virtIdx : srcVecOp.virtIndices()) {
|
|
// This check should never fail, but to be safe we check
|
|
if (rawDist.find(virtIdx) != rawDist.end()) {
|
|
stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
|
|
rawDist[virtIdx]);
|
|
}
|
|
// increment number of reads to this register
|
|
vecReads[virtIdx]++;
|
|
}
|
|
}
|
|
|
|
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
|
|
for (const auto& virtIdx : dstVecOp.virtIndices()) {
|
|
// rawDist is set on writes, but will not be set for the first
|
|
// write to each physical register
|
|
if (rawDist.find(virtIdx) != rawDist.end()) {
|
|
// Sample the number of reads that were performed
|
|
stats.readsPerWrite.sample(vecReads[virtIdx]);
|
|
}
|
|
// on a write, reset count of reads to 0
|
|
vecReads[virtIdx] = 0;
|
|
|
|
rawDist[virtIdx] = stats.numInstrExecuted.value();
|
|
}
|
|
}
|
|
|
|
if (pc() == old_pc) {
|
|
// PC not modified by instruction, proceed to next
|
|
_gpuISA.advancePC(ii);
|
|
instructionBuffer.pop_front();
|
|
} else {
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
|
|
ii->disassemble());
|
|
discardFetch();
|
|
}
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
|
|
|
|
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
|
|
const int num_active_lanes = execMask().count();
|
|
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
|
|
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
|
|
|
|
if (ii->isF16() && ii->isALU()) {
|
|
if (ii->isF32() || ii->isF64()) {
|
|
fatal("Instruction is tagged as both (1) F16, and (2)"
|
|
"either F32 or F64.");
|
|
}
|
|
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
}
|
|
if (ii->isF32() && ii->isALU()) {
|
|
if (ii->isF16() || ii->isF64()) {
|
|
fatal("Instruction is tagged as both (1) F32, and (2)"
|
|
"either F16 or F64.");
|
|
}
|
|
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
}
|
|
if (ii->isF64() && ii->isALU()) {
|
|
if (ii->isF16() || ii->isF32()) {
|
|
fatal("Instruction is tagged as both (1) F64, and (2)"
|
|
"either F16 or F32.");
|
|
}
|
|
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
}
|
|
if (isGmInstruction(ii)) {
|
|
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
|
|
num_active_lanes);
|
|
} else if (isLmInstruction(ii)) {
|
|
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
|
|
num_active_lanes);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* we return here to avoid spurious errors related to flat insts
|
|
* and their address segment resolution.
|
|
*/
|
|
if (execMask().none() && ii->needsToken()) {
|
|
computeUnit->getTokenManager()->recvTokens(1);
|
|
return;
|
|
}
|
|
|
|
// Update Vector ALU pipeline and other resources
|
|
bool flat_as_gm = false;
|
|
bool flat_as_lm = false;
|
|
if (ii->isFlat()) {
|
|
flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
|
|
(ii->executedAs() == enums::SC_PRIVATE);
|
|
flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
|
|
}
|
|
|
|
// Single precision ALU or Branch or Return or Special instruction
|
|
// Note, we use the same timing regardless of SP or DP ALU operation.
|
|
if (ii->isALU() || ii->isSpecialOp() ||
|
|
ii->isBranch() || ii->isNop() ||
|
|
(ii->isKernArgSeg() && ii->isLoad()) ||
|
|
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
|
|
// this is to enforce a fixed number of cycles per issue slot per SIMD
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vectorALUs[simdId].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
} else {
|
|
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
}
|
|
// Barrier on Scalar ALU
|
|
} else if (ii->isBarrier()) {
|
|
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
// GM or Flat as GM Load
|
|
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(
|
|
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
|
computeUnit->vrf_gm_bus_latency;
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(computeUnit->srf_scm_bus_latency));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
|
computeUnit->srf_scm_bus_latency;
|
|
}
|
|
// GM or Flat as GM Store
|
|
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_gm_bus_latency);
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
|
(2 * computeUnit->srf_scm_bus_latency);
|
|
}
|
|
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
|
(ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_gm_bus_latency);
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
|
(2 * computeUnit->srf_scm_bus_latency);
|
|
}
|
|
// LM or Flat as LM Load
|
|
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
|
computeUnit->vrf_lm_bus_latency;
|
|
// LM or Flat as LM Store
|
|
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_lm_bus_latency);
|
|
// LM or Flat as LM, Atomic or MemFence
|
|
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
|
(ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_lm_bus_latency);
|
|
} else {
|
|
panic("Bad instruction type!\n");
|
|
}
|
|
}
|
|
|
|
GPUDynInstPtr
Wavefront::nextInstr()
{
    // Peek at the oldest instruction in this wave's instruction buffer.
    GPUDynInstPtr oldest_inst = instructionBuffer.front();

    // If the oldest instruction has not yet been dispatched into the
    // pipeline (i.e., it is absent from the CU's pipeline map), it is the
    // one to consider for readiness.
    if (computeUnit->pipeMap.find(oldest_inst->seqNum()) ==
        computeUnit->pipeMap.end()) {
        return oldest_inst;
    }

    // The oldest instruction was already dispatched in the schedule stage;
    // check the next-oldest instead, if one exists.
    if (instructionBuffer.size() > 1) {
        return *(instructionBuffer.begin() + 1);
    }

    // No younger instruction is buffered to check.
    return nullptr;
}
|
|
|
|
void
|
|
Wavefront::discardFetch()
|
|
{
|
|
instructionBuffer.clear();
|
|
dropFetch |= pendingFetch;
|
|
|
|
/**
|
|
* clear the fetch buffer for this wave in order to
|
|
* remove any stale inst data
|
|
*/
|
|
computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
|
|
}
|
|
|
|
bool
|
|
Wavefront::waitCntsSatisfied()
|
|
{
|
|
// Both vmWaitCnt && lgkmWaitCnt uninitialized means
|
|
// waitCnt instruction has been dispatched but not executed yet: next
|
|
// instruction should be blocked until waitCnt is executed.
|
|
if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* If we reach here, that means an s_waitcnt instruction was executed
|
|
* and the waitcnts are set by the execute method. Check if waitcnts
|
|
* are satisfied.
|
|
*/
|
|
if (vmWaitCnt != -1) {
|
|
if (vmemInstsIssued > vmWaitCnt) {
|
|
// vmWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (expWaitCnt != -1) {
|
|
if (expInstsIssued > expWaitCnt) {
|
|
// expWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (lgkmWaitCnt != -1) {
|
|
if (lgkmInstsIssued > lgkmWaitCnt) {
|
|
// lgkmWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// if we get here all outstanding waitcnts must
|
|
// be satisfied, so we resume normal operation
|
|
clearWaitCnts();
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
Wavefront::sleepDone()
{
    assert(status == S_STALLED_SLEEP);

    // if the sleep count has not been set, then the sleep instruction has
    // not been executed yet, so we report the sleep as not done (return
    // false) without setting the wavefront status
    if (sleepCnt == 0)
        return false;

    sleepCnt--;
    // still cycles left to sleep through
    if (sleepCnt != 0)
        return false;

    // countdown has expired: wake the wave
    status = S_RUNNING;
    return true;
}
|
|
|
|
void
Wavefront::setSleepTime(int sleep_time)
{
    // a new sleep may only be programmed once the previous one has fully
    // drained (sleepDone() counts sleepCnt back down to zero)
    assert(sleepCnt == 0);
    sleepCnt = sleep_time;
}
|
|
|
|
void
|
|
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
|
|
{
|
|
// the scoreboard should have set the status
|
|
// to S_WAITCNT once a waitcnt instruction
|
|
// was marked as ready
|
|
assert(status == S_WAITCNT);
|
|
|
|
// waitcnt instruction shouldn't be sending
|
|
// negative counts
|
|
assert(vm_wait_cnt >= 0);
|
|
assert(exp_wait_cnt >= 0);
|
|
assert(lgkm_wait_cnt >= 0);
|
|
// waitcnts are a max of 15 because we have
|
|
// only 1 nibble (4 bits) to set the counts
|
|
assert(vm_wait_cnt <= 0xf);
|
|
assert(exp_wait_cnt <= 0x7);
|
|
assert(lgkm_wait_cnt <= 0x1f);
|
|
|
|
/**
|
|
* prior waitcnts should be satisfied,
|
|
* at which time the WF resets them
|
|
* back to -1, indicating they are no
|
|
* longer active
|
|
*/
|
|
assert(vmWaitCnt == -1);
|
|
assert(expWaitCnt == -1);
|
|
assert(lgkmWaitCnt == -1);
|
|
|
|
/**
|
|
* if the instruction encoding
|
|
* indicates a waitcnt of 0xf,
|
|
* that means the waitcnt is
|
|
* not being used
|
|
*/
|
|
if (vm_wait_cnt != 0xf)
|
|
vmWaitCnt = vm_wait_cnt;
|
|
|
|
if (exp_wait_cnt != 0x7)
|
|
expWaitCnt = exp_wait_cnt;
|
|
|
|
if (lgkm_wait_cnt != 0x1f)
|
|
lgkmWaitCnt = lgkm_wait_cnt;
|
|
}
|
|
|
|
void
|
|
Wavefront::clearWaitCnts()
|
|
{
|
|
// reset the waitcnts back to
|
|
// -1, indicating they are no
|
|
// longer valid
|
|
vmWaitCnt = -1;
|
|
expWaitCnt = -1;
|
|
lgkmWaitCnt = -1;
|
|
|
|
// resume running normally
|
|
status = S_RUNNING;
|
|
}
|
|
|
|
void
Wavefront::incVMemInstsIssued()
{
    // one more outstanding vector memory instruction; compared against
    // vmWaitCnt in waitCntsSatisfied()
    ++vmemInstsIssued;
}
|
|
|
|
void
Wavefront::incExpInstsIssued()
{
    // one more outstanding export instruction; compared against
    // expWaitCnt in waitCntsSatisfied()
    ++expInstsIssued;
}
|
|
|
|
void
Wavefront::incLGKMInstsIssued()
{
    // one more outstanding LGKM (LDS/GDS/Konstant/Message) instruction;
    // compared against lgkmWaitCnt in waitCntsSatisfied()
    ++lgkmInstsIssued;
}
|
|
|
|
void
Wavefront::decVMemInstsIssued()
{
    // one fewer outstanding vector memory instruction — presumably called
    // when such an instruction completes; confirm at call sites
    --vmemInstsIssued;
}
|
|
|
|
void
Wavefront::decExpInstsIssued()
{
    // one fewer outstanding export instruction — presumably called when
    // such an instruction completes; confirm at call sites
    --expInstsIssued;
}
|
|
|
|
void
Wavefront::decLGKMInstsIssued()
{
    // one fewer outstanding LGKM instruction — presumably called when
    // such an instruction completes; confirm at call sites
    --lgkmInstsIssued;
}
|
|
|
|
// Return this wavefront's current program counter.
Addr
Wavefront::pc() const
{
    return _pc;
}
|
|
|
|
// Overwrite this wavefront's program counter (e.g., after a taken branch).
void
Wavefront::pc(Addr new_pc)
{
    _pc = new_pc;
}
|
|
|
|
// Return a mutable reference to the wave's execution mask so callers can
// both inspect and modify the per-lane active bits.
VectorMask&
Wavefront::execMask()
{
    return _execMask;
}
|
|
|
|
// Return whether the given lane is active in the wave's execution mask.
bool
Wavefront::execMask(int lane) const
{
    return _execMask[lane];
}
|
|
|
|
void
|
|
Wavefront::freeRegisterFile()
|
|
{
|
|
/* clear busy registers */
|
|
for (int i=0; i < maxVgprs; i++) {
|
|
int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
|
|
computeUnit->vrf[simdId]->markReg(vgprIdx, false);
|
|
}
|
|
|
|
/* Free registers used by this wavefront */
|
|
uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
|
|
computeUnit->vrf[simdId]->numRegs();
|
|
computeUnit->registerManager->vrfPoolMgrs[simdId]->
|
|
freeRegion(startVgprIndex, endIndex);
|
|
}
|
|
|
|
void
|
|
Wavefront::computeActualWgSz(HSAQueueEntry *task)
|
|
{
|
|
actualWgSzTotal = 1;
|
|
for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
|
|
actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
|
|
- task->wgId(d) * workGroupSz[d]);
|
|
actualWgSzTotal *= actualWgSz[d];
|
|
}
|
|
}
|
|
|
|
// Assign this wave to a barrier slot. InvalidID is permitted to detach the
// wave from any barrier; otherwise the ID must be a valid slot on the CU.
void
Wavefront::barrierId(int bar_id)
{
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}
|
|
|
|
// Return the barrier slot this wave is assigned to (WFBarrier::InvalidID
// if none).
int
Wavefront::barrierId() const
{
    return barId;
}
|
|
|
|
// Return whether this wave currently holds a valid barrier slot.
bool
Wavefront::hasBarrier() const
{
    return barId > WFBarrier::InvalidID;
}
|
|
|
|
// Detach this wave from its barrier slot by resetting the ID to invalid.
void
Wavefront::releaseBarrier()
{
    barId = WFBarrier::InvalidID;
}
|
|
|
|
// Per-wavefront-slot statistics group. Scalar stats are registered via
// ADD_STAT in the initializer list; the two distribution stats get their
// bucket ranges (min, max, bucket size) in the constructor body.
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // RAW distances binned from 0 to 20 in steps of 1 instruction
    vecRawDistance.init(0, 20, 1);
    // reads-per-write binned from 0 to 4 in steps of 1
    readsPerWrite.init(0, 4, 1);
}
|
|
|
|
} // namespace gem5
|