Files
gem5/src/gpu-compute/hsa_queue_entry.hh
Matthew Poremba 8be5ce6fc9 dev-amdgpu,configs,gpu-compute: Add gfx942 version
This is the version for MI300. For the most part, it is the same as
MI200 with the exception of architected flat scratch (not yet
implemented in gem5) and therefore a new version enum is required.

Change-Id: Id18cd7b57c4eebd467c010a3f61e3117beb8d58a
2024-05-15 12:08:41 -07:00

512 lines
14 KiB
C++

/*
* Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * @file
 * HSAQueueEntry is the simulator's internal representation of an
 * AQL queue entry (task). It encapsulates all of the relevant info
 * about a task, which is gathered from various runtime data
 * structures including: the AQL MQD, the AQL packet, and the code
 * object.
 */
#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#include <array>
#include <bitset>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "gpu-compute/kernel_code.hh"
namespace gem5
{
class HSAQueueEntry
{
public:
HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
: _gfxVersion(gfx_version), kernName(kernel_name),
_wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
_gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
_queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
_hostDispPktAddr(host_pkt_addr),
_completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
->completion_signal),
codeAddress(code_addr),
kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
_outstandingInvs(-1), _outstandingWbs(0),
_ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
group_segment_size),
_privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
private_segment_size),
_contextId(0), _wgId{{ 0, 0, 0 }},
_numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
_globalWgId(0), dispatchComplete(false)
{
// Use the resource descriptors to determine number of GPRs. This will
// round up in some cases, however the exact number field in the AMD
// kernel code struct is not backwards compatible and that field is
// not populated in newer compiles. The resource descriptor dword must
// be backwards compatible, so use that always.
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
// #code-object-v3-kernel-descriptor
//
// Currently, the only supported gfx versions in gem5 that compute
// VGPR count differently are gfx90a and gfx942.
if (gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
} else {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
}
// SGPR allocation granulary is 16 in GFX9
// Source: https://llvm.org/docs/AMDGPUUsage.html
if (gfx_version == GfxVersion::gfx900 ||
gfx_version == GfxVersion::gfx902 ||
gfx_version == GfxVersion::gfx908 ||
gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
} else {
panic("Saw unknown gfx version setting up GPR counts\n");
}
initialVgprState.reset();
initialSgprState.reset();
for (int i = 0; i < MAX_DIM; ++i) {
_numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
_numWgTotal *= _numWg[i];
}
parseKernelCode(akc);
// Offset of a first AccVGPR in the unified register file.
// Granularity 4. Value 0-63. 0 - accum-offset = 4,
// 1 - accum-offset = 8, ..., 63 - accum-offset = 256.
_accumOffset = (akc->accum_offset + 1) * 4;
}
const GfxVersion&
gfxVersion() const
{
return _gfxVersion;
}
const std::string&
kernelName() const
{
return kernName;
}
int
wgSize(int dim) const
{
assert(dim < MAX_DIM);
return _wgSize[dim];
}
int
gridSize(int dim) const
{
assert(dim < MAX_DIM);
return _gridSize[dim];
}
int
numVectorRegs() const
{
return numVgprs;
}
int
numScalarRegs() const
{
return numSgprs;
}
uint32_t
queueId() const
{
return _queueId;
}
int
dispatchId() const
{
return _dispatchId;
}
void*
dispPktPtr()
{
return dispPkt;
}
Addr
hostDispPktAddr() const
{
return _hostDispPktAddr;
}
Addr
completionSignal() const
{
return _completionSignal;
}
Addr
codeAddr() const
{
return codeAddress;
}
Addr
kernargAddr() const
{
return kernargAddress;
}
int
ldsSize() const
{
return _ldsSize;
}
int privMemPerItem() const { return _privMemPerItem; }
int
contextId() const
{
return _contextId;
}
bool
dispComplete() const
{
return dispatchComplete;
}
int
wgId(int dim) const
{
assert(dim < MAX_DIM);
return _wgId[dim];
}
void
wgId(int dim, int val)
{
assert(dim < MAX_DIM);
_wgId[dim] = val;
}
int
globalWgId() const
{
return _globalWgId;
}
void
globalWgId(int val)
{
_globalWgId = val;
}
int
numWg(int dim) const
{
assert(dim < MAX_DIM);
return _numWg[dim];
}
void
notifyWgCompleted()
{
++_numWgCompleted;
}
int
numWgCompleted() const
{
return _numWgCompleted;
}
int
numWgTotal() const
{
return _numWgTotal;
}
void
markWgDispatch()
{
++_wgId[0];
++_globalWgId;
if (wgId(0) * wgSize(0) >= gridSize(0)) {
_wgId[0] = 0;
++_wgId[1];
if (wgId(1) * wgSize(1) >= gridSize(1)) {
_wgId[1] = 0;
++_wgId[2];
if (wgId(2) * wgSize(2) >= gridSize(2)) {
dispatchComplete = true;
}
}
}
}
int
numWgAtBarrier() const
{
return numWgArrivedAtBarrier;
}
bool vgprBitEnabled(int bit) const
{
return initialVgprState.test(bit);
}
bool sgprBitEnabled(int bit) const
{
return initialSgprState.test(bit);
}
/**
* Host-side addr of the amd_queue_t on which
* this task was queued.
*/
Addr hostAMDQueueAddr;
/**
* Keep a copy of the AMD HSA queue because we
* need info from some of its fields to initialize
* register state.
*/
_amd_queue_t amdQueue;
// the maximum number of dimensions for a grid or workgroup
const static int MAX_DIM = 3;
/* getter */
int
outstandingInvs() {
return _outstandingInvs;
}
/**
* Whether invalidate has started or finished -1 is the
* initial value indicating inv has not started for the
* kernel.
*/
bool
isInvStarted()
{
return (_outstandingInvs != -1);
}
/**
* update the number of pending invalidate requests
*
* val: negative to decrement, positive to increment
*/
void
updateOutstandingInvs(int val)
{
_outstandingInvs += val;
assert(_outstandingInvs >= 0);
}
/**
* Forcefully change the state to be inv done.
*/
void
markInvDone()
{
_outstandingInvs = 0;
}
/**
* Is invalidate done?
*/
bool
isInvDone() const
{
assert(_outstandingInvs >= 0);
return (_outstandingInvs == 0);
}
int
outstandingWbs() const
{
return _outstandingWbs;
}
/**
* Update the number of pending writeback requests.
*
* val: negative to decrement, positive to increment
*/
void
updateOutstandingWbs(int val)
{
_outstandingWbs += val;
assert(_outstandingWbs >= 0);
}
unsigned
accumOffset() const
{
return _accumOffset;
}
private:
void
parseKernelCode(AMDKernelCode *akc)
{
/** set the enable bits for the initial SGPR state */
initialSgprState.set(PrivateSegBuf,
akc->enable_sgpr_private_segment_buffer);
initialSgprState.set(DispatchPtr,
akc->enable_sgpr_dispatch_ptr);
initialSgprState.set(QueuePtr,
akc->enable_sgpr_queue_ptr);
initialSgprState.set(KernargSegPtr,
akc->enable_sgpr_kernarg_segment_ptr);
initialSgprState.set(DispatchId,
akc->enable_sgpr_dispatch_id);
initialSgprState.set(FlatScratchInit,
akc->enable_sgpr_flat_scratch_init);
initialSgprState.set(PrivateSegSize,
akc->enable_sgpr_private_segment_size);
initialSgprState.set(WorkgroupIdX,
akc->enable_sgpr_workgroup_id_x);
initialSgprState.set(WorkgroupIdY,
akc->enable_sgpr_workgroup_id_y);
initialSgprState.set(WorkgroupIdZ,
akc->enable_sgpr_workgroup_id_z);
initialSgprState.set(WorkgroupInfo,
akc->enable_sgpr_workgroup_info);
initialSgprState.set(PrivSegWaveByteOffset,
akc->enable_private_segment);
/**
* set the enable bits for the initial VGPR state. the
* workitem Id in the X dimension is always initialized.
*/
initialVgprState.set(WorkitemIdX, true);
initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);
initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);
}
// store gfx version for version specific task handling
GfxVersion _gfxVersion;
// name of the kernel associated with the AQL entry
std::string kernName;
// workgroup Size (3 dimensions)
std::array<int, MAX_DIM> _wgSize;
// grid Size (3 dimensions)
std::array<int, MAX_DIM> _gridSize;
// total number of VGPRs per work-item
int numVgprs;
// total number of SGPRs per wavefront
int numSgprs;
// id of AQL queue in which this entry is placed
uint32_t _queueId;
int _dispatchId;
// raw AQL packet pointer
void *dispPkt;
// host-side addr of the dispatch packet
Addr _hostDispPktAddr;
// pointer to bool
Addr _completionSignal;
// base address of the raw machine code
Addr codeAddress;
// base address of the kernel args
Addr kernargAddress;
/**
* Number of outstanding invs for the kernel.
* values:
* -1: initial value, invalidate has not started for the kernel
* 0: 1)-1->0, about to start (a transient state, added in the same cycle)
* 2)+1->0, all inv requests are finished, i.e., invalidate done
* ?: positive value, indicating the number of pending inv requests
*/
int _outstandingInvs;
/**
* Number of outstanding wbs for the kernel
* values:
* 0: 1)initial value, flush has not started for the kernel
* 2)+1->0: all wb requests are finished, i.e., flush done
* ?: positive value, indicating the number of pending wb requests
*/
int _outstandingWbs;
int _ldsSize;
int _privMemPerItem;
int _contextId;
std::array<int, MAX_DIM> _wgId;
std::array<int, MAX_DIM> _numWg;
int _numWgTotal;
int numWgArrivedAtBarrier;
// The number of completed work groups
int _numWgCompleted;
int _globalWgId;
bool dispatchComplete;
std::bitset<NumVectorInitFields> initialVgprState;
std::bitset<NumScalarInitFields> initialSgprState;
unsigned _accumOffset;
};
} // namespace gem5
#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__