Files
gem5/src/gpu-compute/hsa_queue_entry.hh
Matthew Poremba 8be5ce6fc9 dev-amdgpu,configs,gpu-compute: Add gfx942 version
This is the version for MI300. For the most part, it is the same as
MI200 with the exception of architected flat scratch (not yet
implemented in gem5) and therefore a new version enum is required.

Change-Id: Id18cd7b57c4eebd467c010a3f61e3117beb8d58a
2024-05-15 12:08:41 -07:00

512 lines
14 KiB
C++

/*
* Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
 * @file
 * HSAQueueEntry is the simulator's internal representation of an
 * AQL queue entry (task). It encapsulates all of the relevant info
 * about a task, which is gathered from various runtime data
 * structures including: the AQL MQD, the AQL packet, and the code
 * object.
 */
#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#include <array>
#include <bitset>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "gpu-compute/kernel_code.hh"
namespace gem5
{
class HSAQueueEntry
{
public:
HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
: _gfxVersion(gfx_version), kernName(kernel_name),
_wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
_gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
_queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
_hostDispPktAddr(host_pkt_addr),
_completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
->completion_signal),
codeAddress(code_addr),
kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
_outstandingInvs(-1), _outstandingWbs(0),
_ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
group_segment_size),
_privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
private_segment_size),
_contextId(0), _wgId{{ 0, 0, 0 }},
_numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
_globalWgId(0), dispatchComplete(false)
{
// Use the resource descriptors to determine number of GPRs. This will
// round up in some cases, however the exact number field in the AMD
// kernel code struct is not backwards compatible and that field is
// not populated in newer compiles. The resource descriptor dword must
// be backwards compatible, so use that always.
// LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
// #code-object-v3-kernel-descriptor
//
// Currently, the only supported gfx versions in gem5 that compute
// VGPR count differently are gfx90a and gfx942.
if (gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
} else {
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
}
// SGPR allocation granulary is 16 in GFX9
// Source: https://llvm.org/docs/AMDGPUUsage.html
if (gfx_version == GfxVersion::gfx900 ||
gfx_version == GfxVersion::gfx902 ||
gfx_version == GfxVersion::gfx908 ||
gfx_version == GfxVersion::gfx90a ||
gfx_version == GfxVersion::gfx942) {
numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
} else {
panic("Saw unknown gfx version setting up GPR counts\n");
}
initialVgprState.reset();
initialSgprState.reset();
for (int i = 0; i < MAX_DIM; ++i) {
_numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
_numWgTotal *= _numWg[i];
}
parseKernelCode(akc);
// Offset of a first AccVGPR in the unified register file.
// Granularity 4. Value 0-63. 0 - accum-offset = 4,
// 1 - accum-offset = 8, ..., 63 - accum-offset = 256.
_accumOffset = (akc->accum_offset + 1) * 4;
}
const GfxVersion&
gfxVersion() const
{
return _gfxVersion;
}
const std::string&
kernelName() const
{
return kernName;
}
int
wgSize(int dim) const
{
assert(dim < MAX_DIM);
return _wgSize[dim];
}
int
gridSize(int dim) const
{
assert(dim < MAX_DIM);
return _gridSize[dim];
}
int
numVectorRegs() const
{
return numVgprs;
}
int
numScalarRegs() const
{
return numSgprs;
}
uint32_t
queueId() const
{
return _queueId;
}
int
dispatchId() const
{
return _dispatchId;
}
void*
dispPktPtr()
{
return dispPkt;
}
Addr
hostDispPktAddr() const
{
return _hostDispPktAddr;
}
Addr
completionSignal() const
{
return _completionSignal;
}
Addr
codeAddr() const
{
return codeAddress;
}
Addr
kernargAddr() const
{
return kernargAddress;
}
int
ldsSize() const
{
return _ldsSize;
}
int privMemPerItem() const { return _privMemPerItem; }
int
contextId() const
{
return _contextId;
}
bool
dispComplete() const
{
return dispatchComplete;
}
int
wgId(int dim) const
{
assert(dim < MAX_DIM);
return _wgId[dim];
}
void
wgId(int dim, int val)
{
assert(dim < MAX_DIM);
_wgId[dim] = val;
}
int
globalWgId() const
{
return _globalWgId;
}
void
globalWgId(int val)
{
_globalWgId = val;
}
int
numWg(int dim) const
{
assert(dim < MAX_DIM);
return _numWg[dim];
}
void
notifyWgCompleted()
{
++_numWgCompleted;
}
int
numWgCompleted() const
{
return _numWgCompleted;
}
int
numWgTotal() const
{
return _numWgTotal;
}
void
markWgDispatch()
{
++_wgId[0];
++_globalWgId;
if (wgId(0) * wgSize(0) >= gridSize(0)) {
_wgId[0] = 0;
++_wgId[1];
if (wgId(1) * wgSize(1) >= gridSize(1)) {
_wgId[1] = 0;
++_wgId[2];
if (wgId(2) * wgSize(2) >= gridSize(2)) {
dispatchComplete = true;
}
}
}
}
int
numWgAtBarrier() const
{
return numWgArrivedAtBarrier;
}
bool vgprBitEnabled(int bit) const
{
return initialVgprState.test(bit);
}
bool sgprBitEnabled(int bit) const
{
return initialSgprState.test(bit);
}
/**
* Host-side addr of the amd_queue_t on which
* this task was queued.
*/
Addr hostAMDQueueAddr;
/**
* Keep a copy of the AMD HSA queue because we
* need info from some of its fields to initialize
* register state.
*/
_amd_queue_t amdQueue;
// the maximum number of dimensions for a grid or workgroup
const static int MAX_DIM = 3;
/* getter */
int
outstandingInvs() {
return _outstandingInvs;
}
/**
* Whether invalidate has started or finished -1 is the
* initial value indicating inv has not started for the
* kernel.
*/
bool
isInvStarted()
{
return (_outstandingInvs != -1);
}
/**
* update the number of pending invalidate requests
*
* val: negative to decrement, positive to increment
*/
void
updateOutstandingInvs(int val)
{
_outstandingInvs += val;
assert(_outstandingInvs >= 0);
}
/**
* Forcefully change the state to be inv done.
*/
void
markInvDone()
{
_outstandingInvs = 0;
}
/**
* Is invalidate done?
*/
bool
isInvDone() const
{
assert(_outstandingInvs >= 0);
return (_outstandingInvs == 0);
}
int
outstandingWbs() const
{
return _outstandingWbs;
}
/**
* Update the number of pending writeback requests.
*
* val: negative to decrement, positive to increment
*/
void
updateOutstandingWbs(int val)
{
_outstandingWbs += val;
assert(_outstandingWbs >= 0);
}
unsigned
accumOffset() const
{
return _accumOffset;
}
private:
void
parseKernelCode(AMDKernelCode *akc)
{
/** set the enable bits for the initial SGPR state */
initialSgprState.set(PrivateSegBuf,
akc->enable_sgpr_private_segment_buffer);
initialSgprState.set(DispatchPtr,
akc->enable_sgpr_dispatch_ptr);
initialSgprState.set(QueuePtr,
akc->enable_sgpr_queue_ptr);
initialSgprState.set(KernargSegPtr,
akc->enable_sgpr_kernarg_segment_ptr);
initialSgprState.set(DispatchId,
akc->enable_sgpr_dispatch_id);
initialSgprState.set(FlatScratchInit,
akc->enable_sgpr_flat_scratch_init);
initialSgprState.set(PrivateSegSize,
akc->enable_sgpr_private_segment_size);
initialSgprState.set(WorkgroupIdX,
akc->enable_sgpr_workgroup_id_x);
initialSgprState.set(WorkgroupIdY,
akc->enable_sgpr_workgroup_id_y);
initialSgprState.set(WorkgroupIdZ,
akc->enable_sgpr_workgroup_id_z);
initialSgprState.set(WorkgroupInfo,
akc->enable_sgpr_workgroup_info);
initialSgprState.set(PrivSegWaveByteOffset,
akc->enable_private_segment);
/**
* set the enable bits for the initial VGPR state. the
* workitem Id in the X dimension is always initialized.
*/
initialVgprState.set(WorkitemIdX, true);
initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);
initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);
}
// store gfx version for version specific task handling
GfxVersion _gfxVersion;
// name of the kernel associated with the AQL entry
std::string kernName;
// workgroup Size (3 dimensions)
std::array<int, MAX_DIM> _wgSize;
// grid Size (3 dimensions)
std::array<int, MAX_DIM> _gridSize;
// total number of VGPRs per work-item
int numVgprs;
// total number of SGPRs per wavefront
int numSgprs;
// id of AQL queue in which this entry is placed
uint32_t _queueId;
int _dispatchId;
// raw AQL packet pointer
void *dispPkt;
// host-side addr of the dispatch packet
Addr _hostDispPktAddr;
// pointer to bool
Addr _completionSignal;
// base address of the raw machine code
Addr codeAddress;
// base address of the kernel args
Addr kernargAddress;
/**
* Number of outstanding invs for the kernel.
* values:
* -1: initial value, invalidate has not started for the kernel
* 0: 1)-1->0, about to start (a transient state, added in the same cycle)
* 2)+1->0, all inv requests are finished, i.e., invalidate done
* ?: positive value, indicating the number of pending inv requests
*/
int _outstandingInvs;
/**
* Number of outstanding wbs for the kernel
* values:
* 0: 1)initial value, flush has not started for the kernel
* 2)+1->0: all wb requests are finished, i.e., flush done
* ?: positive value, indicating the number of pending wb requests
*/
int _outstandingWbs;
int _ldsSize;
int _privMemPerItem;
int _contextId;
std::array<int, MAX_DIM> _wgId;
std::array<int, MAX_DIM> _numWg;
int _numWgTotal;
int numWgArrivedAtBarrier;
// The number of completed work groups
int _numWgCompleted;
int _globalWgId;
bool dispatchComplete;
std::bitset<NumVectorInitFields> initialVgprState;
std::bitset<NumScalarInitFields> initialSgprState;
unsigned _accumOffset;
};
} // namespace gem5
#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__