Apply the gem5 namespace to the codebase. Some anonymous namespaces could theoretically be removed, but since this change's main goal was to keep conflicts at a minimum, it was decided not to modify the general shape of the files much. A few missing comments of the form "// namespace X" that occurred before the newly added "} // namespace gem5" have been added for consistency. std out should not be included in the gem5 namespace, so it wasn't. ProtoMessage has not been included in the gem5 namespace, since I'm not familiar with how proto works. Regarding the SystemC files, although they belong to gem5, they actually perform integration between gem5 and SystemC; therefore, it deserved its own separate namespace. Files that are automatically generated have been included in the gem5 namespace. The .isa files are currently limited to a single namespace. This limitation should be later removed to make it easier to accommodate a better API. Regarding the files in util, gem5:: was prepended where suitable. Notice that this patch was tested as much as possible given that most of these were already not previously compiling. Change-Id: Ia53d404ec79c46edaa98f654e23bc3b0e179fe2d Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br> Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/46323 Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu> Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu> Reviewed-by: Matthew Poremba <matthew.poremba@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
484 lines
13 KiB
C++
484 lines
13 KiB
C++
/*
|
|
* Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/**
 * @file
 * HSAQueueEntry is the simulator's internal representation of an
 * AQL queue entry (task). It encapsulates all of the relevant info
 * about a task, which is gathered from various runtime data
 * structures including: the AQL MQD, the AQL packet, and the code
 * object.
 */
|
|
|
|
#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
|
|
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
|
|
|
|
#include <array>
#include <bitset>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "gpu-compute/kernel_code.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
class HSAQueueEntry
|
|
{
|
|
public:
|
|
HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
|
|
int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
|
|
Addr host_pkt_addr, Addr code_addr)
|
|
: kernName(kernel_name),
|
|
_wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
|
|
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
|
|
(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
|
|
_gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
|
|
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
|
|
(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
|
|
numVgprs(akc->workitem_vgpr_count),
|
|
numSgprs(akc->wavefront_sgpr_count),
|
|
_queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
|
|
_hostDispPktAddr(host_pkt_addr),
|
|
_completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
|
|
->completion_signal),
|
|
codeAddress(code_addr),
|
|
kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
|
|
_outstandingInvs(-1), _outstandingWbs(0),
|
|
_ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
|
|
group_segment_size),
|
|
_privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
|
|
private_segment_size),
|
|
_contextId(0), _wgId{{ 0, 0, 0 }},
|
|
_numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
|
|
_globalWgId(0), dispatchComplete(false)
|
|
|
|
{
|
|
// Precompiled BLIT kernels actually violate the spec a bit
|
|
// and don't set many of the required akc fields. For these kernels,
|
|
// we need to rip register usage from the resource registers.
|
|
//
|
|
// We can't get an exact number of registers from the resource
|
|
// registers because they round, but we can get an upper bound on it
|
|
if (!numVgprs)
|
|
numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
|
|
|
|
// TODO: Granularity changes for GFX9!
|
|
if (!numSgprs)
|
|
numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
|
|
|
|
initialVgprState.reset();
|
|
initialSgprState.reset();
|
|
|
|
for (int i = 0; i < MAX_DIM; ++i) {
|
|
_numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
|
|
_numWgTotal *= _numWg[i];
|
|
}
|
|
|
|
parseKernelCode(akc);
|
|
}
|
|
|
|
const std::string&
|
|
kernelName() const
|
|
{
|
|
return kernName;
|
|
}
|
|
|
|
int
|
|
wgSize(int dim) const
|
|
{
|
|
assert(dim < MAX_DIM);
|
|
return _wgSize[dim];
|
|
}
|
|
|
|
int
|
|
gridSize(int dim) const
|
|
{
|
|
assert(dim < MAX_DIM);
|
|
return _gridSize[dim];
|
|
}
|
|
|
|
int
|
|
numVectorRegs() const
|
|
{
|
|
return numVgprs;
|
|
}
|
|
|
|
int
|
|
numScalarRegs() const
|
|
{
|
|
return numSgprs;
|
|
}
|
|
|
|
uint32_t
|
|
queueId() const
|
|
{
|
|
return _queueId;
|
|
}
|
|
|
|
int
|
|
dispatchId() const
|
|
{
|
|
return _dispatchId;
|
|
}
|
|
|
|
void*
|
|
dispPktPtr()
|
|
{
|
|
return dispPkt;
|
|
}
|
|
|
|
Addr
|
|
hostDispPktAddr() const
|
|
{
|
|
return _hostDispPktAddr;
|
|
}
|
|
|
|
Addr
|
|
completionSignal() const
|
|
{
|
|
return _completionSignal;
|
|
}
|
|
|
|
Addr
|
|
codeAddr() const
|
|
{
|
|
return codeAddress;
|
|
}
|
|
|
|
Addr
|
|
kernargAddr() const
|
|
{
|
|
return kernargAddress;
|
|
}
|
|
|
|
int
|
|
ldsSize() const
|
|
{
|
|
return _ldsSize;
|
|
}
|
|
|
|
int privMemPerItem() const { return _privMemPerItem; }
|
|
|
|
int
|
|
contextId() const
|
|
{
|
|
return _contextId;
|
|
}
|
|
|
|
bool
|
|
dispComplete() const
|
|
{
|
|
return dispatchComplete;
|
|
}
|
|
|
|
int
|
|
wgId(int dim) const
|
|
{
|
|
assert(dim < MAX_DIM);
|
|
return _wgId[dim];
|
|
}
|
|
|
|
void
|
|
wgId(int dim, int val)
|
|
{
|
|
assert(dim < MAX_DIM);
|
|
_wgId[dim] = val;
|
|
}
|
|
|
|
int
|
|
globalWgId() const
|
|
{
|
|
return _globalWgId;
|
|
}
|
|
|
|
void
|
|
globalWgId(int val)
|
|
{
|
|
_globalWgId = val;
|
|
}
|
|
|
|
int
|
|
numWg(int dim) const
|
|
{
|
|
assert(dim < MAX_DIM);
|
|
return _numWg[dim];
|
|
}
|
|
|
|
void
|
|
notifyWgCompleted()
|
|
{
|
|
++_numWgCompleted;
|
|
}
|
|
|
|
int
|
|
numWgCompleted() const
|
|
{
|
|
return _numWgCompleted;
|
|
}
|
|
|
|
int
|
|
numWgTotal() const
|
|
{
|
|
return _numWgTotal;
|
|
}
|
|
|
|
void
|
|
markWgDispatch()
|
|
{
|
|
++_wgId[0];
|
|
++_globalWgId;
|
|
|
|
if (wgId(0) * wgSize(0) >= gridSize(0)) {
|
|
_wgId[0] = 0;
|
|
++_wgId[1];
|
|
|
|
if (wgId(1) * wgSize(1) >= gridSize(1)) {
|
|
_wgId[1] = 0;
|
|
++_wgId[2];
|
|
|
|
if (wgId(2) * wgSize(2) >= gridSize(2)) {
|
|
dispatchComplete = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int
|
|
numWgAtBarrier() const
|
|
{
|
|
return numWgArrivedAtBarrier;
|
|
}
|
|
|
|
bool vgprBitEnabled(int bit) const
|
|
{
|
|
return initialVgprState.test(bit);
|
|
}
|
|
|
|
bool sgprBitEnabled(int bit) const
|
|
{
|
|
return initialSgprState.test(bit);
|
|
}
|
|
|
|
/**
|
|
* Host-side addr of the amd_queue_t on which
|
|
* this task was queued.
|
|
*/
|
|
Addr hostAMDQueueAddr;
|
|
|
|
/**
|
|
* Keep a copy of the AMD HSA queue because we
|
|
* need info from some of its fields to initialize
|
|
* register state.
|
|
*/
|
|
_amd_queue_t amdQueue;
|
|
|
|
// the maximum number of dimensions for a grid or workgroup
|
|
const static int MAX_DIM = 3;
|
|
|
|
/* getter */
|
|
int
|
|
outstandingInvs() {
|
|
return _outstandingInvs;
|
|
}
|
|
|
|
/**
|
|
* Whether invalidate has started or finished -1 is the
|
|
* initial value indicating inv has not started for the
|
|
* kernel.
|
|
*/
|
|
bool
|
|
isInvStarted()
|
|
{
|
|
return (_outstandingInvs != -1);
|
|
}
|
|
|
|
/**
|
|
* update the number of pending invalidate requests
|
|
*
|
|
* val: negative to decrement, positive to increment
|
|
*/
|
|
void
|
|
updateOutstandingInvs(int val)
|
|
{
|
|
_outstandingInvs += val;
|
|
assert(_outstandingInvs >= 0);
|
|
}
|
|
|
|
/**
|
|
* Forcefully change the state to be inv done.
|
|
*/
|
|
void
|
|
markInvDone()
|
|
{
|
|
_outstandingInvs = 0;
|
|
}
|
|
|
|
/**
|
|
* Is invalidate done?
|
|
*/
|
|
bool
|
|
isInvDone() const
|
|
{
|
|
assert(_outstandingInvs >= 0);
|
|
return (_outstandingInvs == 0);
|
|
}
|
|
|
|
int
|
|
outstandingWbs() const
|
|
{
|
|
return _outstandingWbs;
|
|
}
|
|
|
|
/**
|
|
* Update the number of pending writeback requests.
|
|
*
|
|
* val: negative to decrement, positive to increment
|
|
*/
|
|
void
|
|
updateOutstandingWbs(int val)
|
|
{
|
|
_outstandingWbs += val;
|
|
assert(_outstandingWbs >= 0);
|
|
}
|
|
|
|
private:
|
|
void
|
|
parseKernelCode(AMDKernelCode *akc)
|
|
{
|
|
/** set the enable bits for the initial SGPR state */
|
|
initialSgprState.set(PrivateSegBuf,
|
|
akc->enable_sgpr_private_segment_buffer);
|
|
initialSgprState.set(DispatchPtr,
|
|
akc->enable_sgpr_dispatch_ptr);
|
|
initialSgprState.set(QueuePtr,
|
|
akc->enable_sgpr_queue_ptr);
|
|
initialSgprState.set(KernargSegPtr,
|
|
akc->enable_sgpr_kernarg_segment_ptr);
|
|
initialSgprState.set(DispatchId,
|
|
akc->enable_sgpr_dispatch_id);
|
|
initialSgprState.set(FlatScratchInit,
|
|
akc->enable_sgpr_flat_scratch_init);
|
|
initialSgprState.set(PrivateSegSize,
|
|
akc->enable_sgpr_private_segment_size);
|
|
initialSgprState.set(GridWorkgroupCountX,
|
|
akc->enable_sgpr_grid_workgroup_count_x);
|
|
initialSgprState.set(GridWorkgroupCountY,
|
|
akc->enable_sgpr_grid_workgroup_count_y);
|
|
initialSgprState.set(GridWorkgroupCountZ,
|
|
akc->enable_sgpr_grid_workgroup_count_z);
|
|
initialSgprState.set(WorkgroupIdX,
|
|
akc->enable_sgpr_workgroup_id_x);
|
|
initialSgprState.set(WorkgroupIdY,
|
|
akc->enable_sgpr_workgroup_id_y);
|
|
initialSgprState.set(WorkgroupIdZ,
|
|
akc->enable_sgpr_workgroup_id_z);
|
|
initialSgprState.set(WorkgroupInfo,
|
|
akc->enable_sgpr_workgroup_info);
|
|
initialSgprState.set(PrivSegWaveByteOffset,
|
|
akc->enable_sgpr_private_segment_wave_byte_offset);
|
|
|
|
/**
|
|
* set the enable bits for the initial VGPR state. the
|
|
* workitem Id in the X dimension is always initialized.
|
|
*/
|
|
initialVgprState.set(WorkitemIdX, true);
|
|
initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);
|
|
initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);
|
|
}
|
|
|
|
// name of the kernel associated with the AQL entry
|
|
std::string kernName;
|
|
// workgroup Size (3 dimensions)
|
|
std::array<int, MAX_DIM> _wgSize;
|
|
// grid Size (3 dimensions)
|
|
std::array<int, MAX_DIM> _gridSize;
|
|
// total number of VGPRs per work-item
|
|
int numVgprs;
|
|
// total number of SGPRs per wavefront
|
|
int numSgprs;
|
|
// id of AQL queue in which this entry is placed
|
|
uint32_t _queueId;
|
|
int _dispatchId;
|
|
// raw AQL packet pointer
|
|
void *dispPkt;
|
|
// host-side addr of the dispatch packet
|
|
Addr _hostDispPktAddr;
|
|
// pointer to bool
|
|
Addr _completionSignal;
|
|
// base address of the raw machine code
|
|
Addr codeAddress;
|
|
// base address of the kernel args
|
|
Addr kernargAddress;
|
|
/**
|
|
* Number of outstanding invs for the kernel.
|
|
* values:
|
|
* -1: initial value, invalidate has not started for the kernel
|
|
* 0: 1)-1->0, about to start (a transient state, added in the same cycle)
|
|
* 2)+1->0, all inv requests are finished, i.e., invalidate done
|
|
* ?: positive value, indicating the number of pending inv requests
|
|
*/
|
|
int _outstandingInvs;
|
|
/**
|
|
* Number of outstanding wbs for the kernel
|
|
* values:
|
|
* 0: 1)initial value, flush has not started for the kernel
|
|
* 2)+1->0: all wb requests are finished, i.e., flush done
|
|
* ?: positive value, indicating the number of pending wb requests
|
|
*/
|
|
int _outstandingWbs;
|
|
int _ldsSize;
|
|
int _privMemPerItem;
|
|
int _contextId;
|
|
std::array<int, MAX_DIM> _wgId;
|
|
std::array<int, MAX_DIM> _numWg;
|
|
int _numWgTotal;
|
|
int numWgArrivedAtBarrier;
|
|
// The number of completed work groups
|
|
int _numWgCompleted;
|
|
int _globalWgId;
|
|
bool dispatchComplete;
|
|
|
|
std::bitset<NumVectorInitFields> initialVgprState;
|
|
std::bitset<NumScalarInitFields> initialSgprState;
|
|
};
|
|
|
|
} // namespace gem5
|
|
|
|
#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__
|