gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Committed by Anthony Gutierrez
Parent: b0eac7857a   Commit: b8da9abba7

build_opts/GCN3_X86 (new file, 5 lines)
@@ -0,0 +1,5 @@
+PROTOCOL = 'GPU_VIPER'
+TARGET_ISA = 'x86'
+TARGET_GPU_ISA = 'gcn3'
+BUILD_GPU = True
+CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
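As with gem5's other build_opts files, these defaults are picked up when the build directory name matches the file name; presumably an invocation like `scons build/GCN3_X86/gem5.opt` then builds the x86 CPU models together with the GCN3 GPU model and the GPU_VIPER Ruby protocol.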
@@ -48,7 +48,7 @@ def TLB_constructor(level):
                 maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
                 accessDistance = options.L%(level)dAccessDistanceStat,\
                 clk_domain = SrcClockDomain(\
-                    clock = options.GPUClock,\
+                    clock = options.gpu_clock,\
                     voltage_domain = VoltageDomain(\
                         voltage = options.gpu_voltage)))" % locals()
     return constructor_call
||||
@@ -60,23 +60,22 @@ def Coalescer_constructor(level):
                 coalescingWindow = options.L%(level)dCoalescingWindow,\
                 disableCoalescing = options.L%(level)dDisableCoalescing,\
                 clk_domain = SrcClockDomain(\
-                    clock = options.GPUClock,\
+                    clock = options.gpu_clock,\
                     voltage_domain = VoltageDomain(\
                         voltage = options.gpu_voltage)))" % locals()
     return constructor_call

-def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
-    # arguments: options, TLB level, number of private structures for this Level,
-    # TLB name and Coalescer name
+def create_TLB_Coalescer(options, my_level, my_index, tlb_name,
+                         coalescer_name):
+    # arguments: options, TLB level, number of private structures for this
+    # Level, TLB name and Coalescer name
     for i in range(my_index):
-        TLB_name.append(eval(TLB_constructor(my_level)))
-        Coalescer_name.append(eval(Coalescer_constructor(my_level)))
+        tlb_name.append(eval(TLB_constructor(my_level)))
+        coalescer_name.append(eval(Coalescer_constructor(my_level)))

 def config_tlb_hierarchy(options, system, shader_idx):
-    n_cu = options.num_compute_units
-    # Make this configurable now, instead of the hard coded val. The dispatcher
-    # is always the last item in the system.cpu list.
-    dispatcher_idx = len(system.cpu) - 1
+    n_cu = options.cu_per_sa * options.sa_per_complex * \
+        options.num_gpu_complexes

     if options.TLB_config == "perLane":
         num_TLBs = 64 * n_cu
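Each helper above returns a constructor call as a string (the `%(level)d` conversion splices the TLB level into per-level option names), and create_TLB_Coalescer() then evals that string once per private structure. A minimal standalone sketch of the same template-and-eval pattern, where the class and option names are illustrative stand-ins rather than gem5 API:

    # Stand-ins for X86GPUTLB and the parsed-options object; only the
    # string-template + eval mechanics mirror the code above.
    class FakeTLB(object):
        def __init__(self, size):
            self.size = size

    class options(object):
        L1TLBentries = 32

    def tlb_constructor(level):
        # splice the level number into the per-level option name
        return "FakeTLB(size = options.L%(level)dTLBentries)" % locals()

    tlbs = []
    for i in range(4):                     # one instance per private TLB
        tlbs.append(eval(tlb_constructor(1)))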
@@ -90,21 +89,26 @@ def config_tlb_hierarchy(options, system, shader_idx):
         print("Bad option for TLB Configuration.")
         sys.exit(1)

-    #----------------------------------------------------------------------------------------
+    #-------------------------------------------------------------------------
     # A visual representation of the TLB hierarchy
     # for ease of configuration
-    # < Modify here the width and the number of levels if you want a different configuration >
-    # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level
-    L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
-          {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
-          {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]
+    # < Modify here the width and the number of levels if you want a different
+    # configuration >
+    # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc)
+    # for this level
+    L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [],
+           'CoalescerArray': []},
+          {'name': 'scalar', 'width' : options.num_scalar_cache,
+           'TLBarray': [], 'CoalescerArray': []},
+          {'name': 'l1', 'width': num_TLBs, 'TLBarray': [],
+           'CoalescerArray': []}]

     L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
     L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]

     TLB_hierarchy = [L1, L2, L3]

-    #----------------------------------------------------------------------------------------
+    #-------------------------------------------------------------------------
     # Create the hierarchy
     # Call the appropriate constructors and add objects to the system
@@ -164,17 +168,14 @@ def config_tlb_hierarchy(options, system, shader_idx):
                 for tlb in range(tlb_per_cu):
                     exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
                             system.l1_coalescer[%d].slave[%d]' % \
-                            (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
+                            (shader_idx, cu_idx, tlb,
+                             cu_idx*tlb_per_cu+tlb, 0))
             else:
                 exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
                         system.l1_coalescer[%d].slave[%d]' % \
-                        (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))
-
-        elif name == 'dispatcher': # Dispatcher TLB
-            for index in range(TLB_type['width']):
-                exec('system.cpu[%d].translation_port = \
-                        system.dispatcher_coalescer[%d].slave[0]' % \
-                        (dispatcher_idx, index))
+                        (shader_idx, cu_idx, tlb_per_cu,
+                         cu_idx / (n_cu / num_TLBs),
+                         cu_idx % (n_cu / num_TLBs)))
         elif name == 'sqc': # I-TLB
             for index in range(n_cu):
                 sqc_tlb_index = index / options.cu_per_sqc
@@ -182,7 +183,14 @@ def config_tlb_hierarchy(options, system, shader_idx):
                 exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
                         system.sqc_coalescer[%d].slave[%d]' % \
                         (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))

+        elif name == 'scalar': # Scalar D-TLB
+            for index in range(n_cu):
+                scalar_tlb_index = index / options.cu_per_scalar_cache
+                scalar_tlb_port_id = index % options.cu_per_scalar_cache
+                exec('system.cpu[%d].CUs[%d].scalar_tlb_port = \
+                        system.scalar_coalescer[%d].slave[%d]' % \
+                        (shader_idx, index, scalar_tlb_index,
+                         scalar_tlb_port_id))

     # Connect the memSidePorts (masters) of all the TLBs with the
     # cpuSidePorts (slaves) of the Coalescers of the next level
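Note that the index arithmetic here (`index / options.cu_per_sqc`, `cu_idx / (n_cu / num_TLBs)`) relies on Python 2 truncating division to yield integral coalescer and port indices; under Python 3 these expressions would need `//`.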
@@ -3728,7 +3728,7 @@ namespace Gcn3ISA
            DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
                    wf->computeUnit->cu_id, wf->wgId, refCount);

-           wf->computeUnit->registerManager.freeRegisters(wf);
+           wf->computeUnit->registerManager->freeRegisters(wf);
            wf->computeUnit->completedWfs++;
            wf->computeUnit->activeWaves--;
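The `.` to `->` change reflects that the register manager is no longer an embedded member: it is a separately configured SimObject reached through a pointer (see the `register_manager` parameter and the `RegisterManager* registerManager` member later in this diff).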
@@ -192,7 +192,7 @@ namespace Gcn3ISA
        */
        bool misaligned_acc = split_addr > vaddr;

-       RequestPtr req = new Request(0, vaddr, req_size, 0,
+       RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

@@ -208,7 +208,6 @@ namespace Gcn3ISA
            pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
            gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
            gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
-           delete req;
        } else {
            gpuDynInst->numScalarReqs = 1;
            gpuDynInst->setRequestFlags(req);
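These hunks (and the similar ones below) adapt the staged GCN3 code to two Request API changes that landed upstream in the meantime: `RequestPtr` is now a `std::shared_ptr<Request>`, so requests are built with `std::make_shared` and the manual `delete req` disappears once the packets hold references, and the constructor no longer takes the leading address-space-id argument (the first `0` in the old calls).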
@@ -243,7 +242,7 @@ namespace Gcn3ISA
        */
        bool misaligned_acc = split_addr > vaddr;

-       RequestPtr req = new Request(0, vaddr, req_size, 0,
+       RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

@@ -259,7 +258,6 @@ namespace Gcn3ISA
            pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
            gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
            gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
-           delete req;
        } else {
            gpuDynInst->numScalarReqs = 1;
            gpuDynInst->setRequestFlags(req);
@@ -574,7 +572,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+               RequestPtr req = std::make_shared<Request>(vaddr,
+                                               sizeof(T), 0,
                                gpuDynInst->computeUnit()->masterId(), 0,
                                gpuDynInst->wfDynId);

@@ -600,7 +599,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+               RequestPtr req = std::make_shared<Request>(vaddr,
+                                               sizeof(T), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

@@ -619,7 +619,7 @@ namespace Gcn3ISA
        {
            // create request and set flags
            gpuDynInst->statusBitVector = VectorMask(1);
-           Request *req = new Request(0, 0, 0, 0,
+           RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                       gpuDynInst->computeUnit()->
                                       masterId(), 0,
                                       gpuDynInst->wfDynId);
@@ -777,7 +777,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+               RequestPtr req = std::make_shared<Request>(vaddr,
+                                               sizeof(T), 0,
                                gpuDynInst->computeUnit()->masterId(), 0,
                                gpuDynInst->wfDynId);

@@ -802,7 +803,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, req_size, 0,
+               RequestPtr req = std::make_shared<Request>(vaddr, req_size,
+                                               0,
                                gpuDynInst->computeUnit()->masterId(), 0,
                                gpuDynInst->wfDynId);

@@ -826,7 +828,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+               RequestPtr req = std::make_shared<Request>(vaddr,
+                                               sizeof(T), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

@@ -851,7 +854,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, req_size, 0,
+               RequestPtr req = std::make_shared<Request>(vaddr, req_size,
+                                               0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

@@ -875,7 +879,8 @@ namespace Gcn3ISA
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane];

-               RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+               RequestPtr req = std::make_shared<Request>(vaddr,
+                                               sizeof(T), 0,
                                gpuDynInst->computeUnit()->masterId(), 0,
                                gpuDynInst->wfDynId,
                                gpuDynInst->makeAtomicOpFunctor<T>(
@@ -153,7 +153,7 @@ namespace Gcn3ISA
        ComputeUnit *cu = _gpuDynInst->computeUnit();

        for (auto i = 0; i < NumDwords; ++i) {
-           int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i);
+           int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
            vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);

            DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);

@@ -207,7 +207,7 @@ namespace Gcn3ISA
                ? _gpuDynInst->exec_mask : wf->execMask();

        if (NumDwords == 1) {
-           int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx);
+           int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
            vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
            assert(vrfData[0]);
            auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();

@@ -223,8 +223,8 @@ namespace Gcn3ISA
            DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
            cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
        } else if (NumDwords == 2) {
-           int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx);
-           int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1);
+           int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
+           int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
            vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
            vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
            assert(vrfData[0]);

@@ -605,16 +605,16 @@ namespace Gcn3ISA

        if (_opIdx == REG_VCC_LO) {
            sgprIdx = cu->registerManager
-               .mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
+               ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
        } else if (_opIdx == REG_FLAT_SCRATCH_HI) {
            sgprIdx = cu->registerManager
-               .mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
+               ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
        } else if (_opIdx == REG_FLAT_SCRATCH_LO) {
            assert(NumDwords == 1);
            sgprIdx = cu->registerManager
-               .mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
+               ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
        } else {
-           sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword);
+           sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
        }

        assert(sgprIdx > -1);
@@ -101,7 +101,7 @@ HSADevice::translateOrDie(Addr vaddr, Addr &paddr)
     * with new extensions, it will likely be wrong to just arbitrarily
     * grab context zero.
     */
-   auto process = sys->getThreadContext(0)->getProcessPtr();
+   auto process = sys->threads[0]->getProcessPtr();

    if (!process->pTable->translate(vaddr, paddr)) {
        fatal("failed translation: vaddr 0x%x\n", vaddr);
@@ -92,3 +92,28 @@ HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
    DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start);
    return start;
 }
+
+/**
+ * Forward relevant parameters to packet processor; queueID
+ * is used to link doorbell. The queueIDs are not re-used
+ * in current implementation, and we allocate only one page
+ * (4096 bytes) for doorbells, so check if this queue ID can
+ * be mapped into that page.
+ */
+void
+HSADriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
+{
+    TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
+    args.copyIn(mem_proxy);
+
+    if (queueId >= 0x1000) {
+        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
+    }
+
+    args->queue_id = queueId++;
+    auto &hsa_pp = device->hsaPacketProc();
+    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
+                              args->ring_base_address, args->queue_id,
+                              args->ring_size);
+    args.copyOut(mem_proxy);
+}
@@ -56,7 +56,7 @@

 struct HSADriverParams;
 class HSADevice;
-class SETranslatingPortProxy;
+class PortProxy;
 class ThreadContext;

 class HSADriver : public EmulatedDriver

@@ -74,8 +74,7 @@ class HSADriver : public EmulatedDriver
     HSADevice *device;
     uint32_t queueId;

-    void allocateQueue(const SETranslatingPortProxy &mem_proxy,
-                       Addr ioc_buf_addr);
+    void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf);
 };

 #endif // __DEV_HSA_HSA_DRIVER_HH__
@@ -151,7 +151,7 @@ HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr)
    // Grab the process and try to translate the virtual address with it; with
    // new extensions, it will likely be wrong to just arbitrarily grab context
    // zero.
-   auto process = sys->getThreadContext(0)->getProcessPtr();
+   auto process = sys->threads[0]->getProcessPtr();

    if (!process->pTable->translate(vaddr, paddr))
        fatal("failed translation: vaddr 0x%x\n", vaddr);

@@ -393,7 +393,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
     * The reason for this is that the DMASequencer does
     * not support atomic operations.
     */
-   auto tc = sys->getThreadContext(0);
+   auto tc = sys->threads[0];
    auto &virt_proxy = tc->getVirtProxy();
    TypedBufferArg<uint64_t> prev_signal(signal_addr);
    prev_signal.copyIn(virt_proxy);
@@ -92,7 +92,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
    // We use the same mapping function used by hsa runtime to do this mapping
    //
    // Originally
-   // #define VOID_PTR_ADD32(ptr,n) \
+   // #define VOID_PTR_ADD32(ptr,n)
    //     (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
    // (Addr)VOID_PTR_ADD32(0, queue_id)
    Addr db_offset = queue_id;

@@ -343,7 +343,7 @@ HWScheduler::unregisterQueue(uint64_t queue_id)
    // `(Addr)(VOID_PRT_ADD32(0, queue_id))`
    //
    // Originally
-   // #define VOID_PTR_ADD32(ptr,n) \
+   // #define VOID_PTR_ADD32(ptr,n)
    //     (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
    // (Addr)VOID_PTR_ADD32(0, queue_id)
    Addr db_offset = queue_id;
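Dropping the trailing backslash in these quoted macros silences the compiler's multi-line-comment warning: a `\` at the end of a `//` line splices the next source line into the comment.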
@@ -1,48 +1,48 @@
-# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # For use for simulation and test purposes only
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # 1. Redistributions of source code must retain the above copyright notice,
 # this list of conditions and the following disclaimer.
 #
 # 2. Redistributions in binary form must reproduce the above copyright notice,
 # this list of conditions and the following disclaimer in the documentation
 # and/or other materials provided with the distribution.
 #
-# 3. Neither the name of the copyright holder nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 #
-# Author: Steve Reinhardt
+# Authors: Steve Reinhardt

 from m5.defines import buildEnv
 from m5.params import *
 from m5.proxy import *
 from m5.SimObject import SimObject

+from m5.objects.Bridge import Bridge
 from m5.objects.ClockedObject import ClockedObject
-from m5.objects.Device import DmaDevice
-from m5.objects.Process import EmulatedDriver
-from m5.objects.Bridge import Bridge
+from m5.objects.HSADevice import HSADevice
+from m5.objects.HSADriver import HSADriver
 from m5.objects.LdsState import LdsState
+from m5.objects.Process import EmulatedDriver

 class PrefetchType(Enum): vals = [
     'PF_CU',
@@ -52,15 +52,48 @@ class PrefetchType(Enum): vals = [
     'PF_END',
 ]

-class VectorRegisterFile(SimObject):
-    type = 'VectorRegisterFile'
-    cxx_class = 'VectorRegisterFile'
-    cxx_header = 'gpu-compute/vector_register_file.hh'
-
-    simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
-    num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
-    wfSize = Param.Int(64, 'Wavefront size (in work items)')
-    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+class PoolManager(SimObject):
+    type = 'PoolManager'
+    abstract = True
+    cxx_header = "gpu-compute/pool_manager.hh"
+
+    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+    pool_size = Param.Int(2048, 'number of vector registers per SIMD')
+
+# The simple pool manager only allows one workgroup to
+# be executing on a CU at any given time.
+class SimplePoolManager(PoolManager):
+    type = 'SimplePoolManager'
+    cxx_class = 'SimplePoolManager'
+    cxx_header = "gpu-compute/simple_pool_manager.hh"
+
+class RegisterFile(SimObject):
+    type = 'RegisterFile'
+    cxx_class = 'RegisterFile'
+    cxx_header = 'gpu-compute/register_file.hh'
+
+    simd_id = Param.Int(-1, 'SIMD ID associated with this Register File')
+    num_regs = Param.Int(2048, 'number of registers in this RF')
+    wf_size = Param.Int(64, 'Wavefront size (in work items)')
+
+class ScalarRegisterFile(RegisterFile):
+    type = 'ScalarRegisterFile'
+    cxx_class = 'ScalarRegisterFile'
+    cxx_header = 'gpu-compute/scalar_register_file.hh'
+
+class VectorRegisterFile(RegisterFile):
+    type = 'VectorRegisterFile'
+    cxx_class = 'VectorRegisterFile'
+    cxx_header = 'gpu-compute/vector_register_file.hh'
+
+class RegisterManager(SimObject):
+    type = 'RegisterManager'
+    cxx_class = 'RegisterManager'
+    cxx_header = 'gpu-compute/register_manager.hh'
+
+    policy = Param.String("static", "Register Manager Policy")
+    vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers')
+    srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers')
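The per-CU wiring of these pool managers and the register manager happens in the run configs, which are not part of this excerpt. A plausible sketch using only the parameters defined above (`cu` and `n_simds` are stand-ins):

    # Hypothetical wiring, not from this diff: one VRF and one SRF pool
    # manager per SIMD, handed to the CU's register manager.
    n_simds = 4
    cu.register_manager = RegisterManager(
        policy = "static",
        vrf_pool_managers = [SimplePoolManager(pool_size = 2048,
                                               min_alloc = 4)
                             for _ in range(n_simds)],
        srf_pool_managers = [SimplePoolManager(pool_size = 2048,
                                               min_alloc = 4)
                             for _ in range(n_simds)])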
 class Wavefront(SimObject):
     type = 'Wavefront'

@@ -69,45 +102,68 @@ class Wavefront(SimObject):

     simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
     wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
-    wfSize = Param.Int(64, 'Wavefront size (in work items)')
+    wf_size = Param.Int(64, 'Wavefront size (in work items)')
+    max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the '
+                            'instruction buffer (IB).')

+# Most of the default values here are obtained from the
+# AMD Graphics Core Next (GCN) Architecture whitepaper.
 class ComputeUnit(ClockedObject):
     type = 'ComputeUnit'
     cxx_class = 'ComputeUnit'
     cxx_header = 'gpu-compute/compute_unit.hh'

     wavefronts = VectorParam.Wavefront('Number of wavefronts')
-    wfSize = Param.Int(64, 'Wavefront size (in work items)')
+    # Wavefront size is 64. This is configurable, however changing
+    # this value to anything other than 64 will likely cause errors.
+    wf_size = Param.Int(64, 'Wavefront size (in work items)')
     num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+    num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
+    num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
+                                     'per CU')
     simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit')

+    operand_network_length = Param.Int(1, 'number of pipe stages of operand '\
+                                       'network')
+
     spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
                                      'latency')
-    dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+    dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\
                                      'latency')
+    scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU')
     issue_period = Param.Int(4, 'number of cycles per issue period')

     vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
                                    'GM bus')
+    srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\
+                                    'to Scalar Mem bus')
     vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
                                    'LM bus')

     num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
     num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
-    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
-    mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
-                                "Represents the pipeline to reach the TCP and "\
-                                "specified in GPU clock cycles")
-    mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
-                                 "cu. Represents the pipeline between the TCP "\
-                                 "and cu as well as TCP data array access. "\
-                                 "Specified in GPU clock cycles")
+    n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
+    mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\
+                                "Represents the pipeline to reach the TCP "\
+                                "and specified in GPU clock cycles")
+    mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\
+                                 "cu. Represents the pipeline between the "\
+                                 "TCP and cu as well as TCP data array "\
+                                 "access. Specified in GPU clock cycles")
     system = Param.System(Parent.any, "system object")
     cu_id = Param.Int('CU id')
-    vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
-                                           "in bytes")
-    coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
-                                           "in bytes")
+    vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\
+                                           "width in bytes")
+    coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\
+                                           "width in bytes")

     memory_port = VectorMasterPort("Port to the memory system")
     translation_port = VectorMasterPort('Port to the TLB hierarchy')
     sqc_port = MasterPort("Port to the SQC (I-cache)")
     sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+    scalar_port = MasterPort("Port to the scalar data cache")
+    scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache")
     perLaneTLB = Param.Bool(False, "enable per-lane TLB")
     prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
                                "(0 turns off prefetching)")
@@ -116,19 +172,22 @@ class ComputeUnit(ClockedObject):
                                  "from last mem req in lane of "\
                                  "CU|Phase|Wavefront")
     execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
-    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
     debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
     functionalTLB = Param.Bool(False, "Assume TLB causes no delay")

     localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
                                  "kernel end")

-    countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
-                            "and how many times")
+    countPages = Param.Bool(False, "Generate per-CU file of all pages "\
+                            "touched and how many times")
+    scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\
+                                      "memory pipeline's queues")
     global_mem_queue_size = Param.Int(256, "Number of entries in the global "
                                       "memory pipeline's queues")
     local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                      "memory pipeline's queues")
+    max_wave_requests = Param.Int(64, "number of pending vector memory "\
+                                  "requests per wavefront")
     max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
                               " of instructions that can be sent to coalescer")
     ldsBus = Bridge() # the bridge between the CU and its LDS
@@ -137,72 +196,54 @@ class ComputeUnit(ClockedObject):

     vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
                                                           "file")
-    out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
-                                            " in the GM pipeline")
+    scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\
+                                                          "file")
+    register_manager = Param.RegisterManager("Register Manager")
+    fetch_depth = Param.Int(2, 'number of i-cache lines that may be '
+                            'buffered in the fetch unit.')

 class Shader(ClockedObject):
     type = 'Shader'
     cxx_class = 'Shader'
     cxx_header = 'gpu-compute/shader.hh'

     CUs = VectorParam.ComputeUnit('Number of compute units')
-    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
-    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
-                                         ruby at kernel boundaries""")
-    separate_acquire_release = Param.Bool(False,
-        """Do ld_acquire/st_release generate separate requests for the
-        acquire and release?""")
+    gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
+    dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
+    n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
+    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
+                                         ruby at kernel boundaries""")
     globalmem = Param.MemorySize('64kB', 'Memory size')
     timing = Param.Bool(False, 'timing memory accesses')

     cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
     translation = Param.Bool(False, "address translation");
     timer_period = Param.Clock('10us', "system timer period")
     idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
+    max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")

-class ClDriver(EmulatedDriver):
-    type = 'ClDriver'
-    cxx_header = 'gpu-compute/cl_driver.hh'
-    codefile = VectorParam.String('code file name(s)')
+class GPUComputeDriver(HSADriver):
+    type = 'GPUComputeDriver'
+    cxx_header = 'gpu-compute/gpu_compute_driver.hh'

-class GpuDispatcher(DmaDevice):
-    type = 'GpuDispatcher'
+class GPUDispatcher(SimObject):
+    type = 'GPUDispatcher'
     cxx_header = 'gpu-compute/dispatcher.hh'
-    # put at 8GB line for now
-    pio_addr = Param.Addr(0x200000000, "Device Address")
-    pio_latency = Param.Latency('1ns', "Programmed IO latency")
-    shader_pointer = Param.Shader('pointer to shader')
-    translation_port = MasterPort('Port to the dispatcher TLB')
-    cpu = Param.BaseCPU("CPU to wake up on kernel completion")
-
-    cl_driver = Param.ClDriver('pointer to driver')
-
-class MemType(Enum): vals = [
-    'M_U8',
-    'M_U16',
-    'M_U32',
-    'M_U64',
-    'M_S8',
-    'M_S16',
-    'M_S32',
-    'M_S64',
-    'M_F16',
-    'M_F32',
-    'M_F64',
-    ]

+class GPUCommandProcessor(HSADevice):
+    type = 'GPUCommandProcessor'
+    cxx_header = 'gpu-compute/gpu_command_processor.hh'
+    dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')

 class StorageClassType(Enum): vals = [
     'SC_SPILL',
     'SC_GLOBAL',
+    'SC_SHARED',
     'SC_GROUP',
     'SC_PRIVATE',
     'SC_READONLY',
     'SC_KERNARG',
+    'SC_ARG',
     'SC_NONE',
     ]
-
-class RegisterType(Enum): vals = [
-    'RT_VECTOR',
-    'RT_SCALAR',
-    'RT_CONDITION',
-    'RT_HARDWARE',
-    'RT_NONE',
-    ]
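The DmaDevice-based GpuDispatcher, with its own PIO address and TLB port, is gone; dispatch now flows through the GPUCommandProcessor (an HSADevice), with GPUDispatcher reduced to a plain SimObject. A hedged top-level wiring sketch based only on the parameters above, not taken from this diff:

    # Hypothetical configuration: the command processor owns the
    # dispatcher reference, and the shader points at both.
    dispatcher = GPUDispatcher()
    gpu_cmd_proc = GPUCommandProcessor(dispatcher = dispatcher)
    shader = Shader(gpu_cmd_proc = gpu_cmd_proc,
                    dispatcher = dispatcher,
                    n_wf = 10,
                    CUs = [ComputeUnit(cu_id = i) for i in range(4)])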
@@ -13,9 +13,9 @@
 # this list of conditions and the following disclaimer in the documentation
 # and/or other materials provided with the distribution.
 #
-# 3. Neither the name of the copyright holder nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -40,15 +40,18 @@ class GPUStaticInstFlags(Enum):
         # Op types
         'ALU',               # ALU op
         'Branch',            # Branch instruction
+        'CondBranch',        # Conditional Branch instruction
         'Nop',               # No-op (no effect at all)
-        'Return',            # Return instruction
+        'Return',            # Subroutine return instruction
+        'EndOfKernel',       # Kernel termination instruction
+        'KernelLaunch',      # Kernel launch inst
         'UnconditionalJump', #
         'SpecialOp',         # Special op
         'Waitcnt',           # Is a waitcnt instruction

         # Memory ops
         'MemBarrier',        # Barrier instruction
-        'MemFence',          # Memory fence instruction
+        'MemSync',           # Synchronizing instruction
         'MemoryRef',         # References memory (load, store, or atomic)
         'Flat',              # Flat memory op
         'Load',              # Reads from memory

@@ -64,6 +67,13 @@ class GPUStaticInstFlags(Enum):
         'WritesSCC',         # The instruction writes SCC
         'ReadsVCC',          # The instruction reads VCC
         'WritesVCC',         # The instruction writes VCC
+        'ReadsEXEC',         # The instruction reads Exec Mask
+        'WritesEXEC',        # The instruction writes Exec Mask
+        'ReadsMode',         # The instruction reads Mode register
+        'WritesMode',        # The instruction writes Mode register
+        'IgnoreExec',        # The instruction ignores the Exec Mask
+        'IsSDWA',            # The instruction is a SDWA instruction
+        'IsDPP',             # The instruction is a DPP instruction

         # Atomic OP types
         'AtomicAnd',

@@ -78,13 +88,6 @@ class GPUStaticInstFlags(Enum):
         'AtomicMax',
         'AtomicMin',

-        # Memory order flags
-        'RelaxedOrder',
-        'Acquire',           # Has acquire semantics
-        'Release',           # Has release semantics
-        'AcquireRelease',    # Has acquire and release semantics
-        'NoOrder',           # Has no ordering restrictions
-
         # Segment access flags
         'ArgSegment',        # Accesses the arg segment
         'GlobalSegment',     # Accesses global memory

@@ -95,15 +98,17 @@ class GPUStaticInstFlags(Enum):
         'SpillSegment',      # Accesses the spill segment
         'NoSegment',         # Does not have an associated segment

-        # Scope flags
-        'WorkitemScope',
-        'WavefrontScope',
-        'WorkgroupScope',
-        'DeviceScope',
-        'SystemScope',
-        'NoScope',           # Does not have an associated scope
-
         # Coherence flags
-        'GloballyCoherent',  # Coherent with other workitems on same device
-        'SystemCoherent'     # Coherent with a different device, or the host
+        'GloballyCoherent',  # Coherent with other work-items on same device
+        'SystemCoherent',    # Coherent with a different device, or the host
+
+        # Floating-point flags
+        'F16',               # F16 operation
+        'F32',               # F32 operation
+        'F64',               # F64 operation
+
+        # MAC, MAD, FMA
+        'FMA',               # FMA
+        'MAC',               # MAC
+        'MAD'                # MAD
     ]
@@ -41,56 +41,62 @@ SimObject('GPUStaticInstFlags.py')
 SimObject('LdsState.py')
 SimObject('X86GPUTLB.py')

-if env['TARGET_GPU_ISA'] == 'hsail':
-    Source('brig_object.cc')
-    Source('hsail_code.cc')
-
-Source('cl_driver.cc')
 Source('compute_unit.cc')
-Source('condition_register_state.cc')
 Source('dispatcher.cc')
 Source('exec_stage.cc')
 Source('fetch_stage.cc')
+Source('fetch_unit.cc')
 Source('global_memory_pipeline.cc')
+Source('gpu_command_processor.cc')
+Source('gpu_compute_driver.cc')
 Source('gpu_dyn_inst.cc')
 Source('gpu_exec_context.cc')
 Source('gpu_static_inst.cc')
 Source('gpu_tlb.cc')
-Source('hsa_object.cc')
-Source('kernel_cfg.cc')
 Source('lds_state.cc')
 Source('local_memory_pipeline.cc')
+Source('pool_manager.cc')
+Source('register_file.cc')
+Source('register_manager.cc')
+Source('scalar_memory_pipeline.cc')
+Source('scalar_register_file.cc')
 Source('schedule_stage.cc')
 Source('scheduler.cc')
 Source('scoreboard_check_stage.cc')
 Source('shader.cc')
+Source('simple_pool_manager.cc')
+Source('static_register_manager_policy.cc')
 Source('tlb_coalescer.cc')
 Source('vector_register_file.cc')
-Source('vector_register_state.cc')
 Source('wavefront.cc')

-DebugFlag('BRIG')
 DebugFlag('GPUCoalescer')
+DebugFlag('GPUCommandProc')
+DebugFlag('GPUDriver')
+DebugFlag('GPUInitAbi')
 DebugFlag('GPUDisp')
 DebugFlag('GPUExec')
 DebugFlag('GPUFetch')
-DebugFlag('GPUHsailCFInfo')
+DebugFlag('GPUKernelInfo')
 DebugFlag('GPUMem')
 DebugFlag('GPUPort')
 DebugFlag('GPUPrefetch')
 DebugFlag('GPUReg')
+DebugFlag('GPURename')
+DebugFlag('GPURF')
+DebugFlag('GPURfState')
 DebugFlag('GPUSched')
 DebugFlag('GPUShader')
+DebugFlag('GPUSRF')
 DebugFlag('GPUSync')
 DebugFlag('GPUTLB')
 DebugFlag('GPUVRF')
-DebugFlag('HSALoader')
-DebugFlag('HSAIL')
-DebugFlag('HSAILObject')
 DebugFlag('GPUVRFSched')
 DebugFlag('GPUWgLatency')
 DebugFlag('Predictor')
 DebugFlag('WavefrontStack')

 CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
-                        'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL',
-                        'GPUVRF'])
+                        'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
+                        'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
+                        'GPUInitAbi'])
(File diff suppressed because it is too large.)
@@ -36,28 +36,30 @@

 #include <deque>
 #include <map>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>

 #include "base/callback.hh"
 #include "base/statistics.hh"
 #include "base/types.hh"
 #include "config/the_gpu_isa.hh"
 #include "enums/PrefetchType.hh"
 #include "gpu-compute/exec_stage.hh"
 #include "gpu-compute/fetch_stage.hh"
 #include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
 #include "gpu-compute/local_memory_pipeline.hh"
-#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/register_manager.hh"
+#include "gpu-compute/scalar_memory_pipeline.hh"
 #include "gpu-compute/schedule_stage.hh"
 #include "gpu-compute/scoreboard_check_stage.hh"
 #include "mem/port.hh"
+#include "mem/token_port.hh"
 #include "sim/clocked_object.hh"

 static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
 static const int MAX_WIDTH_FOR_MEM_INST = 32;

-class NDRange;
+class HSAQueueEntry;
 class LdsChunk;
+class ScalarRegisterFile;
 class Shader;
 class VectorRegisterFile;
@@ -69,18 +71,6 @@ enum EXEC_POLICY
     RR
 };

-// List of execution units
-enum EXEC_UNIT
-{
-    SIMD0 = 0,
-    SIMD1,
-    SIMD2,
-    SIMD3,
-    GLBMEM_PIPE,
-    LDSMEM_PIPE,
-    NUM_UNITS
-};
-
 enum TLB_CACHE
 {
     TLB_MISS_CACHE_MISS = 0,
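The fixed EXEC_UNIT enumeration (four SIMDs plus two memory pipes) is retired in favor of the dynamically sized execution-resource lists introduced in the next hunk, where the number of vector ALUs, scalar ALUs, and memory pipes becomes a per-CU property.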
@@ -92,32 +82,100 @@ enum TLB_CACHE
 class ComputeUnit : public ClockedObject
 {
   public:
-    FetchStage fetchStage;
-    ScoreboardCheckStage scoreboardCheckStage;
-    ScheduleStage scheduleStage;
-    ExecStage execStage;
-    GlobalMemPipeline globalMemoryPipe;
-    LocalMemPipeline localMemoryPipe;
-
+    // Execution resources
+    //
+    // The ordering of units is:
+    // Vector ALUs
+    // Scalar ALUs
+    // GM Pipe
+    // LM Pipe
+    // Scalar Mem Pipe
+    //
+    // Note: the ordering of units is important and the code assumes the
+    // above ordering. However, there may be more than one resource of
+    // each type (e.g., 4 VALUs or 2 SALUs)
+
+    int numVectorGlobalMemUnits;
+    // Resource control for global memory to VRF data/address bus
+    WaitClass glbMemToVrfBus;
+    // Resource control for Vector Register File->Global Memory pipe buses
+    WaitClass vrfToGlobalMemPipeBus;
+    // Resource control for Vector Global Memory execution unit
+    WaitClass vectorGlobalMemUnit;
+
+    int numVectorSharedMemUnits;
+    // Resource control for local memory to VRF data/address bus
+    WaitClass locMemToVrfBus;
+    // Resource control for Vector Register File->Local Memory pipe buses
+    WaitClass vrfToLocalMemPipeBus;
+    // Resource control for Vector Shared/Local Memory execution unit
+    WaitClass vectorSharedMemUnit;
+
+    int numScalarMemUnits;
+    // Resource control for scalar memory to SRF data/address bus
+    WaitClass scalarMemToSrfBus;
+    // Resource control for Scalar Register File->Scalar Memory pipe buses
+    WaitClass srfToScalarMemPipeBus;
+    // Resource control for Scalar Memory execution unit
+    WaitClass scalarMemUnit;
+
+    // vector ALU execution resources
+    int numVectorALUs;
+    std::vector<WaitClass> vectorALUs;
+
+    // scalar ALU execution resources
+    int numScalarALUs;
+    std::vector<WaitClass> scalarALUs;
+
+    // Return total number of execution units on this CU
+    int numExeUnits() const;
+    // index into readyList of the first memory unit
+    int firstMemUnit() const;
+    // index into readyList of the last memory unit
+    int lastMemUnit() const;
+    // index into scalarALUs vector of SALU used by the wavefront
+    int mapWaveToScalarAlu(Wavefront *w) const;
+    // index into readyList of SALU used by wavefront
+    int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
+    // index into readyList of Global Memory unit used by wavefront
+    int mapWaveToGlobalMem(Wavefront *w) const;
+    // index into readyList of Local Memory unit used by wavefront
+    int mapWaveToLocalMem(Wavefront *w) const;
+    // index into readyList of Scalar Memory unit used by wavefront
+    int mapWaveToScalarMem(Wavefront *w) const;
+
+    int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+    int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+    int numCyclesPerStoreTransfer; // number of cycles per vector store
+    int numCyclesPerLoadTransfer; // number of cycles per vector load
+
+    // Buffers used to communicate between various pipeline stages
+
+    // At a high level, the following intra-/inter-stage communication occurs:
+    // SCB to SCH: readyList provides per exec resource list of waves that
+    //             passed dependency and readiness checks. If selected by
+    //             scheduler, attempt to add wave to schList conditional on
+    //             RF support.
+    // SCH:        schList holds waves that are gathering operands or
+    //             waiting for execution resource availability. Once ready,
+    //             waves are placed on the dispatchList as candidates for
+    //             execution. A wave may spend multiple cycles in SCH stage,
+    //             on the schList due to RF access conflicts or execution
+    //             resource contention.
+    // SCH to EX:  dispatchList holds waves that are ready to be executed.
+    //             LM/FLAT arbitration may remove an LM wave and place it
+    //             back on the schList. RF model may also force a wave back
+    //             to the schList if using the detailed model.

     // List of waves which are ready to be scheduled.
     // Each execution resource has a ready list. readyList is
     // used to communicate between scoreboardCheck stage and
     // schedule stage
     // TODO: make enum to index readyList
     std::vector<std::vector<Wavefront*>> readyList;

-    // Stores the status of waves. A READY implies the
-    // wave is ready to be scheduled this cycle and
-    // is already present in the readyList. waveStatusList is
-    // used to communicate between scoreboardCheck stage and
-    // schedule stage
-    // TODO: convert std::pair to a class to increase readability
-    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
-
     // List of waves which will be dispatched to
-    // each execution resource. A FILLED implies
+    // each execution resource. An EXREADY implies
     // dispatch list is non-empty and
     // execution unit has something to execute
     // this cycle. Currently, the dispatch list of
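For a concrete sense of the ordering: with the GPU.py defaults above (num_SIMDs = 4, num_scalar_cores = 1, and one GM, LM, and scalar-memory pipe each), numExeUnits() would presumably come to 8, with the three memory units occupying the last indices (firstMemUnit() == 5, lastMemUnit() == 7).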
@@ -127,32 +185,67 @@ class ComputeUnit : public ClockedObject
     // and exec stage
     // TODO: convert std::pair to a class to increase readability
     std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
+    // track presence of dynamic instructions in the Schedule pipeline
+    // stage. This is used to check the readiness of the oldest,
+    // non-dispatched instruction of every WF in the Scoreboard stage.
+    std::unordered_set<uint64_t> pipeMap;
+
+    RegisterManager* registerManager;

+    FetchStage fetchStage;
+    ScoreboardCheckStage scoreboardCheckStage;
+    ScheduleStage scheduleStage;
+    ExecStage execStage;
+    GlobalMemPipeline globalMemoryPipe;
+    LocalMemPipeline localMemoryPipe;
+    ScalarMemPipeline scalarMemoryPipe;

     EventFunctionWrapper tickEvent;

-    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
-    int rrNextALUWp;
     typedef ComputeUnitParams Params;
     std::vector<std::vector<Wavefront*>> wfList;
     int cu_id;

     // array of vector register files, one per SIMD
     std::vector<VectorRegisterFile*> vrf;
-    // Number of vector ALU units (SIMDs) in CU
-    int numSIMDs;
+    // array of scalar register files, one per SIMD
+    std::vector<ScalarRegisterFile*> srf;

     // Width per VALU/SIMD unit: number of work items that can be executed
     // on the vector ALU simultaneously in a SIMD unit
     int simdWidth;
     // number of pipe stages for bypassing data to next dependent single
     // precision vector instruction inside the vector ALU pipeline
     int spBypassPipeLength;
     // number of pipe stages for bypassing data to next dependent double
     // precision vector instruction inside the vector ALU pipeline
     int dpBypassPipeLength;
-    // number of cycles per issue period
-    int issuePeriod;
+    // number of pipe stages for scalar ALU
+    int scalarPipeStages;
+    // number of pipe stages for operand collection & distribution network
+    int operandNetworkLength;
+    // number of cycles per instruction issue period
+    Cycles issuePeriod;
+
+    // VRF to GM Bus latency
+    Cycles vrf_gm_bus_latency;
+    // SRF to Scalar Mem Bus latency
+    Cycles srf_scm_bus_latency;
+    // VRF to LM Bus latency
+    Cycles vrf_lm_bus_latency;

-    // Number of global and local memory execution resources in CU
-    int numGlbMemUnits;
-    int numLocMemUnits;
-    // tracks the last cycle a vector instruction was executed on a SIMD
-    std::vector<uint64_t> lastExecCycle;
+    // Track the amount of interleaving between wavefronts on each SIMD.
+    // This stat is sampled using instExecPerSimd to compute the number of
+    // instructions that have been executed on a SIMD between a WF executing
+    // two successive instructions.
+    Stats::VectorDistribution instInterleave;
+
+    // tracks the number of dyn inst executed per SIMD
+    std::vector<uint64_t> instExecPerSimd;

     // true if we allow a separate TLB per lane
     bool perLaneTLB;
     // if 0, TLB prefetching is off.
@@ -166,8 +259,10 @@ class ComputeUnit : public ClockedObject
     Enums::PrefetchType prefetchType;
     EXEC_POLICY exec_policy;

-    bool xact_cas_mode;
     bool debugSegFault;
+    // Idle CU timeout in ticks
+    Tick idleCUTimeout;
+    int idleWfs;
     bool functionalTLB;
     bool localMemBarrier;
@@ -183,91 +278,67 @@ class ComputeUnit : public ClockedObject

     Shader *shader;
     uint32_t barrier_id;
-    // vector of Vector ALU (MACC) pipelines
-    std::vector<WaitClass> aluPipe;
-    // minimum issue period per SIMD unit (in cycles)
-    std::vector<WaitClass> wfWait;
-
-    // Resource control for Vector Register File->Global Memory pipe buses
-    std::vector<WaitClass> vrfToGlobalMemPipeBus;
-    // Resource control for Vector Register File->Local Memory pipe buses
-    std::vector<WaitClass> vrfToLocalMemPipeBus;
-    int nextGlbMemBus;
-    int nextLocMemBus;
-    // Resource control for global memory to VRF data/address bus
-    WaitClass glbMemToVrfBus;
-    // Resource control for local memory to VRF data/address bus
-    WaitClass locMemToVrfBus;
-
-    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
-    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
-    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
-    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load

     Tick req_tick_latency;
     Tick resp_tick_latency;

-    // number of vector registers being reserved for each SIMD unit
+    /**
+     * Number of WFs to schedule to each SIMD. This vector is populated
+     * by hasDispResources(), and consumed by the subsequent call to
+     * dispWorkgroup(), to schedule the specified number of WFs to the
+     * SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
+     */
+    std::vector<int> numWfsToSched;
+
+    // number of currently reserved vector registers per SIMD unit
     std::vector<int> vectorRegsReserved;
+    // number of currently reserved scalar registers per SIMD unit
+    std::vector<int> scalarRegsReserved;
     // number of vector registers per SIMD unit
-    uint32_t numVecRegsPerSimd;
-    // Support for scheduling VGPR status update events
-    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
-    std::vector<uint64_t> timestampVec;
-    std::vector<uint8_t> statusVec;
-
-    void
-    registerEvent(uint32_t simdId,
-                  uint32_t regIdx,
-                  uint32_t operandSize,
-                  uint64_t when,
-                  uint8_t newStatus) {
-        regIdxVec.push_back(std::make_pair(simdId, regIdx));
-        timestampVec.push_back(when);
-        statusVec.push_back(newStatus);
-        if (operandSize > 4) {
-            regIdxVec.push_back(std::make_pair(simdId,
-                                               ((regIdx + 1) %
-                                                numVecRegsPerSimd)));
-            timestampVec.push_back(when);
-            statusVec.push_back(newStatus);
-        }
-    }
-
-    void updateEvents();
+    int numVecRegsPerSimd;
+    // number of available scalar registers per SIMD unit
+    int numScalarRegsPerSimd;
+
     void updateReadyList(int unitId);

     // this hash map will keep track of page divergence
     // per memory instruction per wavefront. The hash map
     // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
     std::map<Addr, int> pagesTouched;

+    void insertInPipeMap(Wavefront *w);
+    void deleteFromPipeMap(Wavefront *w);
+
     ComputeUnit(const Params *p);
     ~ComputeUnit();
-    int spBypassLength() { return spBypassPipeLength; };
-    int dpBypassLength() { return dpBypassPipeLength; };
-    int storeBusLength() { return numCyclesPerStoreTransfer; };
-    int loadBusLength() { return numCyclesPerLoadTransfer; };
-    int wfSize() const { return wavefrontSize; };

-    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+    // Timing Functions
+    int oprNetPipeLength() const { return operandNetworkLength; }
+    int simdUnitWidth() const { return simdWidth; }
+    int spBypassLength() const { return spBypassPipeLength; }
+    int dpBypassLength() const { return dpBypassPipeLength; }
+    int scalarPipeLength() const { return scalarPipeStages; }
+    int storeBusLength() const { return numCyclesPerStoreTransfer; }
+    int loadBusLength() const { return numCyclesPerLoadTransfer; }
+    int wfSize() const { return wavefrontSize; }
+
     void exec();
     void initiateFetch(Wavefront *wavefront);
     void fetch(PacketPtr pkt, Wavefront *wavefront);
-    void fillKernelState(Wavefront *w, NDRange *ndr);
+    void fillKernelState(Wavefront *w, HSAQueueEntry *task);

     void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
-                        NDRange *ndr);
+                        HSAQueueEntry *task, bool fetchContext=false);

-    void StartWorkgroup(NDRange *ndr);
-    int ReadyWorkgroup(NDRange *ndr);
+    void doInvalidate(RequestPtr req, int kernId);
+    void doFlush(GPUDynInstPtr gpuDynInst);
+
+    void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false);
+    bool hasDispResources(HSAQueueEntry *task);

     int cacheLineSize() const { return _cacheLineSize; }
     int getCacheLineBits() const { return cacheLineBits; }

-    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
-    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
-    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
-    int GlbMemUnitId() { return GLBMEM_PIPE; }
-    int ShrMemUnitId() { return LDSMEM_PIPE; }
-    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
-    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
     /* This function cycles through all the wavefronts in all the phases to
     * see if all of the wavefronts which should be associated with one barrier
     * (denoted with _barrier_id), are all at the same barrier in the program
@@ -275,14 +346,15 @@ class ComputeUnit : public ClockedObject
     * return true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

-   template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
+   template<typename c0, typename c1>
+   void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init() override;
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
-   void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+   void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
-                             bool kernelLaunch=true,
+                             bool kernelMemSync,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
@@ -292,7 +364,7 @@ class ComputeUnit : public ClockedObject
|
||||
MasterID masterId() { return _masterId; }
|
||||
|
||||
bool isDone() const;
|
||||
bool isSimdDone(uint32_t) const;
|
||||
bool isVectorAluIdle(uint32_t simdId) const;
|
||||
|
||||
protected:
|
||||
MasterID _masterId;
|
||||
@@ -323,6 +395,44 @@ class ComputeUnit : public ClockedObject
|
||||
Stats::Scalar scalarMemReads;
|
||||
Stats::Formula scalarMemReadsPerWF;
|
||||
|
||||
Stats::Formula vectorMemReadsPerKiloInst;
|
||||
Stats::Formula vectorMemWritesPerKiloInst;
|
||||
Stats::Formula vectorMemInstsPerKiloInst;
|
||||
Stats::Formula scalarMemReadsPerKiloInst;
|
||||
Stats::Formula scalarMemWritesPerKiloInst;
|
||||
Stats::Formula scalarMemInstsPerKiloInst;
|
||||
|
||||
// Cycles required to send register source (addr and data) from
|
||||
// register files to memory pipeline, per SIMD.
|
||||
Stats::Vector instCyclesVMemPerSimd;
|
||||
Stats::Vector instCyclesScMemPerSimd;
|
||||
Stats::Vector instCyclesLdsPerSimd;
|
||||
|
||||
Stats::Scalar globalReads;
|
||||
Stats::Scalar globalWrites;
|
||||
Stats::Formula globalMemInsts;
|
||||
Stats::Scalar argReads;
|
||||
Stats::Scalar argWrites;
|
||||
Stats::Formula argMemInsts;
|
||||
Stats::Scalar spillReads;
|
||||
Stats::Scalar spillWrites;
|
||||
Stats::Formula spillMemInsts;
|
||||
Stats::Scalar groupReads;
|
||||
Stats::Scalar groupWrites;
|
||||
Stats::Formula groupMemInsts;
|
||||
Stats::Scalar privReads;
|
||||
Stats::Scalar privWrites;
|
||||
Stats::Formula privMemInsts;
|
||||
Stats::Scalar readonlyReads;
|
||||
Stats::Scalar readonlyWrites;
|
||||
Stats::Formula readonlyMemInsts;
|
||||
Stats::Scalar kernargReads;
|
||||
Stats::Scalar kernargWrites;
|
||||
Stats::Formula kernargMemInsts;
|
||||
|
||||
int activeWaves;
|
||||
Stats::Distribution waveLevelParallelism;
|
||||
|
||||
void updateInstStats(GPUDynInstPtr gpuDynInst);
|
||||
|
||||
// the following stats compute the avg. TLB accesslatency per
|
||||
@@ -339,21 +449,48 @@ class ComputeUnit : public ClockedObject
|
||||
// over all memory instructions executed over all wavefronts
|
||||
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
|
||||
Stats::Distribution pageDivergenceDist;
|
||||
// count of non-flat global memory vector instructions executed
|
||||
Stats::Scalar dynamicGMemInstrCnt;
|
||||
// count of flat global memory vector instructions executed
|
||||
Stats::Scalar dynamicFlatMemInstrCnt;
|
||||
Stats::Scalar dynamicLMemInstrCnt;
|
||||
|
||||
Stats::Scalar wgBlockedDueLdsAllocation;
|
||||
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
|
||||
// when the instruction is committed, this number is still incremented by 1
|
||||
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
|
||||
// active when the instruction is committed, this number is still
|
||||
// incremented by 1
|
||||
Stats::Scalar numInstrExecuted;
|
||||
// Number of cycles among successive instruction executions across all
|
||||
// wavefronts of the same CU
|
||||
Stats::Distribution execRateDist;
|
||||
// number of individual vector operations executed
|
||||
Stats::Scalar numVecOpsExecuted;
|
||||
// number of individual f16 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF16;
|
||||
// number of individual f32 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF32;
|
||||
// number of individual f64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedF64;
|
||||
// number of individual FMA 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedFMA16;
|
||||
Stats::Scalar numVecOpsExecutedFMA32;
|
||||
Stats::Scalar numVecOpsExecutedFMA64;
|
||||
// number of individual MAC 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedMAC16;
|
||||
Stats::Scalar numVecOpsExecutedMAC32;
|
||||
Stats::Scalar numVecOpsExecutedMAC64;
|
||||
// number of individual MAD 16,32,64 vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedMAD16;
|
||||
Stats::Scalar numVecOpsExecutedMAD32;
|
||||
Stats::Scalar numVecOpsExecutedMAD64;
|
||||
// total number of two op FP vector operations executed
|
||||
Stats::Scalar numVecOpsExecutedTwoOpFP;
|
||||
// Total cycles that something is running on the GPU
|
||||
Stats::Scalar totalCycles;
|
||||
Stats::Formula vpc; // vector ops per cycle
|
||||
Stats::Formula vpc_f16; // vector ops per cycle
|
||||
Stats::Formula vpc_f32; // vector ops per cycle
|
||||
Stats::Formula vpc_f64; // vector ops per cycle
|
||||
Stats::Formula ipc; // vector instructions per cycle
|
||||
Stats::Distribution controlFlowDivergenceDist;
|
||||
Stats::Distribution activeLanesPerGMemInstrDist;
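    // For illustration only (not part of this change): Stats::Formula
    // members such as the *PerKiloInst group above are typically bound once
    // in regStats(), scaling a raw counter by instructions retired, e.g.:
    //
    //     vectorMemReadsPerKiloInst = 1000 * vectorMemReads / numInstrExecuted;
    //     scalarMemInstsPerKiloInst =
    //         1000 * (scalarMemReads + scalarMemWrites) / numInstrExecuted;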
@@ -362,20 +499,16 @@ class ComputeUnit : public ClockedObject
    Stats::Formula numALUInstsExecuted;
    // number of times a WG can not start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    // number of times a WG can not start due to lack of free SGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueSgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;
    Stats::Scalar completedWGs;

    // distribution of the latency difference between first and last cache
    // block arrival ticks
    Stats::Distribution headTailLatency;

    void
    regStats() override;
@@ -389,8 +522,6 @@ class ComputeUnit : public ClockedObject
    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    int cacheLineSize() const { return _cacheLineSize; }

    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

@@ -486,6 +617,56 @@ class ComputeUnit : public ClockedObject

    };

    // Scalar data cache access port
    class ScalarDataPort : public MasterPort
    {
      public:
        ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
                       PortID _index)
            : MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index)
        {
            (void)index;
        }

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override;

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst), saved(sender_state)
            {
            }

            GPUDynInstPtr _gpuDynInst;
            Packet::SenderState *saved;
        };

        class MemReqEvent : public Event
        {
          private:
            ScalarDataPort *scalarDataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
                : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        PortID index;
    };
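    // For illustration only (hypothetical call site, not part of this
    // change): a deferred scalar access would be wrapped in a MemReqEvent
    // and scheduled on the CU's clock, roughly:
    //
    //     auto *evt = new ScalarDataPort::MemReqEvent(scalarDataPort, pkt);
    //     schedule(evt, curTick() + clockPeriod());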

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
@@ -500,10 +681,13 @@ class ComputeUnit : public ClockedObject
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;
            // kernel id to be used in handling I-Cache invalidate response
            int kernId;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
                        *sender_state=nullptr, int _kernId=-1)
                : wavefront(_wavefront), saved(sender_state),
                  kernId(_kernId){ }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;
@@ -575,6 +759,34 @@ class ComputeUnit : public ClockedObject
        virtual void recvReqRetry();
    };

    class ScalarDTLBPort : public MasterPort
    {
      public:
        ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false)
        {
        }

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
            GPUDynInstPtr _gpuDynInst;
        };

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override { assert(false); }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        bool stalled;
    };
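    // For illustration only (not part of this change): the stall flag and
    // retry queue above are meant to be used together, roughly:
    //
    //     if (scalarDTLBPort->isStalled()) {
    //         scalarDTLBPort->retries.push_back(pkt);   // queue for later
    //     } else if (!scalarDTLBPort->sendTimingReq(pkt)) {
    //         scalarDTLBPort->stallPort();               // TLB busy
    //         scalarDTLBPort->retries.push_back(pkt);
    //     }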

    class ITLBPort : public MasterPort
    {
      public:
@@ -710,6 +922,10 @@ class ComputeUnit : public ClockedObject
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the scalar data cache
    ScalarDataPort *scalarDataPort;
    // port to the scalar data TLB
    ScalarDTLBPort *scalarDTLBPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
@@ -726,6 +942,14 @@ class ComputeUnit : public ClockedObject
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "scalar_port") {
            scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
                                                idx), this, idx);
            return *scalarDataPort;
        } else if (if_name == "scalar_tlb_port") {
            scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
                                                this);
            return *scalarDTLBPort;
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
@@ -746,32 +970,18 @@ class ComputeUnit : public ClockedObject
        }
    }

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
    InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    uint64_t globalSeqNum;
    int cacheLineBits;
    InstSeqNum globalSeqNum;
    int wavefrontSize;
    GPUStaticInst *kernelLaunchInst;

    // hold the time of the arrival of the first cache block related to
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last cache block arrival times.
    std::map<GPUDynInstPtr, Tick> headTailMap;
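    // For illustration only (not part of this change): headTailMap pairs
    // with the headTailLatency distribution roughly as follows -- record
    // the first block's arrival tick, then sample the gap when the last
    // block of the access arrives:
    //
    //     auto it = headTailMap.find(gpuDynInst);
    //     if (it == headTailMap.end()) {
    //         headTailMap[gpuDynInst] = curTick();    // first block
    //     } else if (is_last_block) {                 // hypothetical flag
    //         headTailLatency.sample(curTick() - it->second);
    //         headTailMap.erase(it);
    //     }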
};

#endif // __COMPUTE_UNIT_HH__

@@ -34,66 +34,76 @@

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "debug/GPUKernelInfo.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"
#include "sim/syscall_emul_buf.hh"
#include "sim/system.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver),
      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
                false, Event::CPU_Tick_Pri)
GPUDispatcher::GPUDispatcher(const Params *p)
    : SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
      tickEvent([this]{ exec(); },
                "GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
      dispatchActive(false)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernel launched")
        ;
}

GpuDispatcher *GpuDispatcherParams::create()
GPUDispatcher::~GPUDispatcher()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

void
GpuDispatcher::serialize(CheckpointOut &cp) const
GPUDispatcher::regStats()
{
    numKernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernel launched")
        ;

    cyclesWaitingForDispatch
        .name(name() + ".cycles_wait_dispatch")
        .desc("number of cycles with outstanding wavefronts "
              "that are waiting to be dispatched")
        ;
}

HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
    assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
    return hsaQueueEntries[disp_id];
}

void
GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
    gpuCmdProc = gpu_cmd_proc;
}

void
GPUDispatcher::setShader(Shader *new_shader)
{
    shader = new_shader;
}

void
GPUDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);

}

void
GpuDispatcher::unserialize(CheckpointIn &cp)
GPUDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

@@ -102,288 +112,256 @@ GpuDispatcher::unserialize(CheckpointIn &cp)

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
    if (event_tick) {
        schedule(&tickEvent, event_tick);
    }
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
/**
 * After all relevant HSA data structures have been traversed/extracted
 * from memory by the CP, dispatch() is called on the dispatcher. This will
 * schedule a dispatch event that, when triggered, will attempt to dispatch
 * the WGs associated with the given task to the CUs.
 */
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
    AddrRangeList ranges;
    ++numKernelLaunched;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);
    DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
            task->kernelName(), task->dispatchId());

    ranges.push_back(RangeSize(pioAddr, pioSize));
    execIds.push(task->dispatchId());
    dispatchActive = true;
    hsaQueueEntries.emplace(task->dispatchId(), task);

    return ranges;
}
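/**
 * For illustration only (hypothetical call site, not part of this change):
 * once the command processor has unpacked an AQL packet into an
 * HSAQueueEntry, kicking off execution is just:
 *
 *     dispatcher.setCommandProcessor(this);  // once, at init
 *     dispatcher.dispatch(task);             // queue WGs; exec() drains them
 */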

Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->setLE(retval);
    } else {
        offset -= 8;
        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->getLE<uint8_t>();
        break;
      case 2:
        data_val = pkt->getLE<uint16_t>();
        break;
      case 4:
        data_val = pkt->getLE<uint32_t>();
        break;
      case 8:
        data_val = pkt->getLE<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
            pkt->getSize());
#endif
    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused, is
        // used to communicate with the simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update event start time (in nano-seconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // update the numDispTask polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();
        DPRINTF(GPUDisp, "launching kernel %d\n", nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate current task struct
        // first 64 bits are launch reg
        offset -= 8;
        assert(offset < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}


Port &
GpuDispatcher::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getPort(if_name, idx);
}

void
GpuDispatcher::exec()
GPUDispatcher::exec()
{
    int fail_count = 0;
    int fail_count(0);

    // There are potentially multiple outstanding kernel launches.
    // It is possible that the workgroups in a different kernel
    // can fit on the GPU even if another kernel's workgroups cannot
    /**
     * There are potentially multiple outstanding kernel launches.
     * It is possible that the workgroups in a different kernel
     * can fit on the GPU even if another kernel's workgroups cannot
     */
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    if (execIds.size() > 0) {
        ++cyclesWaitingForDispatch;
    }

    /**
     * dispatch work cannot start until the kernel's invalidate is
     * completely finished; hence, the kernel always initiates the
     * invalidate first and waits until it is done
     */
    while (execIds.size() > fail_count) {
        int execId = execIds.front();
        int exec_id = execIds.front();
        auto task = hsaQueueEntries[exec_id];
        bool launched(false);

        while (ndRangeMap[execId].wg_disp_rem) {
            // update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);
        // invalidate is needed before starting dispatch
        if (shader->impl_kern_boundary_sync) {
            // try to invalidate cache
            shader->prepareInvalidate(task);
        } else {
            // kern boundary sync is not set, skip invalidate
            task->markInvDone();
        }

            // attempt to dispatch_workgroup
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed try the next kernel,
                // it may have smaller workgroups.
                // put it on the queue to retry later
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
        /**
         * invalidate is still ongoing, put the kernel on the queue to
         * retry later
         */
        if (!task->isInvDone()){
            execIds.push(exec_id);
            ++fail_count;

            DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
                " invalidate requests\n", exec_id, task->outstandingInvs());

            // try the next kernel_id
            execIds.pop();
            continue;
        }

        // kernel invalidate is done, start workgroup dispatch
        while (!task->dispComplete()) {
            // update the thread context
            shader->updateContext(task->contextId());

            // attempt to dispatch workgroup
            DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
                    curTick(), exec_id);

            if (!shader->dispatchWorkgroups(task)) {
                /**
                 * if we failed try the next kernel,
                 * it may have smaller workgroups.
                 * put it on the queue to retry later
                 */
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
                execIds.push(exec_id);
                ++fail_count;
                break;
            } else if (!launched) {
                launched = true;
                DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
            }
        }
        // let's try the next kernel_id

        // try the next kernel_id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        // wakeup the CPU if any Kernels completed this cycle
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
        doneIds.pop();
    }
}

void
GpuDispatcher::notifyWgCompl(Wavefront *w)
bool
GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;
    int kern_id = wf->kernId;
    assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
    auto task = hsaQueueEntries[kern_id];
    assert(task->dispatchId() == kern_id);

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);
    /**
     * whether the next workgroup is the final one in the kernel;
     * +1 because we check before taking action
     */
    return (task->numWgCompleted() + 1 == task->numWgTotal());
}
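/**
 * For illustration only (not part of this change): an end-of-program
 * instruction would consult this before starting end-of-kernel flushes:
 *
 *     if (dispatcher.isReachingKernelEnd(wf)) {
 *         // last WG in flight; begin the kernel's cache flush/writeback
 *     }
 */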

    if (ndRangeMap[kern_id].addrToNotify) {
        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
                      0);
/**
 * update the counter of outstanding inv requests for the kernel
 * kern_id: kernel id
 * val: +1/-1, increment or decrement the counter (default: -1)
 */
void
GPUDispatcher::updateInvCounter(int kern_id, int val) {
    assert(val == -1 || val == 1);

    auto task = hsaQueueEntries[kern_id];
    task->updateOutstandingInvs(val);

    // kernel invalidate is done, schedule dispatch work
    if (task->isInvDone() && !tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

/**
 * update the counter of outstanding wb requests for the kernel
 * kern_id: kernel id
 * val: +1/-1, increment or decrement the counter (default: -1)
 *
 * return true if all wbs are done for the kernel
 */
bool
GPUDispatcher::updateWbCounter(int kern_id, int val) {
    assert(val == -1 || val == 1);

    auto task = hsaQueueEntries[kern_id];
    task->updateOutstandingWbs(val);

    // true: WB is done, false: WB is still ongoing
    return (task->outstandingWbs() == 0);
}

/**
 * get kernel's outstanding cache writeback requests
 */
int
GPUDispatcher::getOutstandingWbs(int kernId) {
    auto task = hsaQueueEntries[kernId];

    return task->outstandingWbs();
}
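/**
 * For illustration only (not part of this change): a writeback completion
 * handler would pair these helpers roughly as follows:
 *
 *     if (dispatcher.updateWbCounter(kern_id)) {  // default val=-1
 *         // all writebacks for kern_id have drained; finish the kernel
 *     }
 */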

/**
 * When an end program instruction detects that the last WF in
 * a WG has completed it will call this method on the dispatcher.
 * If we detect that this is the last WG for the given task, then
 * we ring the completion signal, which is used by the CPU to
 * synchronize with the GPU. The HSAPP is also notified that the
 * task has completed so it can be removed from its task queues.
 */
void
GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
    int kern_id = wf->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
    auto task = hsaQueueEntries[kern_id];
    assert(task->dispatchId() == kern_id);
    task->notifyWgCompleted();

    DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
            curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);

    if (task->numWgCompleted() == task->numWgTotal()) {
        // Notify the HSA PP that this kernel is complete
        gpuCmdProc->hsaPacketProc()
            .finishPkt(task->dispPktPtr(), task->queueId());
        if (task->completionSignal()) {
            // The signal value is aligned 8 bytes from
            // the actual handle in the runtime
            Addr signal_addr = task->completionSignal() + sizeof(Addr);
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
                    "completion signal: %x!\n", signal_addr);

            /**
             * HACK: The semantics of the HSA signal is to decrement
             * the current signal value. We cheat here and read out
             * the value from main memory using functional access and
             * then just DMA the decremented value. This is because
             * the DMA controller does not currently support GPU
             * atomics.
             */
            auto *tc = gpuCmdProc->system()->threads[0];
            auto &virt_proxy = tc->getVirtProxy();
            TypedBufferArg<Addr> prev_signal(signal_addr);
            prev_signal.copyIn(virt_proxy);

            Addr *new_signal = new Addr;
            *new_signal = (Addr)*prev_signal - 1;

            gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
                                     new_signal, 0);
        } else {
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                    "signal\n");
        }

        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update event end time (in nano-seconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
        DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
                curTick(), kern_id);
        DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

void
GpuDispatcher::scheduleDispatch()
GPUDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}

void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}

// helper functions for driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
GPUDispatcher *GPUDispatcherParams::create()
{
    return shader->cuList.size();
}

int
GpuDispatcher::wfSize() const
{
    return shader->cuList[0]->wfSize();
}

void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
    shader->funcargs_size = funcargs_size;
}

uint32_t
GpuDispatcher::getStaticContextSize() const
{
    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
    return new GPUDispatcher(this);
}

@@ -31,125 +31,69 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DISPATCHER_HH__
#define __GPU_DISPATCHER_HH__
/**
 * @file
 * The GPUDispatcher is the component of the shader that is responsible
 * for creating and dispatching WGs to the compute units. If all WGs in
 * a kernel cannot be dispatched simultaneously, then the dispatcher will
 * keep track of all pending WGs and dispatch them as resources become
 * available.
 */

#ifndef __GPU_COMPUTE_DISPATCHER_HH__
#define __GPU_COMPUTE_DISPATCHER_HH__

#include <queue>
#include <unordered_map>
#include <vector>

#include "base/statistics.hh"
#include "dev/dma_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/ndrange.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/port.hh"
#include "params/GpuDispatcher.hh"
#include "dev/hsa/hsa_packet.hh"
#include "params/GPUDispatcher.hh"
#include "sim/sim_object.hh"

class BaseCPU;
class GPUCommandProcessor;
class HSAQueueEntry;
class Shader;
class Wavefront;

class GpuDispatcher : public DmaDevice
class GPUDispatcher : public SimObject
{
  public:
    typedef GpuDispatcherParams Params;
  public:
    typedef GPUDispatcherParams Params;

    MasterID masterId() { return _masterId; }
    GPUDispatcher(const Params *p);
    ~GPUDispatcher();

  protected:
    MasterID _masterId;
    void serialize(CheckpointOut &cp) const override;
    void unserialize(CheckpointIn &cp) override;
    void regStats() override;
    void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
    void setShader(Shader *new_shader);
    void exec();
    bool isReachingKernelEnd(Wavefront *wf);
    void updateInvCounter(int kern_id, int val=-1);
    bool updateWbCounter(int kern_id, int val=-1);
    int getOutstandingWbs(int kern_id);
    void notifyWgCompl(Wavefront *wf);
    void scheduleDispatch();
    void dispatch(HSAQueueEntry *task);
    HSAQueueEntry* hsaTask(int disp_id);

    // Base and length of PIO register space
    Addr pioAddr;
    Addr pioSize;
    Tick pioDelay;

    HsaQueueEntry curTask;

    std::unordered_map<int, NDRange> ndRangeMap;
    NDRange ndRange;

    // list of kernel_ids to launch
    std::queue<int> execIds;
    // list of kernel_ids that have finished
    std::queue<int> doneIds;

    uint64_t dispatchCount;
    // is there a kernel in execution?
    bool dispatchActive;

    BaseCPU *cpu;
    Shader *shader;
    ClDriver *driver;
    EventFunctionWrapper tickEvent;


    static GpuDispatcher *instance;

    // syscall emulation mode can have only 1 application running(?)
    // else we have to do some pid based tagging
    // unused
    typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
    TranslationBuffer tlb;

  public:
    /*statistics*/
    Stats::Scalar num_kernelLaunched;
    GpuDispatcher(const Params *p);

    ~GpuDispatcher() { }

    void exec();
    virtual void serialize(CheckpointOut &cp) const override;
    virtual void unserialize(CheckpointIn &cp) override;
    void notifyWgCompl(Wavefront *w);
    void scheduleDispatch();
    void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);

    // using singleton so that glue code can pass pointer locations
    // to the dispatcher. when there are multiple dispatchers, we can
    // call something like getInstance(index)
    static void
    setInstance(GpuDispatcher *_instance)
    {
        instance = _instance;
    }

    static GpuDispatcher* getInstance() { return instance; }

    class TLBPort : public MasterPort
    {
      public:

        TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
            : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }

      protected:
        GpuDispatcher *dispatcher;

        virtual bool recvTimingResp(PacketPtr pkt) { return true; }
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry() { }

    };

    TLBPort *tlbPort;

    Port &getPort(const std::string &if_name,
                  PortID idx=InvalidPortID) override;

    AddrRangeList getAddrRanges() const override;
    Tick read(PacketPtr pkt) override;
    Tick write(PacketPtr pkt) override;

    // helper functions to retrieve/set GPU attributes
    int getNumCUs();
    int wfSize() const;
    void setFuncargsSize(int funcargs_size);

    /** Returns the size of the static hardware context of a wavefront */
    uint32_t getStaticContextSize() const;
  private:
    Shader *shader;
    GPUCommandProcessor *gpuCmdProc;
    EventFunctionWrapper tickEvent;
    std::unordered_map<int, HSAQueueEntry*> hsaQueueEntries;
    // list of kernel_ids to launch
    std::queue<int> execIds;
    // list of kernel_ids that have finished
    std::queue<int> doneIds;
    // is there a kernel in execution?
    bool dispatchActive;
    /*statistics*/
    Stats::Scalar numKernelLaunched;
    Stats::Scalar cyclesWaitingForDispatch;
};

#endif // __GPU_DISPATCHER_HH__
#endif // __GPU_COMPUTE_DISPATCHER_HH__

@@ -33,13 +33,15 @@

#include "gpu-compute/exec_stage.hh"

#include <sstream>

#include "base/trace.hh"
#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
    numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
    vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
    shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false),
    thisTimeInstExecuted(false), instrExecuted(false),
    executionResourcesUsed(0)
{
@@ -53,37 +55,18 @@ ExecStage::init(ComputeUnit *cu)
    computeUnit = cu;
    _name = computeUnit->name() + ".ExecStage";
    dispatchList = &computeUnit->dispatchList;
    vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
    glbMemInstAvail = &(computeUnit->glbMemInstAvail);
    shrMemInstAvail = &(computeUnit->shrMemInstAvail);
    idle_dur = 0;
}

void
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
    if (stage == IdleExec) {
        // count cycles of no vector ALU instruction executed
        // even if one was the oldest in a WV of that vector SIMD unit
        if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
            numCyclesWithNoInstrTypeIssued[unitId]++;
        }

        // count cycles of no global memory (vector) instruction executed
        // even if one was the oldest in a WV of that vector SIMD unit
        if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
            numCyclesWithNoInstrTypeIssued[unitId]++;
            (*glbMemInstAvail)--;
        }

        // count cycles of no shared memory (vector) instruction executed
        // even if one was the oldest in a WV of that vector SIMD unit
        if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
            numCyclesWithNoInstrTypeIssued[unitId]++;
            (*shrMemInstAvail)--;
        }
        // count cycles when no instruction to a specific execution resource
        // is executed
        numCyclesWithNoInstrTypeIssued[unitId]++;
    } else if (stage == BusyExec) {
        // count the number of cycles an instruction to a specific unit
        // was issued
        // count the number of cycles an instruction to a specific execution
        // resource type was issued
        numCyclesWithInstrTypeIssued[unitId]++;
        thisTimeInstExecuted = true;
        instrExecuted = true;
@@ -102,14 +85,13 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
    }

    lastTimeInstExecuted = thisTimeInstExecuted;
    // track the number of cycles we either issued one vector instruction
    // or issued no instructions at all
    // track the number of cycles we either issued at least one
    // instruction or issued no instructions at all
    if (instrExecuted) {
        numCyclesWithInstrIssued++;
    } else {
        numCyclesWithNoIssue++;
    }

    spc.sample(executionResourcesUsed);
    }
}
@@ -122,25 +104,86 @@ ExecStage::initStatistics()
    thisTimeInstExecuted = false;
}

std::string
ExecStage::dispStatusToStr(int i)
{
    std::string s("INVALID");
    switch (i) {
      case EMPTY:
        s = "EMPTY";
        break;
      case SKIP:
        s = "SKIP";
        break;
      case EXREADY:
        s = "EXREADY";
        break;
    }
    return s;
}

void
ExecStage::dumpDispList()
{
    std::stringstream ss;
    bool empty = true;
    for (int i = 0; i < computeUnit->numExeUnits(); i++) {
        DISPATCH_STATUS s = dispatchList->at(i).second;
        ss << i << ": " << dispStatusToStr(s);
        if (s != EMPTY) {
            empty = false;
            Wavefront *w = dispatchList->at(i).first;
            ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
            ss << (w->instructionBuffer.front())->seqNum() << ": ";
            ss << (w->instructionBuffer.front())->disassemble();
        }
        ss << "\n";
    }
    if (!empty) {
        DPRINTF(GPUSched, "Dispatch List:\n%s", ss.str());
    }
}

void
ExecStage::exec()
{
    initStatistics();

    for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
        // if dispatch list for this execution resource is empty,
        // skip this execution resource this cycle
        if (dispatchList->at(unitId).second == EMPTY) {
            collectStatistics(IdleExec, unitId);
            continue;
        }

        collectStatistics(BusyExec, unitId);
        // execute an instruction for the WF
        dispatchList->at(unitId).first->exec();
        // clear the dispatch list entry
        dispatchList->at(unitId).second = EMPTY;
        dispatchList->at(unitId).first = (Wavefront*)nullptr;
    if (Debug::GPUSched) {
        dumpDispList();
    }
    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
        DISPATCH_STATUS s = dispatchList->at(unitId).second;
        switch (s) {
          case EMPTY:
            // Do not execute if empty, waiting for VRF reads,
            // or LM tied to GM waiting for VRF reads
            collectStatistics(IdleExec, unitId);
            break;
          case EXREADY:
          {
            collectStatistics(BusyExec, unitId);
            Wavefront *w = dispatchList->at(unitId).first;
            DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
                    unitId, w->simdId, w->wfDynId,
                    (w->instructionBuffer.front())->disassemble());
            DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
            dispatchList->at(unitId).first->exec();
            (computeUnit->scheduleStage).deleteFromSch(w);
            dispatchList->at(unitId).second = EMPTY;
            dispatchList->at(unitId).first->freeResources();
            dispatchList->at(unitId).first = nullptr;
            break;
          }
          case SKIP:
            collectStatistics(BusyExec, unitId);
            DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
            dispatchList->at(unitId).second = EMPTY;
            dispatchList->at(unitId).first->freeResources();
            dispatchList->at(unitId).first = nullptr;
            break;
          default:
            panic("Unknown dispatch status in exec()\n");
        }
    }

    collectStatistics(PostExec, 0);
@@ -165,7 +208,7 @@ ExecStage::regStats()
    ;

    spc
        .init(0, numSIMDs + numMemUnits, 1)
        .init(0, computeUnit->numExeUnits(), 1)
        .name(name() + ".spc")
        .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
        ;
@@ -177,25 +220,36 @@ ExecStage::regStats()
    ;

    numCyclesWithInstrTypeIssued
        .init(numSIMDs + numMemUnits)
        .name(name() + ".num_cycles_with_instrtype_issue")
        .desc("Number of cycles at least one instruction of specific type "
              "issued")
        .init(computeUnit->numExeUnits())
        .name(name() + ".num_cycles_issue_exec_rsrc")
        .desc("Number of cycles at least one instruction issued to "
              "execution resource type")
        ;

    numCyclesWithNoInstrTypeIssued
        .init(numSIMDs + numMemUnits)
        .name(name() + ".num_cycles_with_instr_type_no_issue")
        .desc("Number of cycles no instruction of specific type issued")
        .init(computeUnit->numExeUnits())
        .name(name() + ".num_cycles_no_issue_exec_rsrc")
        .desc("Number of clks no instructions issued to execution "
              "resource type")
        ;

    for (int i = 0; i < numSIMDs; ++i) {
        numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d", i));
        numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d", i));
    int c = 0;
    for (int i = 0; i < computeUnit->numVectorALUs; i++, c++) {
        std::string s = "VectorALU" + std::to_string(i);
        numCyclesWithNoInstrTypeIssued.subname(c, s);
        numCyclesWithInstrTypeIssued.subname(c, s);
    }
    for (int i = 0; i < computeUnit->numScalarALUs; i++, c++) {
        std::string s = "ScalarALU" + std::to_string(i);
        numCyclesWithNoInstrTypeIssued.subname(c, s);
        numCyclesWithInstrTypeIssued.subname(c, s);
    }
    numCyclesWithNoInstrTypeIssued.subname(c, "VectorMemPipe");
    numCyclesWithInstrTypeIssued.subname(c++, "VectorMemPipe");

    numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
    numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
    numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
    numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
    numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
    numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");

    numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
    numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}

@@ -35,6 +35,7 @@
#define __EXEC_STAGE_HH__

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

@@ -53,8 +54,9 @@ enum STAT_STATUS

enum DISPATCH_STATUS
{
    EMPTY = 0,
    FILLED
    EMPTY = 0, // no wave present in dispatchList slot
    EXREADY,   // wave ready for execution
    SKIP,      // extra memory resource needed, Shared Mem. only
};
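// For illustration only (not part of this change): a dispatchList slot
// cycles EMPTY -> EXREADY (or SKIP) in the schedule stage, then back to
// EMPTY in ExecStage::exec() once the wave issues (EXREADY) or the extra
// local-memory resource bubble is consumed (SKIP).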

// Execution stage.
@@ -72,18 +74,21 @@ class ExecStage
    void init(ComputeUnit *cu);
    void exec();

    std::string dispStatusToStr(int j);
    void dumpDispList();

    std::string name() { return _name; }
    void regStats();
    // number of idle cycles
    Stats::Scalar numCyclesWithNoIssue;
    // number of busy cycles
    Stats::Scalar numCyclesWithInstrIssued;
    // number of cycles (per execution unit) during which at least one
    // instruction was issued to that unit
    // number of cycles during which at least one
    // instruction was issued to an execution resource type
    Stats::Vector numCyclesWithInstrTypeIssued;
    // number of idle cycles (per execution unit) during which the unit issued
    // no instruction targeting that unit, even though there is at least one
    // Wavefront with such an instruction as the oldest
    // number of idle cycles during which the scheduler
    // issued no instructions targeting a specific
    // execution resource type
    Stats::Vector numCyclesWithNoInstrTypeIssued;
    // SIMDs active per cycle
    Stats::Distribution spc;
@@ -92,11 +97,6 @@ class ExecStage
    void collectStatistics(enum STAT_STATUS stage, int unitId);
    void initStatistics();
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;

    // Number of memory execution resources;
    // both global and local memory execution resources in CU
    uint32_t numMemUnits;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
@@ -108,18 +108,12 @@ class ExecStage
    // dispatchList is used to communicate between schedule
    // and exec stage
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer
    std::vector<bool> *vectorAluInstAvail;
    int *glbMemInstAvail;
    int *shrMemInstAvail;
    bool lastTimeInstExecuted;
    bool thisTimeInstExecuted;
    bool instrExecuted;
    Stats::Scalar numTransActiveIdle;
    Stats::Distribution idleDur;
    uint32_t executionResourcesUsed;
    int executionResourcesUsed;
    uint64_t idle_dur;
    std::string _name;
};

@@ -36,18 +36,18 @@
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"

FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
    computeUnit(nullptr)
FetchStage::FetchStage(const ComputeUnitParams* p) :
    numVectorALUs(p->num_SIMDs), computeUnit(nullptr)
{
    for (int j = 0; j < numSIMDs; ++j) {
    for (int j = 0; j < numVectorALUs; ++j) {
        FetchUnit newFetchUnit(p);
        fetchUnit.push_back(newFetchUnit);
        _fetchUnit.push_back(newFetchUnit);
    }
}

FetchStage::~FetchStage()
{
    fetchUnit.clear();
    _fetchUnit.clear();
}

void
@@ -56,17 +56,17 @@ FetchStage::init(ComputeUnit *cu)
    computeUnit = cu;
    _name = computeUnit->name() + ".FetchStage";

    for (int j = 0; j < numSIMDs; ++j) {
        fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
        fetchUnit[j].init(computeUnit);
    for (int j = 0; j < numVectorALUs; ++j) {
        _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
        _fetchUnit[j].init(computeUnit);
    }
}

void
FetchStage::exec()
{
    for (int j = 0; j < numSIMDs; ++j) {
        fetchUnit[j].exec();
    for (int j = 0; j < numVectorALUs; ++j) {
        _fetchUnit[j].exec();
    }
}

@@ -83,13 +83,13 @@ FetchStage::processFetchReturn(PacketPtr pkt)

    instFetchInstReturned.sample(num_instructions);
    uint32_t simdId = wavefront->simdId;
    fetchUnit[simdId].processFetchReturn(pkt);
    _fetchUnit[simdId].processFetchReturn(pkt);
}

void
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
{
    fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
    _fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}

void

@@ -62,14 +62,15 @@ class FetchStage
    std::string name() { return _name; }
    void regStats();
    Stats::Distribution instFetchInstReturned;
    FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }

  private:
    uint32_t numSIMDs;
    int numVectorALUs;
    ComputeUnit *computeUnit;

    // List of fetch units. A fetch unit is
    // instantiated per SIMD
    std::vector<FetchUnit> fetchUnit;
    // instantiated per VALU/SIMD
    std::vector<FetchUnit> _fetchUnit;
    std::string _name;
};


@@ -45,11 +45,9 @@

uint32_t FetchUnit::globalFetchUnitID;

FetchUnit::FetchUnit(const ComputeUnitParams* params) :
    timingSim(true),
    computeUnit(nullptr),
    fetchScheduler(params),
    waveList(nullptr)
FetchUnit::FetchUnit(const ComputeUnitParams* params)
    : timingSim(true), computeUnit(nullptr), fetchScheduler(params),
      waveList(nullptr), fetchDepth(params->fetch_depth)
{
}

@@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu)
    timingSim = computeUnit->shader->timingSim;
    fetchQueue.clear();
    fetchStatusQueue.resize(computeUnit->shader->n_wf);
    fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());

    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
        fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
    for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
        Wavefront *wf = waveList->at(i);
        assert(wf->wfSlotId == i);
        fetchStatusQueue[i] = std::make_pair(wf, false);
        fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
        fetchBuf[i].decoder(&decoder);
    }

    fetchScheduler.bindList(&fetchQueue);
@@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu)
void
FetchUnit::exec()
{
    /**
     * now we check if any of the fetch buffers have
     * buffered instruction data that can be decoded
     * and sent to its wavefront's instruction buffer.
     * then we check if any of the fetch buffer entries
     * can be released; we only check for release when
     * a buffer has no free space left.
     */
    for (auto &fetch_buf : fetchBuf) {
        if (!fetch_buf.hasFreeSpace()) {
            fetch_buf.checkWaveReleaseBuf();
        }
        if (fetch_buf.hasFetchDataToProcess()) {
            fetch_buf.decodeInsts();
        }
    }
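    /**
     * For illustration only (a condensed view of the fetch buffer
     * lifecycle; helper names as used in this change):
     *
     *     reserveBuf(line_addr)    // initiateFetch: claim a slot for a line
     *     reservedBuf(vaddr)       // fetch: point the packet at that slot
     *     decodeInsts()            // exec: drain decoded insts to the IB
     *     checkWaveReleaseBuf()    // exec: free lines the wave moved past
     */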
|
||||
|
||||
// re-evaluate waves which are marked as not ready for fetch
|
||||
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
|
||||
// Following code assumes 64-bit opertaion and all insts are
|
||||
@@ -88,9 +108,10 @@ FetchUnit::exec()
|
||||
// 4 or less instructions and it can not have any branches to
|
||||
// prevent speculative instruction fetches
|
||||
if (!fetchStatusQueue[j].second) {
|
||||
if (curWave->status == Wavefront::S_RUNNING &&
|
||||
curWave->instructionBuffer.size() <= 4 &&
|
||||
!curWave->instructionBufferHasBranch() &&
|
||||
if ((curWave->getStatus() == Wavefront::S_RUNNING ||
|
||||
curWave->getStatus() == Wavefront::S_WAITCNT) &&
|
||||
fetchBuf[j].hasFreeSpace() &&
|
||||
!curWave->stopFetch() &&
|
||||
!curWave->pendingFetch) {
|
||||
fetchQueue.push_back(curWave);
|
||||
fetchStatusQueue[j].second = true;
|
||||
@@ -111,45 +132,38 @@ FetchUnit::exec()
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
    // calculate the virtual address to fetch from the SQC
    Addr vaddr = wavefront->pc();
    assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());

    /**
     * the instruction buffer holds one instruction per entry, regardless
     * of the underlying instruction's size. the PC, however, addresses
     * instructions on a 32b granularity so we must account for that here.
     */
    for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
        vaddr +=
            wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
    }
    vaddr = wavefront->basePtr + vaddr;
     * calculate the virtual address to fetch from the SQC. the fetch
     * buffer holds a configurable number of cache lines. we start
     * fetching at the address of the cache line immediately following
     * the buffered line(s).
     */
    Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();

    // this should already be aligned to a cache line
    assert(vaddr == makeLineAddress(vaddr,
        computeUnit->getCacheLineBits()));

    // shouldn't be fetching a line that is already buffered
    assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));

    fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
            "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);

    // Since this is an instruction prefetch, if you're split then just finish
    // out the current line.
    int block_size = computeUnit->cacheLineSize();
    // check for split accesses
    Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
    int size = block_size;

    if (split_addr > vaddr) {
        // misaligned access, just grab the rest of the line
        size = split_addr - vaddr;
    }

    // set up virtual request
    RequestPtr req = std::make_shared<Request>(
        vaddr, size, Request::INST_FETCH,
        vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
        computeUnit->masterId(), 0, 0, nullptr);

    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
    // This fetchBlock is kind of faux right now - because the translations so
    // far don't actually return Data
    uint64_t fetchBlock;
    pkt->dataStatic(&fetchBlock);

    if (timingSim) {
        // SenderState needed on Return
@@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
            pkt->req->getPaddr());

    // this is necessary because the GPU TLB receives packets instead of
    // requests. when the translation is complete, all relevant fields in the
    // request will be populated, but not in the packet. here we create the
    // new packet so we can set the size, addr, and proper flags.
    /**
     * this is necessary because the GPU TLB receives packets instead of
     * requests. when the translation is complete, all relevant fields in
     * the request will be populated, but not in the packet. here we create
     * the new packet so we can set the size, addr, and proper flags.
     */
    PacketPtr oldPkt = pkt;
    pkt = new Packet(oldPkt->req, oldPkt->cmd);
    delete oldPkt;

    TheGpuISA::RawMachInst *data =
        new TheGpuISA::RawMachInst[pkt->req->getSize() /
        sizeof(TheGpuISA::RawMachInst)];

    pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
    /**
     * we should have reserved an entry in the fetch buffer
     * for this cache line. here we get the pointer to the
     * entry used to buffer this request's line data.
     */
    pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
        .reservedBuf(pkt->req->getVaddr()));

    // New SenderState for the memory access
    pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
@@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
    Wavefront *wavefront = sender_state->wavefront;

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
        "%d bytes, %d instructions!\n", computeUnit->cu_id,
        wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
        pkt->req->getSize(), pkt->req->getSize() /
        sizeof(TheGpuISA::RawMachInst));
        "%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
        wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());

    if (wavefront->dropFetch) {
        assert(wavefront->instructionBuffer.empty());
        assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
        wavefront->dropFetch = false;
    } else {
        TheGpuISA::RawMachInst *inst_index_ptr =
            (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();

        assert(wavefront->instructionBuffer.size() <= 4);

        for (int i = 0; i < pkt->req->getSize() /
             sizeof(TheGpuISA::RawMachInst); ++i) {
            GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);

            assert(inst_ptr);

            if (inst_ptr->instSize() == 8) {
                /**
                 * this instruction occupies 2 consecutive
                 * entries in the instruction array, the
                 * second of which contains a nullptr. so if
                 * this inst is 8 bytes we advance two entries
                 * instead of 1
                 */
                ++i;
            }

            DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
                computeUnit->cu_id, wavefront->simdId,
                wavefront->wfSlotId, inst_ptr->disassemble());

            GPUDynInstPtr gpuDynInst =
                std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
                    computeUnit->getAndIncSeqNum());

            wavefront->instructionBuffer.push_back(gpuDynInst);
        }
        fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
    }

    wavefront->pendingFetch = false;
@@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
    delete pkt;
}

void
FetchUnit::flushBuf(int wfSlotId)
{
    fetchBuf.at(wfSlotId).flushBuf();
}

void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    waveList = wave_list;
}

/** FetchBufDesc */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    wavefront = wf;
    fetchDepth = fetch_depth;
    maxIbSize = wavefront->maxIbSize;
    cacheLineSize = cache_line_size;
    maxFbSize = cacheLineSize * fetchDepth;

    // Calculate the number of bits to address a cache line
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    readPtr = bufStart;
    bufEnd = bufStart + maxFbSize;

    for (int i = 0; i < fetchDepth; ++i) {
        freeList.emplace_back(readPtr + i * cacheLineSize);
    }
}

void
FetchUnit::FetchBufDesc::flushBuf()
{
    restartFromBranch = true;
    /**
     * the free list may still have some entries in it,
     * so we clear it here to avoid duplicates
     */
    freeList.clear();
    bufferedPCs.clear();
    reservedPCs.clear();
    readPtr = bufStart;

    for (int i = 0; i < fetchDepth; ++i) {
        freeList.push_back(bufStart + i * cacheLineSize);
    }

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
        "buffer\n", wavefront->simdId, wavefront->wfSlotId,
        wavefront->wfDynId);
}

Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    Addr next_line = 0;

    if (bufferedAndReservedLines()) {
        Addr last_line_fetched = 0;
        if (!reservedLines()) {
            /**
             * get the PC of the most recently fetched cache line,
             * then return the address of the next line.
             */
            last_line_fetched = bufferedPCs.rbegin()->first;
        } else {
            last_line_fetched = reservedPCs.rbegin()->first;
        }

        next_line = last_line_fetched + cacheLineSize;

        /**
         * should not be trying to fetch a line that has already
         * been fetched.
         */
        assert(bufferedPCs.find(next_line) == bufferedPCs.end());
        assert(reservedPCs.find(next_line) == reservedPCs.end());
    } else {
        /**
         * we do not have any buffered cache lines yet, so we
         * assume this is the initial fetch, or the first fetch
         * after a branch, and get the PC directly from the WF.
         * in the case of a branch, we may not start at the
         * beginning of a cache line, so we adjust the readPtr by
         * the current PC's offset from the start of the line.
         */
        next_line = makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * if we are here we have no buffered lines. in case we flushed
         * the buffer due to a branch, we may need to start fetching at
         * some offset from the start of the fetch buffer, so we adjust
         * for that here.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset
                = wavefront->pc() - makeLineAddress(wavefront->pc(),
                    cacheLineBits);
            readPtr += byte_offset;
        }
    }

    return next_line;
}
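
As a worked illustration of the line math nextFetchAddr() relies on, here is a
minimal standalone sketch; lineAddress() is a hypothetical stand-in for the
makeLineAddress() helper assumed above, and the addresses are made up.

#include <cassert>
#include <cstdint>

using Addr = uint64_t;

// mask off the low 'line_bits' bits so the address points at the start
// of its cache line (e.g., line_bits = 6 for 64B lines); this mirrors
// what makeLineAddress() is assumed to do above
static Addr lineAddress(Addr addr, int line_bits)
{
    return addr & ~((Addr(1) << line_bits) - 1);
}

int main()
{
    // a PC of 0x107c with 64B lines maps to line 0x1040; the next
    // line to fetch would therefore be 0x1080
    assert(lineAddress(0x107c, 6) == 0x1040);
    assert(lineAddress(0x107c, 6) + 64 == 0x1080);
}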

void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    // we should have free buffer space, and the line
    // at vaddr should not already be cached.
    assert(hasFreeSpace());
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());
    assert(bufferedAndReservedLines() < fetchDepth);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
        "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
        wavefront->wfDynId, vaddr);

    /**
     * we reserve buffer space by moving it out of the
     * free list; however, we do not mark the buffered
     * line as valid until the fetch unit for this buffer
     * has received the response from the memory system.
     */
    uint8_t *inst_buf = freeList.front();
    reservedPCs.emplace(vaddr, inst_buf);
    freeList.pop_front();
}

void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
        wavefront->simdId, wavefront->wfSlotId,
        wavefront->wfDynId, vaddr);

    /**
     * this address should have an entry reserved in the
     * fetch buffer already, however it should be invalid
     * until the fetch completes.
     */
    auto reserved_pc = reservedPCs.find(vaddr);
    assert(reserved_pc != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_pc->second);

    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_pc->second = nullptr;
    reservedPCs.erase(reserved_pc);
}

bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
    return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
}

void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    Addr cur_wave_pc = roundDown(wavefront->pc(),
        wavefront->computeUnit->cacheLineSize());
    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
            "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
        "(PC = %#x) can be released.\n", wavefront->simdId,
        wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
        wavefront->pc());

#ifdef DEBUG
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
            "removing it from the fetch buffer.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId,
            oldest_buffered_pc->first);

        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            bufferedLines());
    }
}
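
A toy example (not gem5 code) of the std::map property that nextFetchAddr()
and checkWaveReleaseBuf() above lean on: iteration is in ascending key order,
so begin() always names the lowest, i.e. oldest, buffered line address,
regardless of insertion order.

#include <cassert>
#include <cstdint>
#include <map>

int main()
{
    std::map<uint64_t, int> buffered_pcs;
    buffered_pcs.emplace(0x1080, 1);
    buffered_pcs.emplace(0x1040, 0); // inserted later, but lower address

    // begin() is the lowest (oldest) buffered line, so it is the one
    // that may be released once the wave's PC has moved past it
    assert(buffered_pcs.begin()->first == 0x1040);
}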

void
FetchUnit::FetchBufDesc::decodeInsts()
{
    assert(readPtr);

    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            TheGpuISA::MachInst mach_inst
                = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            readPtr += gpu_static_inst->instSize();

            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst
                = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                               wavefront, gpu_static_inst,
                                               wavefront->computeUnit->
                                                   getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                "%d bytes remain.\n", wavefront->simdId,
                wavefront->wfSlotId, wavefront->wfDynId,
                gpu_static_inst->disassemble(),
                gpu_static_inst->instSize(),
                fetchBytesRemaining());
        }
    }
}

void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    for (int i = 0; i < num_dwords; ++i) {
        ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst
        = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst
        = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                       wavefront, gpu_static_inst,
                                       wavefront->computeUnit->
                                           getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
        "(%d bytes). %d bytes remain in %d buffered lines.\n",
        wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
        gpu_static_inst->disassemble(), split_inst,
        gpu_static_inst->instSize(), fetchBytesRemaining(),
        bufferedLines());
}

bool
FetchUnit::FetchBufDesc::splitDecode() const
{
    /**
     * if a read of a raw instruction would go beyond the end
     * of the fetch buffer, then we must perform a split decode.
     */
    bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;

    return is_split;
}
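
The wrap-around case that splitDecode() detects can be illustrated with a
standalone sketch. The buffer, offsets, and values below are hypothetical, and
a little-endian host is assumed, but the idea is the same one decodeSplitInst()
implements: the two dwords of one 8B raw instruction straddle the end of a
circular buffer and must be reassembled into a contiguous temporary before
they can be decoded.

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    uint8_t buf[16];

    // pretend the last dword of the buffer and the first dword of the
    // buffer together hold one 8B raw instruction
    uint32_t lo = 0xdeadbeef, hi = 0xcafef00d;
    std::memcpy(buf + 12, &lo, 4); // dword at the end of the buffer
    std::memcpy(buf + 0, &hi, 4);  // dword wrapped to the start

    // reassemble the split instruction into a contiguous temporary
    uint64_t raw_inst = 0;
    std::memcpy(reinterpret_cast<uint8_t*>(&raw_inst), buf + 12, 4);
    std::memcpy(reinterpret_cast<uint8_t*>(&raw_inst) + 4, buf, 4);

    // on a little-endian host the low dword is the one read first
    assert((raw_inst & 0xffffffff) == 0xdeadbeef);
    assert((raw_inst >> 32) == 0xcafef00d);
}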

int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());
    return bytes_remaining;
}

@@ -36,7 +36,6 @@

#include <string>
#include <utility>
#include <vector>

#include "arch/gpu_decoder.hh"
#include "base/statistics.hh"
@@ -58,9 +57,170 @@ class FetchUnit
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void processFetchReturn(PacketPtr pkt);
    void flushBuf(int wfSlotId);
    static uint32_t globalFetchUnitID;

  private:
    /**
     * fetch buffer descriptor. holds buffered
     * instruction data in the fetch unit.
     */
    class FetchBufDesc
    {
      public:
        FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr),
            readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0),
            cacheLineSize(0), restartFromBranch(false), wavefront(nullptr),
            _decoder(nullptr)
        {
        }

        ~FetchBufDesc()
        {
            delete[] bufStart;
        }

        /**
         * allocate the fetch buffer space, and set the fetch depth
         * (number of lines that may be buffered), fetch size
         * (cache line size), and parent WF for this fetch buffer.
         */
        void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf);

        int
        bufferedAndReservedLines() const
        {
            return bufferedLines() + reservedLines();
        }

        int bufferedLines() const { return bufferedPCs.size(); }
        int bufferedBytes() const { return bufferedLines() * cacheLineSize; }
        int reservedLines() const { return reservedPCs.size(); }
        bool hasFreeSpace() const { return !freeList.empty(); }
        void flushBuf();
        Addr nextFetchAddr();

        /**
         * reserve an entry in the fetch buffer for PC = vaddr.
         */
        void reserveBuf(Addr vaddr);

        /**
         * return a pointer to the raw fetch buffer data.
         * this allows the fetch pkt to use this data directly
         * to avoid unnecessary memcpy and malloc/new.
         */
        uint8_t*
        reservedBuf(Addr vaddr) const
        {
            auto reserved_pc = reservedPCs.find(vaddr);
            assert(reserved_pc != reservedPCs.end());
            assert(reserved_pc == reservedPCs.begin());

            return reserved_pc->second;
        }

        void fetchDone(Addr vaddr);

        /**
         * checks if the buffer contains valid data. this essentially
         * tells fetch when there is data remaining that needs to be
         * decoded into the WF's IB.
         */
        bool hasFetchDataToProcess() const;

        /**
         * each time the fetch stage is ticked, we check if there
         * are any data in the fetch buffer that may be decoded and
         * sent to the IB. because we are modeling the fetch buffer
         * as a circular buffer, it is possible that an instruction
         * can straddle the end/beginning of the fetch buffer, so
         * decodeSplitInst() handles that case.
         */
        void decodeInsts();

        /**
         * checks if the wavefront can release any of its fetch
         * buffer entries. this will occur when the WF's PC goes
         * beyond any of the currently buffered cache lines.
         */
        void checkWaveReleaseBuf();

        void
        decoder(TheGpuISA::Decoder *dec)
        {
            _decoder = dec;
        }

        bool
        pcBuffered(Addr pc) const
        {
            bool buffered = bufferedPCs.find(pc) != bufferedPCs.end()
                            && reservedPCs.find(pc) != reservedPCs.end();

            return buffered;
        }

        /**
         * calculates the number of fetched bytes that have yet
         * to be decoded.
         */
        int fetchBytesRemaining() const;

      private:
        void decodeSplitInst();

        /**
         * check if the next instruction to be processed out of
         * the fetch buffer is split across the end/beginning of
         * the fetch buffer.
         */
        bool splitDecode() const;

        /**
         * the set of PCs (fetch addresses) that are currently
         * buffered. bufferedPCs are valid, reservedPCs are
         * waiting for their buffers to be filled with valid
         * fetch data.
         */
        std::map<Addr, uint8_t*> bufferedPCs;
        std::map<Addr, uint8_t*> reservedPCs;

        /**
         * represents the fetch buffer free list. holds buffer space
         * that is currently free. each pointer in this array must
         * have enough space to hold a cache line. in reality we
         * have one actual fetch buffer: 'bufStart', these pointers
         * point to addresses within bufStart that are aligned to the
         * cache line size.
         */
        std::deque<uint8_t*> freeList;

        /**
         * raw instruction buffer. holds cache line data associated with
         * the set of PCs (fetch addresses) that are buffered here.
         */
        uint8_t *bufStart;
        uint8_t *bufEnd;
        /**
         * pointer that points to the next chunk of inst data to be
         * decoded.
         */
        uint8_t *readPtr;
        // how many lines the fetch unit may buffer
        int fetchDepth;
        // maximum size (in number of insts) of the WF's IB
        int maxIbSize;
        // maximum size (in bytes) of this fetch buffer
        int maxFbSize;
        int cacheLineSize;
        int cacheLineBits;
        bool restartFromBranch;
        // wavefront whose IB is serviced by this fetch buffer
        Wavefront *wavefront;
        TheGpuISA::Decoder *_decoder;
    };

    bool timingSim;
    ComputeUnit *computeUnit;
    TheGpuISA::Decoder decoder;
@@ -82,6 +242,15 @@ class FetchUnit

    // Pointer to list of waves dispatched on to this SIMD unit
    std::vector<Wavefront*> *waveList;
    // holds the fetch buffers. each wave has 1 entry.
    std::vector<FetchBufDesc> fetchBuf;
    /**
     * number of cache lines we can fetch and buffer.
     * this includes the currently fetched line (i.e., the
     * line that corresponds to the WF's current PC), as
     * well as any lines that may be prefetched.
     */
    int fetchDepth;
};

#endif // __FETCH_UNIT_HH__
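
The free-list layout described in the comments above can be sketched in
isolation: one flat allocation carved into line-sized chunks, with every
free-list entry pointing into that same allocation. The LineBuffer class
below is a hypothetical reduction for illustration, not the gem5
FetchBufDesc.

#include <cassert>
#include <cstdint>
#include <deque>

struct LineBuffer
{
    LineBuffer(int depth, int line_size)
        : start(new uint8_t[depth * line_size])
    {
        // carve the flat buffer into line-sized chunks; each free-list
        // entry is just a pointer into 'start'
        for (int i = 0; i < depth; ++i)
            free_list.push_back(start + i * line_size);
    }
    ~LineBuffer() { delete[] start; }

    uint8_t *start;
    std::deque<uint8_t*> free_list;
};

int main()
{
    LineBuffer fb(4, 64); // 4 lines deep, 64B lines
    assert(fb.free_list.size() == 4);
    // every entry aliases the single flat allocation
    assert(fb.free_list[1] == fb.start + 64);
}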

@@ -31,12 +31,13 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/global_memory_pipeline.hh"

#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
@@ -44,7 +45,7 @@

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
    outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
    maxWaveRequests(p->max_wave_requests), inflightStores(0),
    inflightLoads(0)
{
}
@@ -76,6 +77,31 @@ GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
    return true;
}

void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

void
GlobalMemPipeline::exec()
{
@@ -87,42 +113,60 @@ GlobalMemPipeline::exec()

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->isLoad() || m->isAtomicRet())) {
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf =
            w->computeUnit->vrf[w->simdId]->
            vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);

    }

    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
        accessVrf && m->statusBitVector == VectorMask(0) &&
        (computeUnit->shader->coissue_return ||
         computeUnit->wfWait.at(m->pipeId).rdy())) {
        accessVrf && (computeUnit->shader->coissue_return ||
         computeUnit->vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        // Decrement outstanding register count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        Tick accessTime = curTick() - m->getAccessTime();

        if (m->isStore() || m->isAtomic()) {
            // Decrement outstanding requests count
            computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit->shader->sampleStore(accessTime);
            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                             m->time, -1);
        }

        if (m->isLoad() || m->isAtomic()) {
        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit->shader->sampleLoad(accessTime);
            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                             m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit->glbMemToVrfBus.set(m->time);
        if (!computeUnit->shader->coissue_return)
            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
        w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If pipeline has executed a global memory instruction
@@ -148,13 +192,13 @@ GlobalMemPipeline::exec()
            mp->disassemble(), mp->seqNum());
    // Memfences will not return tokens and must be issued so we should
    // not request one as this will deplete the token count until deadlock
    if (!mp->isMemFence()) {
    if (!mp->isMemSync()) {
        assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
        mp->computeUnit()->getTokenManager()->acquireTokens(1);
    }
    mp->initiateAcc(mp);

    if (!outOfOrderDataDelivery && !mp->isMemFence()) {
    if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
        /**
         * if we are not in out-of-order data delivery mode
         * then we keep the responses sorted in program order.
@@ -178,19 +222,11 @@ GlobalMemPipeline::exec()

GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (outOfOrderDataDelivery) {
        if (!gmReturnedLoads.empty()) {
            return gmReturnedLoads.front();
        } else if (!gmReturnedStores.empty()) {
            return gmReturnedStores.front();
        }
    } else {
        if (!gmOrderedRespBuffer.empty()) {
            auto mem_req = gmOrderedRespBuffer.begin();
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

            if (mem_req->second.second) {
                return mem_req->second.first;
            }
        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

@@ -208,51 +244,33 @@ GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
        --inflightStores;
    }

    if (outOfOrderDataDelivery) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            assert(!gmReturnedLoads.empty());
            gmReturnedLoads.pop();
        } else if (gpuDynInst->isStore()) {
            assert(!gmReturnedStores.empty());
            gmReturnedStores.pop();
        }
    } else {
        // we should only pop the oldest request, and it
        // should be marked as done if we are here
        assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
        assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
        assert(gmOrderedRespBuffer.begin()->second.second);
        // remove this instruction from the buffer by its
        // unique seq ID
        gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
    }
    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    if (outOfOrderDataDelivery) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            assert(isGMLdRespFIFOWrRdy());
            gmReturnedLoads.push(gpuDynInst);
        } else {
            assert(isGMStRespFIFOWrRdy());
            gmReturnedStores.push(gpuDynInst);
        }
    } else {
        auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
        // if we are getting a response for this mem request,
        // then it ought to already be in the ordered response
        // buffer
        assert(mem_req != gmOrderedRespBuffer.end());
        mem_req->second.second = true;
    }
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

void

@@ -60,52 +60,34 @@ class GlobalMemPipeline
    void init(ComputeUnit *cu);
    void exec();

    std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
    std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }

    /**
     * find the next ready response to service. for OoO mode we
     * simply pop the oldest (based on when the response was
     * received) response in the response FIFOs. for in-order mode
     * we pop the oldest (in program order) response, and only if
     * it is marked as done.
     * Find the next ready response to service. In order to ensure
     * that no waitcnts are violated, we pop the oldest (in program order)
     * response, and only if it is marked as done. This is because waitcnt
     * values expect memory operations to complete and decrement their
     * counter values in program order.
     */
    GPUDynInstPtr getNextReadyResp();

    /**
     * once a memory request is finished we remove it from the
     * buffer. this method determines which response buffer
     * we're using based on the mode (in-order vs. OoO).
     * buffer.
     */
    void completeRequest(GPUDynInstPtr gpuDynInst);

    /**
     * issues a request to the pipeline - i.e., enqueue it
     * in the request buffer.
     * Issues a request to the pipeline (i.e., enqueue it
     * in the request buffer).
     */
    void issueRequest(GPUDynInstPtr gpuDynInst);

    /**
     * this method handles responses sent to this GM pipeline by the
     * CU. in the case of in-order delivery it simply marks the request
     * as done in the ordered buffer to indicate that the request is
     * finished. for out-of-order data delivery, the requests are enqueued
     * (in the order in which they are received) in the response FIFOs.
     * This method handles responses sent to this GM pipeline by the
     * CU. Simply marks the request as done in the ordered buffer to
     * indicate that the request is finished.
     */
    void handleResponse(GPUDynInstPtr gpuDynInst);

    bool
    isGMLdRespFIFOWrRdy() const
    {
        return gmReturnedLoads.size() < gmQueueSize;
    }

    bool
    isGMStRespFIFOWrRdy() const
    {
        return gmReturnedStores.size() < gmQueueSize;
    }

    bool
    isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
    {
@@ -114,7 +96,6 @@ class GlobalMemPipeline

    const std::string &name() const { return _name; }
    void regStats();

    void
    incLoadVRFBankConflictCycles(int num_cycles)
    {
@@ -122,12 +103,15 @@ class GlobalMemPipeline
    }

    bool coalescerReady(GPUDynInstPtr mp) const;
    bool outstandingReqsCheck(GPUDynInstPtr mp) const;

    void acqCoalescerToken(GPUDynInstPtr mp);

  private:
    ComputeUnit *computeUnit;
    std::string _name;
    int gmQueueSize;
    bool outOfOrderDataDelivery;
    int maxWaveRequests;

    // number of cycles of delaying the update of a VGPR that is the
    // target of a load instruction (or the load component of an atomic)
@@ -143,12 +127,11 @@ class GlobalMemPipeline
    int globalMemSize;

    /*
     * this buffer holds the memory responses when in-order data
     * delivery is used - the responses are ordered by their unique
     * sequence number, which is monotonically increasing. when a
     * memory request returns its "done" flag is set to true. during
     * each tick the GM pipeline will check if the oldest request
     * is finished, and if so it will be removed from the queue.
     * This buffer holds the memory responses in order - the responses
     * are ordered by their unique sequence number, which is monotonically
     * increasing. When a memory request returns, its "done" flag is set to
     * true. During each tick the GM pipeline will check if the oldest
     * request is finished, and if so it will be removed from the queue.
     *
     * key: memory instruction's sequence ID
     *
@@ -161,14 +144,6 @@ class GlobalMemPipeline
    // Global Memory Request FIFO: all global memory requests
    // are issued to this FIFO from the memory pipelines
    std::queue<GPUDynInstPtr> gmIssuedRequests;

    // Global Store Response FIFO: all responses of global memory
    // stores are sent to this FIFO from TCP
    std::queue<GPUDynInstPtr> gmReturnedStores;

    // Global Load Response FIFO: all responses of global memory
    // loads are sent to this FIFO from TCP
    std::queue<GPUDynInstPtr> gmReturnedLoads;
};

#endif // __GLOBAL_MEMORY_PIPELINE_HH__
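
The in-order completion scheme the comments above describe can be reduced to a
few lines. The types below are hypothetical stand-ins (a plain int for
GPUDynInstPtr), but the mark-done/retire-oldest discipline is the same one
gmOrderedRespBuffer enforces: responses are keyed by a monotonically
increasing sequence number, marked done on return, and only the oldest entry
may retire, which preserves waitcnt ordering.

#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

using SeqNum = uint64_t;
using Inst = int; // hypothetical stand-in for GPUDynInstPtr

std::map<SeqNum, std::pair<Inst, bool>> ordered_resp_buf;

// a response came back: mark it done, but do not retire it yet
void markDone(SeqNum seq) { ordered_resp_buf.at(seq).second = true; }

// retire only the oldest (program-order) entry, and only if it is done
bool retireOldest()
{
    auto oldest = ordered_resp_buf.begin();
    if (oldest == ordered_resp_buf.end() || !oldest->second.second)
        return false;
    ordered_resp_buf.erase(oldest);
    return true;
}

int main()
{
    ordered_resp_buf[1] = {10, false};
    ordered_resp_buf[2] = {20, false};
    markDone(2);             // a younger op finishes first...
    assert(!retireOldest()); // ...but cannot retire out of order
    markDone(1);
    assert(retireOldest());  // now seq 1 retires, then seq 2
    assert(retireOldest());
}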

215
src/gpu-compute/gpu_command_processor.cc
Normal file
@@ -0,0 +1,215 @@
/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#include "gpu-compute/gpu_command_processor.hh"

#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "gpu-compute/dispatcher.hh"
#include "params/GPUCommandProcessor.hh"

GPUCommandProcessor::GPUCommandProcessor(const Params *p)
    : HSADevice(p), dispatcher(*p->dispatcher)
{
    dispatcher.setCommandProcessor(this);
}

/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
 * After the HSAPP receives and extracts an AQL packet, it sends
 * it to the CP, which is responsible for gathering all relevant
 * information about a task, initializing CU state, and sending
 * it to the dispatcher for WG creation and dispatch.
 *
 * First we need to capture all information from the AQL pkt and
 * the code object, then store it in an HSAQueueEntry. Once the
 * packet and code are extracted, we extract information from the
 * queue descriptor that the CP needs to perform state initialization
 * on the CU. Finally we call dispatch() to send the task to the
 * dispatcher. When the task completely finishes, we call finishPkt()
 * on the HSA packet processor in order to remove the packet from the
 * queue, and notify the runtime that the task has completed.
 */
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    /**
     * we need to read a pointer in the application's address
     * space to pull out the kernel code descriptor.
     */
    auto *tc = sys->threads[0];
    auto &virt_proxy = tc->getVirtProxy();

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary, and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field,
     * which provides the byte offset (positive or negative) from the
     * address of the amd_kernel_code_t to the start of the machine
     * instructions.
     */
    AMDKernelCode akc;
    virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
        sizeof(AMDKernelCode));

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
        "kernel object\n", akc.kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc.kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
        machine_code_addr);

    Addr kern_name_addr(0);
    virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
        (uint8_t*)&kern_name_addr, 0x8);

    std::string kernel_name;
    virt_proxy.readString(kernel_name, kern_name_addr);

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
        disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
        disp_pkt->grid_size_x, disp_pkt->grid_size_y,
        disp_pkt->grid_size_z, disp_pkt->kernarg_address,
        disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
        "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
        "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
        task->numScalarRegs(), task->codeAddr(), 0, 0);

    initABI(task);
    ++dynamic_task_id;
}
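
For concreteness, a worked example of the entry-point computation performed
above, with made-up addresses; kernel_code_entry_byte_offset is a signed byte
offset from the amd_kernel_code_t descriptor to the first machine instruction,
so the offset value here is only illustrative.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t kernel_object = 0x7f0000001000; // addr of amd_kernel_code_t
    int64_t entry_byte_offset = 256;         // illustrative offset value

    // the machine code entry point is the descriptor address plus the
    // (possibly negative) byte offset read from the descriptor itself
    uint64_t machine_code_addr = kernel_object + entry_byte_offset;
    assert(machine_code_addr == 0x7f0000001100);
}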

/**
 * submitVendorPkt() is for accepting vendor-specific packets from
 * the HSAPP. Vendor-specific packets may be used by the runtime to
 * send commands to the HSA device that are specific to a particular
 * vendor. The vendor-specific packets should be defined by the vendor
 * in the runtime.
 */

/**
 * TODO: For now we simply tell the HSAPP to finish the packet,
 * however a future patch will update this method to provide
 * the proper handling of any required vendor-specific packets.
 * In the version of ROCm that is currently supported (1.6)
 * the runtime will send packets that direct the CP to
 * invalidate the GPU's caches. We do this automatically on
 * each kernel launch in the CU, so this is safe for now.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs to the CUs.
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    dispatcher.dispatch(task);
}

/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state.
 * Information provided by the MQD, AQL packet, and code object
 * metadata will be used to initialize register file state.
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
        &readDispIdOffEvent->readDispIdOffset);
}

System*
GPUCommandProcessor::system()
{
    return sys;
}

AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    AddrRangeList ranges;
    return ranges;
}

void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}

Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}

GPUCommandProcessor*
GPUCommandProcessorParams::create()
{
    return new GPUCommandProcessor(this);
}

165
src/gpu-compute/gpu_command_processor.hh
Normal file
@@ -0,0 +1,165 @@
/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

/**
 * @file
 * The GPUCommandProcessor (CP) is responsible for accepting commands, in
 * the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP
 * works with several components, including the HSAPP and the dispatcher.
 * When the HSAPP sends a ready task to the CP, it will perform the necessary
 * operations to extract relevant data structures from memory, such as the
 * AQL queue descriptor and AQL packet, and initialize register state for the
 * task's wavefronts.
 */

#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__

#include "dev/hsa/hsa_device.hh"
#include "gpu-compute/hsa_queue_entry.hh"

struct GPUCommandProcessorParams;
class GPUDispatcher;
class Shader;

class GPUCommandProcessor : public HSADevice
{
  public:
    typedef GPUCommandProcessorParams Params;

    GPUCommandProcessor() = delete;
    GPUCommandProcessor(const Params *p);

    void setShader(Shader *shader);
    Shader* shader();

    void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                           Addr host_pkt_addr) override;
    void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                         Addr host_pkt_addr) override;
    void dispatchPkt(HSAQueueEntry *task);

    Tick write(PacketPtr pkt) override { return 0; }
    Tick read(PacketPtr pkt) override { return 0; }
    AddrRangeList getAddrRanges() const override;
    System *system();

  private:
    Shader *_shader;
    GPUDispatcher &dispatcher;

    void initABI(HSAQueueEntry *task);

    /**
     * Perform a DMA read of the read_dispatch_id_field_base_byte_offset
     * field, which follows directly after the read_dispatch_id (the read
     * pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor
     * (MQD)), to find the base address of the MQD. The MQD is the runtime's
     * soft representation of a HW queue descriptor (HQD).
     *
     * Any fields below the read dispatch ID in the amd_hsa_queue_t should
     * not change according to the HSA standard, therefore we should be able
     * to get them based on their known relative position to the read dispatch
     * ID.
     */
    class ReadDispIdOffsetDmaEvent : public DmaCallback
    {
      public:
        ReadDispIdOffsetDmaEvent(GPUCommandProcessor &gpu_cmd_proc,
                                 HSAQueueEntry *task)
            : DmaCallback(), readDispIdOffset(0), gpuCmdProc(gpu_cmd_proc),
              _task(task)
        {
        }

        void
        process() override
        {
            /**
             * Now that the read pointer's offset from the base of
             * the MQD is known, we can use that to calculate the
             * address of the MQD itself; the dispatcher will
             * DMA that into the HSAQueueEntry when a kernel is
             * launched.
             */
            _task->hostAMDQueueAddr
                = gpuCmdProc.hsaPP->getQueueDesc(_task->queueId())
                    ->hostReadIndexPtr - readDispIdOffset;

            /**
             * DMA a copy of the MQD into the task. Some fields of
             * the MQD will be used to initialize register state.
             */
            auto *mqdDmaEvent = new MQDDmaEvent(gpuCmdProc, _task);
            gpuCmdProc.dmaReadVirt(_task->hostAMDQueueAddr,
                                   sizeof(_amd_queue_t), mqdDmaEvent,
                                   &_task->amdQueue);
        }

        uint32_t readDispIdOffset;

      private:
        GPUCommandProcessor &gpuCmdProc;
        HSAQueueEntry *_task;
    };

    /**
     * Perform a DMA read of the MQD that corresponds to a hardware
     * queue descriptor (HQD). We store a copy of the MQD in the
     * HSAQueueEntry object so we can send a copy of it along with
     * a dispatch packet, which is needed to initialize register
     * state.
     */
    class MQDDmaEvent : public DmaCallback
    {
      public:
        MQDDmaEvent(GPUCommandProcessor &gpu_cmd_proc, HSAQueueEntry *task)
            : DmaCallback(), gpuCmdProc(gpu_cmd_proc), _task(task)
        {
        }

        void
        process() override
        {
            gpuCmdProc.dispatchPkt(_task);
        }

      private:
        GPUCommandProcessor &gpuCmdProc;
        HSAQueueEntry *_task;
    };
};

#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
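
A worked example, with hypothetical addresses, of the MQD base computation
ReadDispIdOffsetDmaEvent::process() performs: subtracting the DMA'd
read_dispatch_id_field_base_byte_offset from the read pointer's host address
recovers the base of the memory queue descriptor.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t host_read_index_ptr = 0x7f00000020a8; // addr of read_dispatch_id
    uint32_t read_disp_id_offset = 0xa8;           // its offset within the MQD

    // the MQD base is the read pointer's address minus the read
    // pointer's own offset within the MQD
    uint64_t mqd_base = host_read_index_ptr - read_disp_id_offset;
    assert(mqd_base == 0x7f0000002000);
}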

417
src/gpu-compute/gpu_compute_driver.cc
Normal file
@@ -0,0 +1,417 @@
/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Sooraj Puthoor
 *          Anthony Gutierrez
 */

#include "gpu-compute/gpu_compute_driver.hh"

#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "dev/hsa/hsa_device.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_ioctl.h"
#include "params/GPUComputeDriver.hh"
#include "sim/syscall_emul_buf.hh"

GPUComputeDriver::GPUComputeDriver(Params *p)
    : HSADriver(p)
{
    DPRINTF(GPUDriver, "Constructing KFD: device\n");
}

int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
    auto &virt_proxy = tc->getVirtProxy();

    switch (req) {
      case AMDKFD_IOC_GET_VERSION:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

            TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
            args->major_version = 1;
            args->minor_version = 0;

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_CREATE_QUEUE:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
        }
        break;
      case AMDKFD_IOC_DESTROY_QUEUE:
        {
            TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
                "queue offset %d\n", args->queue_id);
            device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id);
        }
        break;
      case AMDKFD_IOC_SET_MEMORY_POLICY:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
        }
        break;
      case AMDKFD_IOC_GET_CLOCK_COUNTERS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

            TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            // Set nanosecond resolution
            args->system_clock_freq = 1000000000;

            /**
             * Derive all clock counters based on the tick. All
             * device clocks are identical and perfectly in sync.
             */
            uint64_t elapsed_nsec = curTick() / SimClock::Int::ns;
            args->gpu_clock_counter = elapsed_nsec;
            args->cpu_clock_counter = elapsed_nsec;
            args->system_clock_counter = elapsed_nsec;

            args.copyOut(virt_proxy);
        }
        break;
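
A minimal sketch of the clock derivation in the GET_CLOCK_COUNTERS handler
above, assuming gem5's default 1 THz tick rate; the constants below are
illustrative stand-ins, not values read from the simulator.

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t ticks_per_sec = 1000000000000ULL; // assumed 1 THz ticks
    const uint64_t ticks_per_ns = ticks_per_sec / 1000000000ULL;

    // dividing the current tick by ticks-per-ns yields the nanosecond
    // count reported identically for every clock counter
    uint64_t cur_tick = 5000; // 5000 ticks at 1 THz == 5 ns
    uint64_t elapsed_nsec = cur_tick / ticks_per_ns;
    assert(elapsed_nsec == 5);
}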
      case AMDKFD_IOC_GET_PROCESS_APERTURES:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_args> args(ioc_buf);
            args->num_of_nodes = 1;

            /**
             * Set the GPUVM/LDS/Scratch APEs exactly as they
             * are in the real driver, see the KFD driver
             * in the ROCm Linux kernel source:
             * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
             */
            for (int i = 0; i < args->num_of_nodes; ++i) {
                /**
                 * While the GPU node numbers start at 0, we add 1
                 * to force the count to start at 1. This is to
                 * ensure that the base/limit addresses are
                 * calculated correctly.
                 */
                args->process_apertures[i].scratch_base
                    = scratchApeBase(i + 1);
                args->process_apertures[i].scratch_limit =
                    scratchApeLimit(args->process_apertures[i].scratch_base);

                args->process_apertures[i].lds_base = ldsApeBase(i + 1);
                args->process_apertures[i].lds_limit =
                    ldsApeLimit(args->process_apertures[i].lds_base);

                args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1);
                args->process_apertures[i].gpuvm_limit =
                    gpuVmApeLimit(args->process_apertures[i].gpuvm_base);

                // NOTE: Must match ID populated by hsaTopology.py
                args->process_apertures[i].gpu_id = 2765;

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
                DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_limit);

                DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_base);
                DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_limit);

                DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_base);
                DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_limit);

                /**
                 * The CPU's 64b address space can only use the
                 * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0,
                 * therefore we must ensure that the apertures do not
                 * fall in the CPU's address space.
                 */
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                       47) != 0);
            }

            args.copyOut(virt_proxy);
        }
        break;
|
||||
case AMDKFD_IOC_UPDATE_QUEUE:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_CREATE_EVENT:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_DESTROY_EVENT:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_SET_EVENT:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_RESET_EVENT:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_WAIT_EVENTS:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_DBG_REGISTER:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_DBG_UNREGISTER:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_DBG_ADDRESS_WATCH:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_DBG_WAVE_CONTROL:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
|
||||
}
|
||||
break;
|
||||
case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
|
||||
{
|
||||
warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
|
||||
}
|
||||
break;
      case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
        }
        break;
      case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n");
        }
        break;
      case AMDKFD_IOC_SET_CU_MASK:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
        }
        break;
      case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE"
                 "\n");
        }
        break;
      case AMDKFD_IOC_SET_TRAP_HANDLER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
        }
        break;
      case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
        {
            DPRINTF(GPUDriver,
                    "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                ioc_args(ioc_buf);

            ioc_args.copyIn(virt_proxy);
            ioc_args->num_of_nodes = 1;

            for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                TypedBufferArg<kfd_process_device_apertures> ape_args
                    (ioc_args->kfd_process_device_apertures_ptr);

                ape_args->scratch_base = scratchApeBase(i + 1);
                ape_args->scratch_limit =
                    scratchApeLimit(ape_args->scratch_base);
                ape_args->lds_base = ldsApeBase(i + 1);
                ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);
                ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);

                ape_args->gpu_id = 2765;

                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0);

                ape_args.copyOut(virt_proxy);
            }

            ioc_args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_GET_DMABUF_INFO:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
        }
        break;
      case AMDKFD_IOC_IMPORT_DMABUF:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
        }
        break;
      case AMDKFD_IOC_GET_TILE_CONFIG:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
        }
        break;
      case AMDKFD_IOC_IPC_IMPORT_HANDLE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n");
        }
        break;
      case AMDKFD_IOC_IPC_EXPORT_HANDLE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n");
        }
        break;
      case AMDKFD_IOC_CROSS_MEMORY_COPY:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n");
        }
        break;
      case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n");
        }
        break;
      default:
        fatal("bad ioctl %d\n", req);
        break;
    }
    return 0;
}

Addr
GPUComputeDriver::gpuVmApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x1000000000000L;
}

Addr
GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
}

Addr
GPUComputeDriver::scratchApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x100000000L;
}

Addr
GPUComputeDriver::scratchApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}

Addr
GPUComputeDriver::ldsApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x0;
}

Addr
GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}

GPUComputeDriver*
GPUComputeDriverParams::create()
{
    return new GPUComputeDriver(this);
}
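To make the aperture formulas concrete, a standalone worked example for node 1 (the ioctl handlers pass i + 1, so node numbering starts at 1); this sketch simply reproduces the LDS base/limit arithmetic above and checks the VA[63:47] property the asserts rely on:

#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

typedef uint64_t Addr;

// Same formulas as GPUComputeDriver::ldsApeBase/ldsApeLimit above.
static Addr ldsApeBase(int gpuNum) { return ((Addr)gpuNum << 61) + 0x0; }
static Addr ldsApeLimit(Addr base)
{
    return (base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFF;
}

int main()
{
    Addr base = ldsApeBase(1);      // 0x2000000000000000
    Addr limit = ldsApeLimit(base); // 0x20000000FFFFFFFF

    // VA[63:47] of the base is 0x4000: neither 0 nor 0x1ffff, so the APE
    // sits in the non-canonical "hole" x86-64 CPU code never uses.
    assert((base >> 47) != 0 && (base >> 47) != 0x1ffff);

    std::printf("LDS APE for node 1: [%#" PRIx64 ", %#" PRIx64 "]\n",
                base, limit);
    return 0;
}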
83
src/gpu-compute/gpu_compute_driver.hh
Normal file
@@ -0,0 +1,83 @@
/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Sooraj Puthoor
 *          Anthony Gutierrez
 */
/**
 * @file
 * The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
 * agent. Other GPU devices, or other HSA agents, should not derive
 * from this class. Instead, device-specific implementations of an
 * HSADriver should be provided for each unique device.
 */

#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

#include "dev/hsa/hsa_driver.hh"

struct GPUComputeDriverParams;

class GPUComputeDriver final : public HSADriver
{
  public:
    typedef GPUComputeDriverParams Params;
    GPUComputeDriver(Params *p);
    int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

  private:
    /**
     * The aperture (APE) base/limit pairs are set
     * statically at startup by the real KFD. AMD
     * x86_64 CPUs only use the areas in the 64b
     * address space where VA[63:47] == 0x1ffff or
     * VA[63:47] == 0. These methods generate the APE
     * base/limit pairs in exactly the same way as
     * the real KFD does, which ensures these APEs do
     * not fall into the CPU's address space.
     *
     * See the macros in the KFD driver in the ROCm
     * Linux kernel source:
     *
     * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
     */
    Addr gpuVmApeBase(int gpuNum) const;
    Addr gpuVmApeLimit(Addr apeBase) const;
    Addr scratchApeBase(int gpuNum) const;
    Addr scratchApeLimit(Addr apeBase) const;
    Addr ldsApeBase(int gpuNum) const;
    Addr ldsApeLimit(Addr apeBase) const;
};

#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
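For context, emulated applications reach this driver through the standard Linux KFD ABI. A minimal guest-side sketch, assuming the device registers as /dev/kfd and that the upstream uapi header linux/kfd_ioctl.h provides the request macro and argument struct (error handling elided; this is illustrative, not part of the change):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <cstdio>
#include <linux/kfd_ioctl.h>

int main()
{
    int fd = open("/dev/kfd", O_RDWR);
    if (fd < 0)
        return 1;

    // The AMDKFD_IOC_GET_VERSION handler above always reports 1.0.
    kfd_ioctl_get_version_args args = {};
    if (ioctl(fd, AMDKFD_IOC_GET_VERSION, &args) == 0) {
        std::printf("KFD version %u.%u\n", args.major_version,
                    args.minor_version);
    }
    close(fd);
    return 0;
}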
@@ -35,26 +35,50 @@

#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, uint64_t instSeqNum)
    : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
      n_reg(0), useContinuation(false),
      statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum)
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
      (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum)
{
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
    d_data = new uint8_t[computeUnit()->wfSize() * 16];
    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
    a_data = new uint8_t[computeUnit()->wfSize() * 8];
    x_data = new uint8_t[computeUnit()->wfSize() * 8];
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available GCN3 ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)];
    for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
        scalar_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
        a_data[i] = 0;
        x_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
    for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
        d_data[i] = 0;
    }
    time = 0;

    cu_id = _cu->cu_id;
    if (_wf) {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    } else {
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    }
}
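As a quick sanity check on those buffer sizes, a standalone sketch of the same arithmetic; the 64-lane wavefront width is an assumption standing in for wfSize() (64 is GCN3's width), and the 16-Dword limit corresponds to the widest scalar load (e.g., s_load_dwordx16):

#include <cstdint>
#include <cstdio>

int main()
{
    const int wf_size = 64; // assumed wavefront width (wfSize() in gem5)

    // Up to 4 vector operands of up to 8 bytes (double) per lane.
    std::size_t d_bytes = wf_size * 4 * sizeof(double); // 2048 bytes
    // Scalar loads move at most 16 Dwords.
    std::size_t s_bytes = 16 * sizeof(uint32_t);        // 64 bytes

    std::printf("d_data = %zu bytes, scalar_data = %zu bytes\n",
                d_bytes, s_bytes);
    return 0;
}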
GPUDynInst::~GPUDynInst()
@@ -62,6 +86,8 @@ GPUDynInst::~GPUDynInst()
    delete[] d_data;
    delete[] a_data;
    delete[] x_data;
    delete[] scalar_data;
    delete _staticInst;
}

void
@@ -82,6 +108,36 @@ GPUDynInst::numDstRegOperands()
    return _staticInst->numDstRegOperands();
}

int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}

int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}

int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}

int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}

int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}

int
GPUDynInst::getNumOperands()
{
@@ -100,12 +156,6 @@ GPUDynInst::isScalarRegister(int operandIdx)
    return _staticInst->isScalarRegister(operandIdx);
}

bool
GPUDynInst::isCondRegister(int operandIdx)
{
    return _staticInst->isCondRegister(operandIdx);
}

int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
@@ -130,13 +180,82 @@ GPUDynInst::isSrcOperand(int operandIdx)
    return _staticInst->isSrcOperand(operandIdx);
}

bool
GPUDynInst::hasSourceSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasSourceVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasDestinationSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::srcIsVgpr(int index) const
{
    assert(index >= 0 && index < _staticInst->getNumOperands());
    if (_staticInst->isVectorRegister(index) &&
        _staticInst->isSrcOperand(index)) {
        return true;
    }
    return false;
}

bool
GPUDynInst::hasDestinationVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
           _staticInst->opcode().find(extStr) != std::string::npos;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}

const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}

uint64_t
InstSeqNum
GPUDynInst::seqNum() const
{
    return _seqNum;
@@ -148,6 +267,40 @@ GPUDynInst::executedAs()
    return _staticInst->executed_as;
}

bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}
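A minimal sketch of how a scoreboard-style issue check might use these dependence helpers; the function name and call site are hypothetical, and the GPUDynInstPtr instances are assumed to come from gem5's gpu_dyn_inst.hh:

// Block 'younger' from issuing if it reads a vector or scalar register
// that the older in-flight instruction will still write (RAW hazard).
bool
mustStall(GPUDynInstPtr younger, GPUDynInstPtr older)
{
    return younger->hasVgprRawDependence(older) ||
           younger->hasSgprRawDependence(older);
}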
// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
@@ -156,12 +309,15 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->initiateAcc(gpuDynInst);
    time = 0;
}

void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
            "%#x\n complete",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->completeAcc(gpuDynInst);
}

@@ -181,12 +337,42 @@ GPUDynInst::isBranch() const
    return _staticInst->isBranch();
}

bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}

bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}

bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}

bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}

bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}

bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}

bool
GPUDynInst::isReturn() const
{
@@ -218,9 +404,9 @@ GPUDynInst::isBarrier() const
}

bool
GPUDynInst::isMemFence() const
GPUDynInst::isMemSync() const
{
    return _staticInst->isMemFence();
    return _staticInst->isMemSync();
}

bool
@@ -265,6 +451,12 @@ GPUDynInst::isAtomicRet() const
    return _staticInst->isAtomicRet();
}

bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}

bool
GPUDynInst::isScalar() const
{
@@ -295,6 +487,78 @@ GPUDynInst::writesVCC() const
    return _staticInst->writesVCC();
}

bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}

bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}

bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}

bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}

bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}
bool
GPUDynInst::writesExecMask() const
{
    // scan all operands; the exec mask may not be operand 0
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isDstOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::readsExecMask() const
{
    // scan all operands; the exec mask may not be operand 0
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isSrcOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}
bool
GPUDynInst::writesFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::readsFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::isAtomicAnd() const
{
@@ -420,72 +684,6 @@ GPUDynInst::isSpillSeg() const
    return _staticInst->isSpillSeg();
}

bool
GPUDynInst::isWorkitemScope() const
{
    return _staticInst->isWorkitemScope();
}

bool
GPUDynInst::isWavefrontScope() const
{
    return _staticInst->isWavefrontScope();
}

bool
GPUDynInst::isWorkgroupScope() const
{
    return _staticInst->isWorkgroupScope();
}

bool
GPUDynInst::isDeviceScope() const
{
    return _staticInst->isDeviceScope();
}

bool
GPUDynInst::isSystemScope() const
{
    return _staticInst->isSystemScope();
}

bool
GPUDynInst::isNoScope() const
{
    return _staticInst->isNoScope();
}

bool
GPUDynInst::isRelaxedOrder() const
{
    return _staticInst->isRelaxedOrder();
}

bool
GPUDynInst::isAcquire() const
{
    return _staticInst->isAcquire();
}

bool
GPUDynInst::isRelease() const
{
    return _staticInst->isRelease();
}

bool
GPUDynInst::isAcquireRelease() const
{
    return _staticInst->isAcquireRelease();
}

bool
GPUDynInst::isNoOrder() const
{
    return _staticInst->isNoOrder();
}

bool
GPUDynInst::isGloballyCoherent() const
{
@@ -498,12 +696,240 @@ GPUDynInst::isSystemCoherent() const
    return _staticInst->isSystemCoherent();
}

bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}

bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}

bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}

bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}

bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}

bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}

void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());
    // find the segment of the first active address, after
    // that we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // we are in the "hole", this is a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs, so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. because we don't have an
                // explicit range for the global segment, we just make
                // sure that each address falls in no other APE and that
                // it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                       && bits(addr[lane], 63, 47)));
            }
        }
    }
}
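A compact standalone sketch of the same classification logic, with illustrative APE ranges matching the node-1 formulas earlier in this change (assumed constants, not read from a live simulator; the GPUVM branch is omitted since the model rejects it anyway):

#include <cstdint>
#include <cstdio>

constexpr uint64_t ldsBase      = 0x2000000000000000ULL;
constexpr uint64_t ldsLimit     = 0x20000000FFFFFFFFULL;
constexpr uint64_t scratchBase  = 0x2000000100000000ULL;
constexpr uint64_t scratchLimit = 0x20000001FFFFFFFFULL;

const char *classify(uint64_t va)
{
    uint64_t hi = va >> 47; // VA[63:47]
    if (va >= ldsBase && va <= ldsLimit)
        return "group (LDS)";
    if (va >= scratchBase && va <= scratchLimit)
        return "private (scratch)";
    if (hi != 0 && hi != 0x1ffff)
        return "hole: memory violation";
    return "global";
}

int main()
{
    std::printf("%s\n", classify(0x2000000000001000ULL)); // group (LDS)
    std::printf("%s\n", classify(0x00007f0000001000ULL)); // global
    return 0;
}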
void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);

    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures: registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped; their values are set by the kernel mode driver.
         * These apertures use addresses that are not used by x86 CPUs.
         * When the address of a Flat operation falls into one of the
         * apertures, the Flat operation is redirected to either LDS or
         * to the private memory segment.
         *
         * For private memory, the SW runtime will allocate some space in
         * the VA space for each AQL queue. The base address of that space
         * is stored in scalar registers per the AMD GPU ABI. The amd_queue_t
         * scratch_backing_memory_location provides the base address in
         * memory for the queue's private segment. Various other fields
         * loaded into register state during kernel launch specify per-WF
         * and per-work-item offsets so that individual lanes may access
         * their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */

        uint32_t numSgprs = wavefront()->maxSgprs;
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}
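A worked example of the per-lane scratch rewrite in the SC_PRIVATE branch; the sizes and base addresses are assumed stand-ins for the two hidden SGPR reads and the shader's getHiddenPrivateBase()/getScratchBase() above:

#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t flat_va             = 0x2000000100000040ULL; // in scratch APE
    const uint64_t scratch_base        = 0x2000000100000000ULL;
    const uint64_t hidden_private_base = 0x10000000ULL;
    const uint32_t size   = 0x100; // bytes of scratch per lane (assumed)
    const uint32_t offset = 0x0;   // this wave's offset (assumed)
    const int lane = 2;

    // Same per-lane rewrite as the SC_PRIVATE branch above.
    uint64_t seg_addr = flat_va + lane * (uint64_t)size + offset +
                        hidden_private_base - scratch_base;
    std::printf("lane %d -> %#llx\n", lane,
                (unsigned long long)seg_addr); // 0x10000240
    return 0;
}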
TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    return _staticInst->srcLiteral();
}

void
GPUDynInst::updateStats()
{
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory

@@ -536,3 +962,28 @@ GPUDynInst::updateStats()
        cu->dynamicGMemInstrCnt++;
    }
}

void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Only take the first measurement in the case of coalescing
    if (roundTripTime.size() > hopId)
        return;

    roundTripTime.push_back(currentTime);
}

void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    if (lineAddressTime.count(addr)) {
        if (lineAddressTime[addr].size() > hopId) {
            return;
        }

        lineAddressTime[addr].push_back(currentTime);
    } else if (hopId == 0) {
        auto addressTimeVec = std::vector<Tick> { currentTime };
        lineAddressTime.insert(std::make_pair(addr, addressTimeVec));
    }
}
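To make the hopId convention concrete, a small sketch of how a memory-path component might stamp an instruction at successive hops; the helper name and call site are hypothetical (in gem5 the actual callers sit on the coalescer/cache path), and the types are assumed from gpu_dyn_inst.hh:

// Hop 0: request leaves the compute unit; later hops are farther down the
// memory path. Each profile call records at most one tick per hop, so
// coalesced duplicates of the same line only log their first arrival.
void stampAtHop(GPUDynInstPtr inst, Addr lineAddr, int hopId)
{
    inst->profileRoundTripTime(curTick(), hopId);
    inst->profileLineAddressTime(lineAddr, curTick(), hopId);
}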
@@ -39,7 +39,6 @@

#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
@@ -68,20 +67,10 @@ class AtomicOpCAS : public TypedAtomicOpFunctor<T>
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
@@ -91,27 +80,51 @@ class GPUDynInst : public GPUExecContext
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int numDstVecOperands();
    int numSrcVecOperands();
    int numSrcVecDWORDs();
    int numDstVecDWORDs();
    int numOpdDWORDs(int operandIdx);
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    bool hasDestinationSgpr() const;
    bool hasSourceSgpr() const;
    bool hasDestinationVgpr() const;
    bool hasSourceVgpr() const;

    bool hasSgprRawDependence(GPUDynInstPtr s);
    bool hasVgprRawDependence(GPUDynInstPtr s);

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;
    // returns true if source operand at "index" is a vector register
    bool srcIsVgpr(int index) const;

    const std::string &disassemble() const;

    uint64_t seqNum() const;
    InstSeqNum seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to get written
    // vector data to get written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
@@ -119,19 +132,6 @@ class GPUDynInst : public GPUExecContext
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD where the WF of the memory instruction has been mapped to
    int simdId;
    // unique id of the WF where the memory instruction belongs to
@@ -140,21 +140,16 @@ class GPUDynInst : public GPUExecContext
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // The workgroup id of the requesting wf
    int wg_id;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    int execUnitId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
@@ -168,16 +163,23 @@ class GPUDynInst : public GPUExecContext

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

    bool isALU() const;
    bool isBranch() const;
    bool isCondBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isEndOfKernel() const;
    bool isKernelLaunch() const;
    bool isSDWAInst() const;
    bool isDPPInst() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemSync() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
@@ -188,10 +190,20 @@ class GPUDynInst : public GPUExecContext
    bool isAtomicRet() const;

    bool isScalar() const;
    bool isVector() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;
    bool readsEXEC() const;
    bool writesEXEC() const;
    bool readsMode() const;
    bool writesMode() const;
    bool ignoreExec() const;
    bool readsFlatScratch() const;
    bool writesFlatScratch() const;
    bool readsExecMask() const;
    bool writesExecMask() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
@@ -217,39 +229,25 @@ class GPUDynInst : public GPUExecContext
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
    bool isF16() const;
    bool isF32() const;
    bool isF64() const;

    // when true, call execContinuation when response arrives
    bool useContinuation;
    bool isFMA() const;
    bool isMAC() const;
    bool isMAD() const;

    // for FLAT memory ops. check the segment address
    // against the APE registers to see if it falls
    // within one of the APE ranges for LDS/SCRATCH/GPUVM.
    // if it does not fall into one of the three APEs, it
    // will be a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // Function to resolve flat accesses during the execution stage.
    void resolveFlatSegment(const VectorMask &mask);

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
@@ -282,62 +280,31 @@ class GPUDynInst : public GPUExecContext
    }

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    setRequestFlags(RequestPtr req) const
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync.
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be wbinv inst if not kernel launch/end
            req->setCacheCoherenceFlags(Request::ACQUIRE);
        }
    }
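A hedged sketch of a call site for the new setRequestFlags(); the Request construction details vary across gem5 versions, so treat the setVirt() arguments as illustrative rather than the model's exact code:

// GLC maps to Request::GLC_BIT (GPU-wide visibility), SLC to
// Request::SLC_BIT (system-wide visibility); atomic kind and memory-sync
// coherence flags are added as shown in the body above.
RequestPtr req = std::make_shared<Request>();
req->setVirt(0 /*asid*/, vaddr, size, 0 /*flags*/, masterId, pc);
gpuDynInst->setRequestFlags(req);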
    // Map returned packets and the addresses they satisfy with which lane they
@@ -348,12 +315,39 @@ class GPUDynInst : public GPUExecContext
    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
    { return lineAddressTime; }

    // inst used to save/restore a wavefront context
    bool isSaveRestore;
  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
    const InstSeqNum _seqNum;

    // the time the request was started
    Tick accessTime = -1;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

#endif // __GPU_DYN_INST_HH__

@@ -59,8 +59,8 @@ GPUExecContext::readMiscReg(int opIdx) const
}

void
GPUExecContext::writeMiscReg(int opIdx, RegVal operandVal)
GPUExecContext::writeMiscReg(int opIdx, RegVal val)
{
    assert(gpuISA);
    gpuISA->writeMiscReg(opIdx, operandVal);
    gpuISA->writeMiscReg(opIdx, val);
}

@@ -34,10 +34,10 @@
#include "gpu-compute/gpu_static_inst.hh"

GPUStaticInst::GPUStaticInst(const std::string &opcode)
    : executed_as(Enums::SC_NONE), opcode(opcode),
      _instNum(0), _instAddr(0)
    : executed_as(Enums::SC_NONE), _opcode(opcode),
      _instNum(0), _instAddr(0), srcVecOperands(-1), dstVecOperands(-1),
      srcVecDWORDs(-1), dstVecDWORDs(-1)
{
    setFlag(NoOrder);
}

const std::string&
@@ -50,3 +50,80 @@ GPUStaticInst::disassemble()

    return disassembly;
}

int
GPUStaticInst::numSrcVecOperands()
{
    if (srcVecOperands > -1)
        return srcVecOperands;

    srcVecOperands = 0;
    if (!isScalar()) {
        for (int k = 0; k < getNumOperands(); ++k) {
            if (isVectorRegister(k) && isSrcOperand(k))
                srcVecOperands++;
        }
    }
    return srcVecOperands;
}

int
GPUStaticInst::numDstVecOperands()
{
    if (dstVecOperands > -1)
        return dstVecOperands;

    dstVecOperands = 0;
    if (!isScalar()) {
        for (int k = 0; k < getNumOperands(); ++k) {
            if (isVectorRegister(k) && isDstOperand(k))
                dstVecOperands++;
        }
    }
    return dstVecOperands;
}

int
GPUStaticInst::numSrcVecDWORDs()
{
    if (srcVecDWORDs > -1) {
        return srcVecDWORDs;
    }

    srcVecDWORDs = 0;
    if (!isScalar()) {
        for (int i = 0; i < getNumOperands(); i++) {
            if (isVectorRegister(i) && isSrcOperand(i)) {
                int dwords = numOpdDWORDs(i);
                srcVecDWORDs += dwords;
            }
        }
    }
    return srcVecDWORDs;
}

int
GPUStaticInst::numDstVecDWORDs()
{
    if (dstVecDWORDs > -1) {
        return dstVecDWORDs;
    }

    dstVecDWORDs = 0;
    if (!isScalar()) {
        for (int i = 0; i < getNumOperands(); i++) {
            if (isVectorRegister(i) && isDstOperand(i)) {
                int dwords = numOpdDWORDs(i);
                dstVecDWORDs += dwords;
            }
        }
    }
    return dstVecDWORDs;
}

int
GPUStaticInst::numOpdDWORDs(int operandIdx)
{
    return getOperandSize(operandIdx) <= 4 ? 1
        : getOperandSize(operandIdx) / 4;
}
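A quick worked example of the Dword-count rule above, as a standalone sketch: operands of 4 bytes or less still occupy one full 32-bit Dword, while larger operands are counted in 4-byte units.

#include <cstdio>

// Same rounding rule as GPUStaticInst::numOpdDWORDs().
int opdDWORDs(int operandSizeBytes)
{
    return operandSizeBytes <= 4 ? 1 : operandSizeBytes / 4;
}

int main()
{
    std::printf("%d %d %d\n",
                opdDWORDs(2),  // 1: a 16-bit operand still takes one Dword
                opdDWORDs(4),  // 1: a 32-bit operand
                opdDWORDs(8)); // 2: a 64-bit operand
    return 0;
}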
@@ -59,6 +59,7 @@ class GPUStaticInst : public GPUStaticInstFlags
{
  public:
    GPUStaticInst(const std::string &opcode);
    virtual ~GPUStaticInst() { }
    void instAddr(int inst_addr) { _instAddr = inst_addr; }
    int instAddr() const { return _instAddr; }
    int nextInstAddr() const { return _instAddr + instSize(); }
@@ -71,15 +72,18 @@ class GPUStaticInst : public GPUStaticInstFlags

    int ipdInstNum() const { return _ipdInstNum; }

    virtual TheGpuISA::ScalarRegU32 srcLiteral() const { return 0; }

    virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
    virtual void generateDisassembly() = 0;
    const std::string& disassemble();
    virtual int getNumOperands() = 0;
    virtual bool isCondRegister(int operandIndex) = 0;
    virtual bool isScalarRegister(int operandIndex) = 0;
    virtual bool isVectorRegister(int operandIndex) = 0;
    virtual bool isSrcOperand(int operandIndex) = 0;
    virtual bool isDstOperand(int operandIndex) = 0;
    virtual bool isFlatScratchRegister(int opIdx) = 0;
    virtual bool isExecMaskRegister(int opIdx) = 0;
    virtual int getOperandSize(int operandIndex) = 0;

    virtual int getRegisterIndex(int operandIndex,
@@ -88,12 +92,24 @@ class GPUStaticInst : public GPUStaticInstFlags
    virtual int numDstRegOperands() = 0;
    virtual int numSrcRegOperands() = 0;

    virtual bool isValid() const = 0;
    virtual int coalescerTokenCount() const { return 0; }

    int numDstVecOperands();
    int numSrcVecOperands();
    int numDstVecDWORDs();
    int numSrcVecDWORDs();

    int numOpdDWORDs(int operandIdx);

    bool isALU() const { return _flags[ALU]; }
    bool isBranch() const { return _flags[Branch]; }
    bool isCondBranch() const { return _flags[CondBranch]; }
    bool isNop() const { return _flags[Nop]; }
    bool isReturn() const { return _flags[Return]; }
    bool isEndOfKernel() const { return _flags[EndOfKernel]; }
    bool isKernelLaunch() const { return _flags[KernelLaunch]; }
    bool isSDWAInst() const { return _flags[IsSDWA]; }
    bool isDPPInst() const { return _flags[IsDPP]; }

    bool
    isUnconditionalJump() const
@@ -105,7 +121,7 @@ class GPUStaticInst : public GPUStaticInstFlags
    bool isWaitcnt() const { return _flags[Waitcnt]; }

    bool isBarrier() const { return _flags[MemBarrier]; }
    bool isMemFence() const { return _flags[MemFence]; }
    bool isMemSync() const { return _flags[MemSync]; }
    bool isMemRef() const { return _flags[MemoryRef]; }
    bool isFlat() const { return _flags[Flat]; }
    bool isLoad() const { return _flags[Load]; }
@@ -125,6 +141,13 @@ class GPUStaticInst : public GPUStaticInstFlags
    bool writesSCC() const { return _flags[WritesSCC]; }
    bool readsVCC() const { return _flags[ReadsVCC]; }
    bool writesVCC() const { return _flags[WritesVCC]; }
    // Identify instructions that implicitly read the Execute mask
    // as a source operand but not to dictate which threads execute.
    bool readsEXEC() const { return _flags[ReadsEXEC]; }
    bool writesEXEC() const { return _flags[WritesEXEC]; }
    bool readsMode() const { return _flags[ReadsMode]; }
    bool writesMode() const { return _flags[WritesMode]; }
    bool ignoreExec() const { return _flags[IgnoreExec]; }

    bool isAtomicAnd() const { return _flags[AtomicAnd]; }
    bool isAtomicOr() const { return _flags[AtomicOr]; }
@@ -166,34 +189,29 @@ class GPUStaticInst : public GPUStaticInstFlags
    bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; }
    bool isSpillSeg() const { return _flags[SpillSegment]; }

    bool isWorkitemScope() const { return _flags[WorkitemScope]; }
    bool isWavefrontScope() const { return _flags[WavefrontScope]; }
    bool isWorkgroupScope() const { return _flags[WorkgroupScope]; }
    bool isDeviceScope() const { return _flags[DeviceScope]; }
    bool isSystemScope() const { return _flags[SystemScope]; }
    bool isNoScope() const { return _flags[NoScope]; }

    bool isRelaxedOrder() const { return _flags[RelaxedOrder]; }
    bool isAcquire() const { return _flags[Acquire]; }
    bool isRelease() const { return _flags[Release]; }
    bool isAcquireRelease() const { return _flags[AcquireRelease]; }
    bool isNoOrder() const { return _flags[NoOrder]; }

    /**
     * Coherence domain of a memory instruction. Only valid for
     * machine ISA. The coherence domain specifies where it is
     * possible to perform memory synchronization, e.g., acquire
     * or release, from the shader kernel.
     * Coherence domain of a memory instruction. The coherence domain
     * specifies where it is possible to perform memory synchronization
     * (e.g., acquire or release) from the shader kernel.
     *
     * isGloballyCoherent(): returns true if kernel is sharing memory
     * with other work-items on the same device (GPU)
     * isGloballyCoherent(): returns true if WIs share the same device
     * isSystemCoherent(): returns true if WIs or threads in different
     * devices share memory
     *
     * isSystemCoherent(): returns true if kernel is sharing memory
     * with other work-items on a different device (GPU) or the host (CPU)
     */
    bool isGloballyCoherent() const { return _flags[GloballyCoherent]; }
    bool isSystemCoherent() const { return _flags[SystemCoherent]; }

    // Floating-point instructions
    bool isF16() const { return _flags[F16]; }
    bool isF32() const { return _flags[F32]; }
    bool isF64() const { return _flags[F64]; }

    // FMA, MAC, MAD instructions
    bool isFMA() const { return _flags[FMA]; }
    bool isMAC() const { return _flags[MAC]; }
    bool isMAD() const { return _flags[MAD]; }

    virtual int instSize() const = 0;

    // only used for memory instructions
@@ -217,37 +235,36 @@ class GPUStaticInst : public GPUStaticInstFlags
    // For flat memory accesses
    Enums::StorageClassType executed_as;

    void setFlag(Flags flag) { _flags[flag] = true; }
    void setFlag(Flags flag) {
        _flags[flag] = true;

    virtual void
    execLdAcq(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execLdAcq() on a non-load instruction.\n");
    }

    virtual void
    execSt(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execSt() on a non-store instruction.\n");
    }

    virtual void
    execAtomic(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execAtomic() on a non-atomic instruction.\n");
    }

    virtual void
    execAtomicAcq(GPUDynInstPtr gpuDynInst)
    {
        fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
        if (isGroupSeg()) {
            executed_as = Enums::SC_GROUP;
        } else if (isGlobalSeg()) {
            executed_as = Enums::SC_GLOBAL;
        } else if (isPrivateSeg()) {
            executed_as = Enums::SC_PRIVATE;
        } else if (isSpillSeg()) {
            executed_as = Enums::SC_SPILL;
        } else if (isReadOnlySeg()) {
            executed_as = Enums::SC_READONLY;
        } else if (isKernArgSeg()) {
            executed_as = Enums::SC_KERNARG;
        } else if (isArgSeg()) {
            executed_as = Enums::SC_ARG;
        }
    }
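Illustrative usage of the new setFlag() behavior, grounded in the KernelLaunchStaticInst shown below: tagging a static instruction with its memory segment now also resolves its storage class eagerly, so only FLAT instructions still need the runtime aperture check. (The assert is a sketch, not code from this change.)

GPUStaticInst *inst = new KernelLaunchStaticInst(); // sets GlobalSegment
assert(inst->executed_as == Enums::SC_GLOBAL);      // resolved by setFlag()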
|
||||
const std::string& opcode() const { return _opcode; }
|
||||
|
||||
protected:
|
||||
const std::string opcode;
|
||||
const std::string _opcode;
|
||||
std::string disassembly;
|
||||
int _instNum;
|
||||
int _instAddr;
|
||||
int srcVecOperands;
|
||||
int dstVecOperands;
|
||||
int srcVecDWORDs;
|
||||
int dstVecDWORDs;
|
||||
/**
|
||||
* Identifier of the immediate post-dominator instruction.
|
||||
*/
|
||||
@@ -262,9 +279,9 @@ class KernelLaunchStaticInst : public GPUStaticInst
|
||||
KernelLaunchStaticInst() : GPUStaticInst("kernel_launch")
|
||||
{
|
||||
setFlag(Nop);
|
||||
setFlag(KernelLaunch);
|
||||
setFlag(MemSync);
|
||||
setFlag(Scalar);
|
||||
setFlag(Acquire);
|
||||
setFlag(SystemScope);
|
||||
setFlag(GlobalSegment);
|
||||
}
|
||||
|
||||
@@ -277,11 +294,14 @@ class KernelLaunchStaticInst : public GPUStaticInst
|
||||
void
|
||||
generateDisassembly() override
|
||||
{
|
||||
disassembly = opcode;
|
||||
disassembly = _opcode;
|
||||
}
|
||||
|
||||
int getNumOperands() override { return 0; }
|
||||
bool isCondRegister(int operandIndex) override { return false; }
|
||||
bool isFlatScratchRegister(int opIdx) override { return false; }
|
||||
// return true if the Execute mask is explicitly used as a source
|
||||
// register operand
|
||||
bool isExecMaskRegister(int opIdx) override { return false; }
|
||||
bool isScalarRegister(int operandIndex) override { return false; }
|
||||
bool isVectorRegister(int operandIndex) override { return false; }
|
||||
bool isSrcOperand(int operandIndex) override { return false; }
|
||||
@@ -296,7 +316,6 @@ class KernelLaunchStaticInst : public GPUStaticInst
|
||||
|
||||
int numDstRegOperands() override { return 0; }
|
||||
int numSrcRegOperands() override { return 0; }
|
||||
bool isValid() const override { return true; }
|
||||
int instSize() const override { return 0; }
|
||||
};
|
||||
|
||||
|
||||
@@ -74,7 +74,6 @@ namespace X86ISA
        allocationPolicy = p->allocationPolicy;
        hasMemSidePort = false;
        accessDistance = p->accessDistance;
        clock = p->clk_domain->clockPeriod();

        tlb.assign(size, TlbEntry());

@@ -624,8 +623,8 @@ namespace X86ISA
    {
        bool delayedResponse;

        return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
                                 latency);
        return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse,
                                 false, latency);
    }

    void
@@ -803,13 +802,13 @@ namespace X86ISA
    }

    /*
     * We now know the TLB lookup outcome (if it's a hit or a miss), as well
     * as the TLB access latency.
     * We now know the TLB lookup outcome (if it's a hit or a miss), as
     * well as the TLB access latency.
     *
     * We create and schedule a new TLBEvent which will help us take the
     * appropriate actions (e.g., update TLB on a hit, send request to lower
     * level TLB on a miss, or start a page walk if this was the last-level
     * TLB)
     * appropriate actions (e.g., update TLB on a hit, send request to
     * lower level TLB on a miss, or start a page walk if this was the
     * last-level TLB)
     */
    TLBEvent *tlb_event =
        new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
@@ -823,15 +822,15 @@ namespace X86ISA
    assert(tlb_event);

    DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
            curTick() + this->ticks(hitLatency));
            curTick() + cyclesToTicks(Cycles(hitLatency)));

    schedule(tlb_event, curTick() + this->ticks(hitLatency));
    schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency)));
}

GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
        PacketPtr _pkt)
    : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
      outcome(tlb_outcome), pkt(_pkt)
GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr,
    tlbOutcome tlb_outcome, PacketPtr _pkt)
        : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
          outcome(tlb_outcome), pkt(_pkt)
{
}

@@ -848,7 +847,8 @@ namespace X86ISA
    bool storeCheck = flags & (StoreCheck << FlagShift);

    // Do paging protection checks.
    bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
    bool inUser
        = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
    CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);

    bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
@@ -874,10 +874,9 @@ namespace X86ISA
     * The latter calls handleHit with TLB miss as tlbOutcome.
     */
    void
    GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
            PacketPtr pkt)
    GpuTLB::handleTranslationReturn(Addr virt_page_addr,
        tlbOutcome tlb_outcome, PacketPtr pkt)
    {

        assert(pkt);
        Addr vaddr = pkt->req->getVaddr();

@@ -890,15 +889,18 @@ namespace X86ISA
    TlbEntry *local_entry, *new_entry;

    if (tlb_outcome == TLB_HIT) {
        DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
        DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
                vaddr);
        local_entry = sender_state->tlbEntry;
    } else {
        DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
                vaddr);

        // We are returning either from a page walk or from a hit at a lower
        // TLB level. The senderState should be "carrying" a pointer to the
        // correct TLBEntry.
        /**
         * We are returning either from a page walk or from a hit at a
         * lower TLB level. The senderState should be "carrying" a pointer
         * to the correct TLBEntry.
         */
        new_entry = sender_state->tlbEntry;
        assert(new_entry);
        local_entry = new_entry;
@@ -1024,7 +1026,8 @@ namespace X86ISA
        TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
        assert(tlb_event);
        tlb_event->updateOutcome(PAGE_WALK);
        schedule(tlb_event, curTick() + ticks(missLatency2));
        schedule(tlb_event,
                 curTick() + cyclesToTicks(Cycles(missLatency2)));
    }
} else if (outcome == PAGE_WALK) {
    if (update_stats)
@@ -1095,7 +1098,7 @@ namespace X86ISA
    return virtPageAddr;
}

/*
/**
 * recvTiming receives a coalesced timing request from a TLBCoalescer
 * and it calls issueTLBLookup()
 * It only rejects the packet if we have exceeded the max
@@ -1145,9 +1148,11 @@ namespace X86ISA
    DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
            "%#x\n", vaddr);

    // We are returning either from a page walk or from a hit at a lower
    // TLB level. The senderState should be "carrying" a pointer to the
    // correct TLBEntry.
    /**
     * We are returning either from a page walk or from a hit at a
     * lower TLB level. The senderState should be "carrying" a pointer
     * to the correct TLBEntry.
     */
    new_entry = sender_state->tlbEntry;
    assert(new_entry);
    local_entry = new_entry;
@@ -1267,8 +1272,8 @@ namespace X86ISA
    } else {
        // If this was a prefetch, then do the normal thing if it
        // was a successful translation. Otherwise, send an empty
        // TLB entry back so that it can be figured out as empty and
        // handled accordingly.
        // TLB entry back so that it can be figured out as empty
        // and handled accordingly.
        if (pte) {
            DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
                    pte->paddr);
@@ -1343,7 +1348,7 @@ namespace X86ISA
    assert(virt_page_addr == tlb_event->getTLBEventVaddr());

    tlb_event->updateOutcome(MISS_RETURN);
    tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
    tlb->schedule(tlb_event, curTick()+tlb->clockPeriod());

    return true;
}
@@ -1393,8 +1398,8 @@ namespace X86ISA
    tmp_access_info.sumDistance = 0;
    tmp_access_info.meanDistance = 0;

    ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
                              tmp_access_info));
    ret = TLBFootprint.insert(
        AccessPatternTable::value_type(virt_page_addr, tmp_access_info));

    bool first_page_access = ret.second;

@@ -1428,74 +1433,74 @@ namespace X86ISA
    page_stat_file = simout.create(name().c_str())->stream();

    // print header
    *page_stat_file << "page,max_access_distance,mean_access_distance, "
                    << "stddev_distance" << std::endl;
    *page_stat_file
        << "page,max_access_distance,mean_access_distance, "
        << "stddev_distance" << std::endl;
}

// update avg. reuse distance footprint
AccessPatternTable::iterator iter, iter_begin, iter_end;
unsigned int sum_avg_reuse_distance_per_page = 0;

// iterate through all pages seen by this TLB
for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
    sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
        iter->second.accessesPerPage;
for (auto &iter : TLBFootprint) {
    sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance /
        iter.second.accessesPerPage;

    if (accessDistance) {
        unsigned int tmp = iter->second.localTLBAccesses[0];
        unsigned int tmp = iter.second.localTLBAccesses[0];
        unsigned int prev = tmp;

        for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
        for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
            if (i) {
                tmp = prev + 1;
            }

            prev = iter->second.localTLBAccesses[i];
            prev = iter.second.localTLBAccesses[i];
            // update the localTLBAccesses value
            // with the actual difference
            iter->second.localTLBAccesses[i] -= tmp;
            iter.second.localTLBAccesses[i] -= tmp;
            // compute the sum of AccessDistance per page
            // used later for mean
            iter->second.sumDistance +=
                iter->second.localTLBAccesses[i];
            iter.second.sumDistance +=
                iter.second.localTLBAccesses[i];
        }

        iter->second.meanDistance =
            iter->second.sumDistance / iter->second.accessesPerPage;
        iter.second.meanDistance =
            iter.second.sumDistance / iter.second.accessesPerPage;

        // compute std_dev and max (we need a second round because we
        // need to know the mean value)
        unsigned int max_distance = 0;
        unsigned int stddev_distance = 0;

        for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
        for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
            unsigned int tmp_access_distance =
                iter->second.localTLBAccesses[i];
                iter.second.localTLBAccesses[i];

            if (tmp_access_distance > max_distance) {
                max_distance = tmp_access_distance;
            }

            unsigned int diff =
                tmp_access_distance - iter->second.meanDistance;
                tmp_access_distance - iter.second.meanDistance;
            stddev_distance += pow(diff, 2);

        }

        stddev_distance =
            sqrt(stddev_distance/iter->second.accessesPerPage);
            sqrt(stddev_distance/iter.second.accessesPerPage);

        if (page_stat_file) {
            *page_stat_file << std::hex << iter->first << ",";
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << max_distance << ",";
            *page_stat_file << std::dec << iter->second.meanDistance
            *page_stat_file << std::dec << iter.second.meanDistance
                            << ",";
            *page_stat_file << std::dec << stddev_distance;
            *page_stat_file << std::endl;
        }

        // erase the localTLBAccesses array
        iter->second.localTLBAccesses.clear();
        iter.second.localTLBAccesses.clear();
    }
}


@@ -69,26 +69,7 @@ namespace X86ISA

    uint32_t configAddress;

    // TLB clock: will inherit clock from shader's clock period in terms
    // of number of ticks of curTick (aka global simulation clock)
    // The assignment of TLB clock from shader clock is done in the python
    // config files.
    int clock;

  public:
    // clock related functions; maps to-and-from simulation ticks and
    // object clocks.
    Tick frequency() const { return SimClock::Frequency / clock; }

    Tick
    ticks(int numCycles) const
    {
        return (Tick)clock * numCycles;
    }

    Tick curCycle() const { return curTick() / clock; }
    Tick tickToCycles(Tick val) const { return val / clock;}

    typedef X86GPUTLBParams Params;
    GpuTLB(const Params *p);
    ~GpuTLB();

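Throughout this file the hand-rolled GpuTLB::ticks() helper gives way to the Clocked interface's cyclesToTicks(). A stand-alone sketch of what that conversion does, with a made-up fixed clock period (the real object derives its period from a clock domain, so these names and values are illustrative only):

#include <cassert>
#include <cstdint>

using Tick = std::uint64_t;

struct ClockedSketch
{
    Tick period;  // ticks per cycle; e.g., 500 ticks at a 1 ps tick for 2 GHz

    // What the old GpuTLB::ticks(int) computed by hand, and what the
    // ClockedObject's cyclesToTicks(Cycles) provides for free.
    Tick cyclesToTicks(std::uint64_t cycles) const { return period * cycles; }
};

int main()
{
    ClockedSketch tlb{500};
    // mirrors: schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency)))
    assert(tlb.cyclesToTicks(4) == 2000);
}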
467
src/gpu-compute/hsa_queue_entry.hh
Normal file
@@ -0,0 +1,467 @@
/*
 * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

/**
 * @file
 * HSAQueueEntry is the simulator's internal representation of an
 * AQL queue entry (task). It encapsulates all of the relevant info
 * about a task, which is gathered from various runtime data
 * structures including: the AQL MQD, the AQL packet, and the code
 * object.
 */

#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__

#include <array>
#include <bitset>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "gpu-compute/kernel_code.hh"

class HSAQueueEntry
{
  public:
    HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
                  int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
                  Addr host_pkt_addr, Addr code_addr)
        : kernName(kernel_name),
          _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
          _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
          numVgprs(akc->workitem_vgpr_count),
          numSgprs(akc->wavefront_sgpr_count),
          _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
          _hostDispPktAddr(host_pkt_addr),
          _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
                            ->completion_signal),
          codeAddress(code_addr),
          kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
          _outstandingInvs(-1), _outstandingWbs(0),
          _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                   group_segment_size),
          _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                          private_segment_size),
          _contextId(0), _wgId{{ 0, 0, 0 }},
          _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
          _globalWgId(0), dispatchComplete(false)
    {
        initialVgprState.reset();
        initialSgprState.reset();

        for (int i = 0; i < MAX_DIM; ++i) {
            _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
            _numWgTotal *= _numWg[i];
        }

        parseKernelCode(akc);
    }

    const std::string&
    kernelName() const
    {
        return kernName;
    }

    int
    wgSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgSize[dim];
    }

    int
    gridSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _gridSize[dim];
    }

    int
    numVectorRegs() const
    {
        return numVgprs;
    }

    int
    numScalarRegs() const
    {
        return numSgprs;
    }

    uint32_t
    queueId() const
    {
        return _queueId;
    }

    int
    dispatchId() const
    {
        return _dispatchId;
    }

    void*
    dispPktPtr()
    {
        return dispPkt;
    }

    Addr
    hostDispPktAddr() const
    {
        return _hostDispPktAddr;
    }

    Addr
    completionSignal() const
    {
        return _completionSignal;
    }

    Addr
    codeAddr() const
    {
        return codeAddress;
    }

    Addr
    kernargAddr() const
    {
        return kernargAddress;
    }

    int
    ldsSize() const
    {
        return _ldsSize;
    }

    int privMemPerItem() const { return _privMemPerItem; }

    int
    contextId() const
    {
        return _contextId;
    }

    bool
    dispComplete() const
    {
        return dispatchComplete;
    }

    int
    wgId(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgId[dim];
    }

    void
    wgId(int dim, int val)
    {
        assert(dim < MAX_DIM);
        _wgId[dim] = val;
    }

    int
    globalWgId() const
    {
        return _globalWgId;
    }

    void
    globalWgId(int val)
    {
        _globalWgId = val;
    }

    int
    numWg(int dim) const
    {
        assert(dim < MAX_DIM);
        return _numWg[dim];
    }

    void
    notifyWgCompleted()
    {
        ++_numWgCompleted;
    }

    int
    numWgCompleted() const
    {
        return _numWgCompleted;
    }

    int
    numWgTotal() const
    {
        return _numWgTotal;
    }

    void
    markWgDispatch()
    {
        ++_wgId[0];
        ++_globalWgId;

        if (wgId(0) * wgSize(0) >= gridSize(0)) {
            _wgId[0] = 0;
            ++_wgId[1];

            if (wgId(1) * wgSize(1) >= gridSize(1)) {
                _wgId[1] = 0;
                ++_wgId[2];

                if (wgId(2) * wgSize(2) >= gridSize(2)) {
                    dispatchComplete = true;
                }
            }
        }
    }

    int
    numWgAtBarrier() const
    {
        return numWgArrivedAtBarrier;
    }

    bool vgprBitEnabled(int bit) const
    {
        return initialVgprState.test(bit);
    }

    bool sgprBitEnabled(int bit) const
    {
        return initialSgprState.test(bit);
    }

    /**
     * Host-side addr of the amd_queue_t on which
     * this task was queued.
     */
    Addr hostAMDQueueAddr;

    /**
     * Keep a copy of the AMD HSA queue because we
     * need info from some of its fields to initialize
     * register state.
     */
    _amd_queue_t amdQueue;

    // the maximum number of dimensions for a grid or workgroup
    const static int MAX_DIM = 3;

    /* getter */
    int
    outstandingInvs() {
        return _outstandingInvs;
    }

    /**
     * Whether invalidate has started or finished. -1 is the
     * initial value indicating inv has not started for the
     * kernel.
     */
    bool
    isInvStarted()
    {
        return (_outstandingInvs != -1);
    }

    /**
     * update the number of pending invalidate requests
     *
     * val: negative to decrement, positive to increment
     */
    void
    updateOutstandingInvs(int val)
    {
        _outstandingInvs += val;
        assert(_outstandingInvs >= 0);
    }

    /**
     * Forcefully change the state to be inv done.
     */
    void
    markInvDone()
    {
        _outstandingInvs = 0;
    }

    /**
     * Is invalidate done?
     */
    bool
    isInvDone() const
    {
        assert(_outstandingInvs >= 0);
        return (_outstandingInvs == 0);
    }

    int
    outstandingWbs() const
    {
        return _outstandingWbs;
    }

    /**
     * Update the number of pending writeback requests.
     *
     * val: negative to decrement, positive to increment
     */
    void
    updateOutstandingWbs(int val)
    {
        _outstandingWbs += val;
        assert(_outstandingWbs >= 0);
    }

  private:
    void
    parseKernelCode(AMDKernelCode *akc)
    {
        /** set the enable bits for the initial SGPR state */
        initialSgprState.set(PrivateSegBuf,
            akc->enable_sgpr_private_segment_buffer);
        initialSgprState.set(DispatchPtr,
            akc->enable_sgpr_dispatch_ptr);
        initialSgprState.set(QueuePtr,
            akc->enable_sgpr_queue_ptr);
        initialSgprState.set(KernargSegPtr,
            akc->enable_sgpr_kernarg_segment_ptr);
        initialSgprState.set(DispatchId,
            akc->enable_sgpr_dispatch_id);
        initialSgprState.set(FlatScratchInit,
            akc->enable_sgpr_flat_scratch_init);
        initialSgprState.set(PrivateSegSize,
            akc->enable_sgpr_private_segment_size);
        initialSgprState.set(GridWorkgroupCountX,
            akc->enable_sgpr_grid_workgroup_count_x);
        initialSgprState.set(GridWorkgroupCountY,
            akc->enable_sgpr_grid_workgroup_count_y);
        initialSgprState.set(GridWorkgroupCountZ,
            akc->enable_sgpr_grid_workgroup_count_z);
        initialSgprState.set(WorkgroupIdX,
            akc->enable_sgpr_workgroup_id_x);
        initialSgprState.set(WorkgroupIdY,
            akc->enable_sgpr_workgroup_id_y);
        initialSgprState.set(WorkgroupIdZ,
            akc->enable_sgpr_workgroup_id_z);
        initialSgprState.set(WorkgroupInfo,
            akc->enable_sgpr_workgroup_info);
        initialSgprState.set(PrivSegWaveByteOffset,
            akc->enable_sgpr_private_segment_wave_byte_offset);

        /**
         * set the enable bits for the initial VGPR state. the
         * workitem Id in the X dimension is always initialized.
         */
        initialVgprState.set(WorkitemIdX, true);
        initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y);
        initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z);
    }

    // name of the kernel associated with the AQL entry
    std::string kernName;
    // workgroup Size (3 dimensions)
    std::array<int, MAX_DIM> _wgSize;
    // grid Size (3 dimensions)
    std::array<int, MAX_DIM> _gridSize;
    // total number of VGPRs per work-item
    int numVgprs;
    // total number of SGPRs per wavefront
    int numSgprs;
    // id of AQL queue in which this entry is placed
    uint32_t _queueId;
    int _dispatchId;
    // raw AQL packet pointer
    void *dispPkt;
    // host-side addr of the dispatch packet
    Addr _hostDispPktAddr;
    // pointer to bool
    Addr _completionSignal;
    // base address of the raw machine code
    Addr codeAddress;
    // base address of the kernel args
    Addr kernargAddress;
    /**
     * Number of outstanding invs for the kernel.
     * values:
     *  -1: initial value, invalidate has not started for the kernel
     *   0: 1) -1->0, about to start (a transient state, added in the
     *         same cycle)
     *      2) +1->0, all inv requests are finished, i.e., invalidate done
     *   ?: positive value, indicating the number of pending inv requests
     */
    int _outstandingInvs;
    /**
     * Number of outstanding wbs for the kernel
     * values:
     *   0: 1) initial value, flush has not started for the kernel
     *      2) +1->0: all wb requests are finished, i.e., flush done
     *   ?: positive value, indicating the number of pending wb requests
     */
    int _outstandingWbs;
    int _ldsSize;
    int _privMemPerItem;
    int _contextId;
    std::array<int, MAX_DIM> _wgId;
    std::array<int, MAX_DIM> _numWg;
    int _numWgTotal;
    int numWgArrivedAtBarrier;
    // The number of completed work groups
    int _numWgCompleted;
    int _globalWgId;
    bool dispatchComplete;

    std::bitset<NumVectorInitFields> initialVgprState;
    std::bitset<NumScalarInitFields> initialSgprState;
};

#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__
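A quick stand-alone check of the dispatch arithmetic in HSAQueueEntry above (the grid and workgroup sizes here are made up for illustration): _numWg[i] is the per-dimension ceiling division of grid size by workgroup size, and markWgDispatch() walks the X dimension fastest, then Y, then Z.

#include <array>
#include <cassert>

constexpr int MAX_DIM = 3;

// same ceiling division gem5's divCeil() performs
int divCeil(int a, int b) { assert(b > 0); return (a + b - 1) / b; }

int main()
{
    std::array<int, MAX_DIM> grid{{256, 100, 1}};
    std::array<int, MAX_DIM> wg{{64, 64, 1}};

    int numWgTotal = 1;
    for (int i = 0; i < MAX_DIM; ++i)
        numWgTotal *= divCeil(grid[i], wg[i]);  // 4 * 2 * 1

    // the partially filled Y slab still counts as a full workgroup
    assert(numWgTotal == 8);
}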
193
src/gpu-compute/kernel_code.hh
Normal file
@@ -0,0 +1,193 @@
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_COMPUTE_KERNEL_CODE_HH__
#define __GPU_COMPUTE_KERNEL_CODE_HH__

#include <bitset>
#include <cstdint>

/**
 * these enums represent the indices into the
 * initialRegState bitfields in HsaKernelInfo.
 * each bit specifies whether or not the
 * particular piece of state that the bit
 * corresponds to should be initialized into
 * the VGPRs/SGPRs. the order in which the
 * fields are placed matters, as all enabled
 * pieces of state will be initialized into
 * contiguous registers in the same order
 * as their position in the bitfield - which
 * is specified in the HSA ABI.
 */
enum ScalarRegInitFields : int
{
    PrivateSegBuf = 0,
    DispatchPtr = 1,
    QueuePtr = 2,
    KernargSegPtr = 3,
    DispatchId = 4,
    FlatScratchInit = 5,
    PrivateSegSize = 6,
    GridWorkgroupCountX = 7,
    GridWorkgroupCountY = 8,
    GridWorkgroupCountZ = 9,
    WorkgroupIdX = 10,
    WorkgroupIdY = 11,
    WorkgroupIdZ = 12,
    WorkgroupInfo = 13,
    PrivSegWaveByteOffset = 14,
    NumScalarInitFields = 15
};

enum VectorRegInitFields : int
{
    WorkitemIdX = 0,
    WorkitemIdY = 1,
    WorkitemIdZ = 2,
    NumVectorInitFields = 3
};

struct AMDKernelCode
{
    uint32_t amd_kernel_code_version_major;
    uint32_t amd_kernel_code_version_minor;
    uint16_t amd_machine_kind;
    uint16_t amd_machine_version_major;
    uint16_t amd_machine_version_minor;
    uint16_t amd_machine_version_stepping;
    int64_t kernel_code_entry_byte_offset;
    int64_t kernel_code_prefetch_byte_offset;
    uint64_t kernel_code_prefetch_byte_size;
    uint64_t max_scratch_backing_memory_byte_size;

    /**
     * The fields below are used to set program settings for
     * compute shaders. Here they are primarily used to setup
     * initial register state. See the following for full details
     * about kernel launch, state initialization, and the AMD kernel
     * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
     * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
     * #initial-kernel-register-state
     */

    // the 32b below here represent the fields of
    // the COMPUTE_PGM_RSRC1 register
    uint32_t granulated_workitem_vgpr_count : 6;
    uint32_t granulated_wavefront_sgpr_count : 4;
    uint32_t priority : 2;
    uint32_t float_mode_round_32 : 2;
    uint32_t float_mode_round_16_64 : 2;
    uint32_t float_mode_denorm_32 : 2;
    uint32_t float_mode_denorm_16_64 : 2;
    uint32_t priv : 1;
    uint32_t enable_dx10_clamp : 1;
    uint32_t debug_mode : 1;
    uint32_t enable_ieee_mode : 1;
    uint32_t bulky : 1;
    uint32_t cdbg_user : 1;
    uint32_t compute_pgm_rsrc1_reserved : 6;
    // end COMPUTE_PGM_RSRC1 register

    // the 32b below here represent the fields of
    // the COMPUTE_PGM_RSRC2 register
    uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
    uint32_t user_sgpr_count : 5;
    uint32_t enable_trap_handler : 1;
    uint32_t enable_sgpr_workgroup_id_x : 1;
    uint32_t enable_sgpr_workgroup_id_y : 1;
    uint32_t enable_sgpr_workgroup_id_z : 1;
    uint32_t enable_sgpr_workgroup_info : 1;
    uint32_t enable_vgpr_workitem_id_y : 1;
    uint32_t enable_vgpr_workitem_id_z : 1;
    uint32_t enable_exception_address_watch : 1;
    uint32_t enable_exception_memory_violation : 1;
    uint32_t granulated_lds_size : 9;
    uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
    uint32_t enable_exception_fp_denormal_source : 1;
    uint32_t enable_exception_ieee_754_fp_division_by_zero : 1;
    uint32_t enable_exception_ieee_754_fp_overflow : 1;
    uint32_t enable_exception_ieee_754_fp_underflow : 1;
    uint32_t enable_exception_ieee_754_fp_inexact : 1;
    uint32_t enable_exception_int_divide_by_zero : 1;
    uint32_t compute_pgm_rsrc2_reserved : 1;
    // end COMPUTE_PGM_RSRC2

    // the 32b below here represent the fields of
    // KERNEL_CODE_PROPERTIES
    uint32_t enable_sgpr_private_segment_buffer : 1;
    uint32_t enable_sgpr_dispatch_ptr : 1;
    uint32_t enable_sgpr_queue_ptr : 1;
    uint32_t enable_sgpr_kernarg_segment_ptr : 1;
    uint32_t enable_sgpr_dispatch_id : 1;
    uint32_t enable_sgpr_flat_scratch_init : 1;
    uint32_t enable_sgpr_private_segment_size : 1;
    uint32_t enable_sgpr_grid_workgroup_count_x : 1;
    uint32_t enable_sgpr_grid_workgroup_count_y : 1;
    uint32_t enable_sgpr_grid_workgroup_count_z : 1;
    uint32_t kernel_code_properties_reserved1 : 6;
    uint32_t enable_ordered_append_gds : 1;
    uint32_t private_element_size : 2;
    uint32_t is_ptr64 : 1;
    uint32_t is_dynamic_callstack : 1;
    uint32_t is_debug_enabled : 1;
    uint32_t is_xnack_enabled : 1;
    uint32_t kernel_code_properties_reserved2 : 9;
    // end KERNEL_CODE_PROPERTIES

    uint32_t workitem_private_segment_byte_size;
    uint32_t workgroup_group_segment_byte_size;
    uint32_t gds_segment_byte_size;
    uint64_t kernarg_segment_byte_size;
    uint32_t workgroup_fbarrier_count;
    uint16_t wavefront_sgpr_count;
    uint16_t workitem_vgpr_count;
    uint16_t reserved_vgpr_first;
    uint16_t reserved_vgpr_count;
    uint16_t reserved_sgpr_first;
    uint16_t reserved_sgpr_count;
    uint16_t debug_wavefront_private_segment_offset_sgpr;
    uint16_t debug_private_segment_buffer_sgpr;
    uint8_t kernarg_segment_alignment;
    uint8_t group_segment_alignment;
    uint8_t private_segment_alignment;
    uint8_t wavefront_size;
    int32_t call_convention;
    uint8_t reserved[12];
    uint64_t runtime_loader_kernel_symbol;
    uint64_t control_directives[16];
};

#endif // __GPU_COMPUTE_KERNEL_CODE_HH__
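A sketch of the ordering contract the comment in kernel_code.hh describes: enabled init fields map to contiguous registers in enum order. The enum here is abbreviated and the one-register-per-field assumption is a simplification (some fields, e.g. the private segment buffer, occupy several SGPRs in the real ABI).

#include <bitset>
#include <cassert>
#include <vector>

enum ScalarRegInitFields : int
{
    PrivateSegBuf = 0,
    DispatchPtr = 1,
    QueuePtr = 2,
    NumScalarInitFields = 15
};

// walk the enable bits in enum order, handing out consecutive registers
std::vector<int>
assignSgprs(const std::bitset<NumScalarInitFields> &enabled)
{
    std::vector<int> regOf(NumScalarInitFields, -1);
    int nextReg = 0;
    for (int f = 0; f < NumScalarInitFields; ++f) {
        if (enabled.test(f))
            regOf[f] = nextReg++;
    }
    return regOf;
}

int main()
{
    std::bitset<NumScalarInitFields> en;
    en.set(DispatchPtr);
    en.set(QueuePtr);
    auto regs = assignSgprs(en);
    assert(regs[DispatchPtr] == 0 && regs[QueuePtr] == 1);
}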
@@ -210,8 +210,8 @@ LdsState::processPacket(PacketPtr packet)
        parent->loadBusLength();
    // delay for accessing the LDS
    Tick processingTime =
        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
        parent->shader->ticks(busLength);
        parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
        parent->cyclesToTicks(Cycles(busLength));
    // choose (delay + last packet in queue) or (now + delay) as the time to
    // return this
    Tick doneAt = earliestReturnTime() + processingTime;

@@ -41,7 +41,6 @@
#include <utility>
#include <vector>

#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
@@ -50,8 +49,8 @@
class ComputeUnit;

/**
 * this represents a slice of the overall LDS, intended to be associated with an
 * individual workgroup
 * this represents a slice of the overall LDS, intended to be associated with
 * an individual workgroup
 */
class LdsChunk
{
@@ -71,7 +70,8 @@ class LdsChunk
    read(const uint32_t index)
    {
        fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
                 "chunk");
        T *p0 = (T *) (&(chunk.at(index)));
        return *p0;
    }
@@ -84,7 +84,8 @@ class LdsChunk
    write(const uint32_t index, const T value)
    {
        fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
        fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
                 "chunk");
        T *p0 = (T *) (&(chunk.at(index)));
        *p0 = value;
    }
@@ -203,14 +204,16 @@ class LdsState: public ClockedObject

  protected:

    // the lds reference counter
    // The key is the workgroup ID and dispatch ID
    // The value is the number of wavefronts that reference this LDS, as
    // wavefronts are launched, the counter goes up for that workgroup and when
    // they return it decreases, once it reaches 0 then this chunk of the LDS is
    // returned to the available pool. However, it is deallocated on the 1->0
    // transition, not whenever the counter is 0 as it always starts with 0 when
    // the workgroup asks for space
    /**
     * the lds reference counter
     * The key is the workgroup ID and dispatch ID
     * The value is the number of wavefronts that reference this LDS, as
     * wavefronts are launched, the counter goes up for that workgroup and
     * when they return it decreases, once it reaches 0 then this chunk of
     * the LDS is returned to the available pool. However, it is deallocated
     * on the 1->0 transition, not whenever the counter is 0 as it always
     * starts with 0 when the workgroup asks for space
     */
    std::unordered_map<uint32_t,
                       std::unordered_map<uint32_t, int32_t>> refCounter;

@@ -356,22 +359,41 @@ class LdsState: public ClockedObject
                 const uint32_t size)
    {
        if (chunkMap.find(dispatchId) != chunkMap.end()) {
            fatal_if(
            panic_if(
                chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
                "duplicate workgroup ID asking for space in the LDS "
                "did[%d] wgid[%d]", dispatchId, wgId);
        }

        fatal_if(bytesAllocated + size > maximumSize,
                 "request would ask for more space than is available");
        if (bytesAllocated + size > maximumSize) {
            return nullptr;
        } else {
            bytesAllocated += size;

        bytesAllocated += size;
            auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
            panic_if(!value.second, "was unable to allocate a new chunkMap");

        chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
            // make an entry for this workgroup
            refCounter[dispatchId][wgId] = 0;
        // make an entry for this workgroup
        refCounter[dispatchId][wgId] = 0;

            return &chunkMap[dispatchId][wgId];
        return &chunkMap[dispatchId][wgId];
        }
    }

    /*
     * return pointer to lds chunk for wgid
     */
    LdsChunk *
    getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
    {
        fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
                 "fetch for unknown dispatch ID did[%d]", dispatchId);

        fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
                 "fetch for unknown workgroup ID wgid[%d] in dispatch ID "
                 "did[%d]", wgId, dispatchId);

        return &chunkMap[dispatchId][wgId];
    }

    bool
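The behavioral change worth noting in reserveSpace() above: an over-subscribed LDS now returns nullptr instead of terminating the simulation, so callers have to check the result. A stand-alone sketch of the new contract (the classes and sizes here are stubs, not the real LdsState):

#include <cstdint>
#include <iostream>

struct LdsChunk { };

struct LdsSketch
{
    uint32_t bytesAllocated = 0;
    uint32_t maximumSize = 65536;  // assumed 64 KiB LDS
    LdsChunk chunk;

    LdsChunk *reserveSpace(uint32_t size)
    {
        if (bytesAllocated + size > maximumSize)
            return nullptr;  // full: caller must defer and retry
        bytesAllocated += size;
        return &chunk;
    }
};

int main()
{
    LdsSketch lds;
    if (!lds.reserveSpace(128 * 1024))
        std::cout << "defer workgroup: LDS full\n";
}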
@@ -33,6 +33,7 @@

#include "gpu-compute/local_memory_pipeline.hh"

#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
@@ -62,24 +63,31 @@ LocalMemPipeline::exec()
    bool accessVrf = true;
    Wavefront *w = nullptr;

    if ((m) && (m->isLoad() || m->isAtomicRet())) {
    if ((m) && m->latency.rdy() && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf =
            w->computeUnit->vrf[w->simdId]->
            vrfOperandAccessReady(m->seqNum(), w, m,
                                  VrfAccessType::WRITE);
        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);

    }

    if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
        computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
        || computeUnit->wfWait.at(m->pipeId).rdy())) {
        computeUnit->locMemToVrfBus.rdy()
        && (computeUnit->shader->coissue_return
        || computeUnit->vectorSharedMemUnit.rdy())) {

        lmReturnedRequests.pop();
        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        // Decrement outstanding request count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);

@@ -96,7 +104,7 @@ LocalMemPipeline::exec()
        // Mark write bus busy for appropriate amount of time
        computeUnit->locMemToVrfBus.set(m->time);
        if (computeUnit->shader->coissue_return == 0)
            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
            w->computeUnit->vectorSharedMemUnit.set(m->time);
    }

    // If pipeline has executed a local memory instruction
@@ -114,6 +122,13 @@ LocalMemPipeline::exec()
    }
}

void
LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    gpuDynInst->setAccessTime(curTick());
    lmIssuedRequests.push(gpuDynInst);
}

void
LocalMemPipeline::regStats()
{

@@ -58,10 +58,11 @@ class LocalMemPipeline
    LocalMemPipeline(const ComputeUnitParams *params);
    void init(ComputeUnit *cu);
    void exec();

    std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
    std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }

    void issueRequest(GPUDynInstPtr gpuDynInst);


    bool
    isLMRespFIFOWrRdy() const
    {
@@ -39,34 +39,62 @@
#include <memory>

#include "base/logging.hh"
#include "sim/clocked_object.hh"

class GPUDynInst;

typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
typedef std::bitset<std::numeric_limits<unsigned long long>::digits>
    VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;

enum InstMemoryHop : int {
    Initiate = 0,
    CoalsrSend = 1,
    CoalsrRecv = 2,
    GMEnqueue = 3,
    Complete = 4,
    InstMemoryHopMax = 5
};

enum BlockMemoryHop : int {
    BlockSend = 0,
    BlockRecv = 1
};

class WaitClass
{
  public:
    WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
    void init(uint64_t *_tcnt, uint32_t _numStages=0)
    WaitClass() : nxtAvail(0), lookAheadAvail(0), clockedObject(nullptr) { }

    WaitClass(ClockedObject *_clockedObject, uint64_t _numStages=0)
        : nxtAvail(0), lookAheadAvail(0), clockedObject(_clockedObject),
          numStages(_numStages) { }

    void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
    {
        tcnt = _tcnt;
        clockedObject = _clockedObject;
        numStages = _numStages;
    }

    void set(uint32_t i)
    void set(uint64_t i)
    {
        fatal_if(nxtAvail > *tcnt,
        fatal_if(nxtAvail > clockedObject->clockEdge(),
                 "Can't allocate resource because it is busy!!!");
        nxtAvail = *tcnt + i;
        nxtAvail = clockedObject->clockEdge() + i;
    }
    void preset(uint32_t delay)
    void preset(uint64_t delay)
    {
        lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
        lookAheadAvail = std::max(lookAheadAvail, delay +
                                  (clockedObject->clockEdge()) - numStages);
    }
    bool rdy(Cycles cycles = Cycles(0)) const
    {
        return clockedObject->clockEdge(cycles) >= nxtAvail;
    }
    bool prerdy() const
    {
        return clockedObject->clockEdge() >= lookAheadAvail;
    }
    bool rdy() const { return *tcnt >= nxtAvail; }
    bool prerdy() const { return *tcnt >= lookAheadAvail; }

  private:
    // timestamp indicating when resource will be available
@@ -75,11 +103,11 @@ class WaitClass
    // pending uses of the resource (when there is a cycle gap between
    // rdy() and set())
    uint64_t lookAheadAvail;
    // current timestamp
    uint64_t *tcnt;
    // clockedObject for current timestamp
    ClockedObject *clockedObject;
    // number of stages between checking if a resource is ready and
    // setting the resource's utilization
    uint32_t numStages;
    uint64_t numStages;
};

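The WaitClass rework above swaps the raw tick-counter pointer for a ClockedObject, but the protocol itself is unchanged: check rdy() before occupying the resource with set(). A stand-alone sketch of that protocol, with a plain counter standing in for clockEdge():

#include <cassert>
#include <cstdint>

struct WaitSketch
{
    std::uint64_t now = 0;       // stand-in for clockedObject->clockEdge()
    std::uint64_t nxtAvail = 0;  // when the resource frees up

    bool rdy() const { return now >= nxtAvail; }

    void set(std::uint64_t busyTicks)
    {
        assert(rdy());           // callers must check rdy() first
        nxtAvail = now + busyTicks;
    }
};

int main()
{
    WaitSketch bus;
    assert(bus.rdy());
    bus.set(4);            // occupy the bus for 4 ticks
    bus.now += 2;
    assert(!bus.rdy());    // still busy
    bus.now += 2;
    assert(bus.rdy());     // free again at the scheduled edge
}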
class Float16
@@ -93,7 +121,7 @@ class Float16

    Float16(float x)
    {
        uint32_t ai = *(uint32_t *)&x;
        uint32_t ai = *(reinterpret_cast<uint32_t *>(&x));

        uint32_t s = (ai >> 31) & 0x1;
        uint32_t exp = (ai >> 23) & 0xff;
@@ -139,7 +167,7 @@ class Float16
        val1 |= (exp << 23);
        val1 |= (mant << 13);

        return *(float*)&val1;
        return *(reinterpret_cast<float *>(&val1));
    }
};

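One caveat on the Float16 change above: replacing the C cast with reinterpret_cast keeps the same type-punning dereference, which strict-aliasing rules still leave undefined; std::memcpy expresses the identical bit copy without that issue. A minimal alternative sketch (editor's suggestion, not part of the patch):

#include <cstdint>
#include <cstring>

inline std::uint32_t
bitsOfFloat(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));  // well-defined bit-for-bit copy
    return bits;
}

inline float
floatOfBits(std::uint32_t bits)
{
    float x;
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}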
@@ -33,8 +33,8 @@

#include "gpu-compute/pool_manager.hh"

PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
    : _minAllocation(minAlloc), _poolSize(poolSize)
PoolManager::PoolManager(const PoolManagerParams *p)
    : SimObject(p), _minAllocation(p->min_alloc), _poolSize(p->pool_size)
{
    assert(poolSize > 0);
    assert(_poolSize > 0);
}

@@ -38,11 +38,15 @@
#include <cstdint>
#include <string>

#include "params/PoolManager.hh"
#include "sim/sim_object.hh"

// Pool Manager Logic
class PoolManager
class PoolManager : public SimObject
{
  public:
    PoolManager(uint32_t minAlloc, uint32_t poolSize);
    PoolManager(const PoolManagerParams *p);
    virtual ~PoolManager() { _poolSize = 0; }
    uint32_t minAllocation() { return _minAllocation; }
    virtual std::string printRegion() = 0;
    virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;

223
src/gpu-compute/register_file.cc
Normal file
@@ -0,0 +1,223 @@
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos,
 *          Mark Wyse
 */

#include "gpu-compute/register_file.hh"

#include <sstream>
#include <string>

#include "base/intmath.hh"
#include "base/logging.hh"
#include "debug/GPURF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterFile.hh"

RegisterFile::RegisterFile(const RegisterFileParams *p)
    : SimObject(p), simdId(p->simd_id), _numRegs(p->num_regs)
{
    fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
    fatal_if(simdId < 0, "Illegal SIMD id for VRF");

    busy.clear();
    busy.resize(_numRegs, 0);
}

RegisterFile::~RegisterFile()
{
}

void
RegisterFile::setParent(ComputeUnit *_computeUnit)
{
    computeUnit = _computeUnit;
}

std::string
RegisterFile::dump() const
{
    std::stringstream ss;
    ss << "Busy: ";
    for (int i = 0; i < busy.size(); i++) {
        ss << (int)busy[i];
    }
    ss << "\n";
    return ss.str();
}

// Scoreboard functions

bool
RegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
    return true;
}

bool
RegisterFile::regBusy(int idx) const
{
    return busy.at(idx);
}

void
RegisterFile::markReg(int regIdx, bool value)
{
    DPRINTF(GPURF, "SIMD[%d] markReg(): physReg[%d] = %d\n",
            simdId, regIdx, (int)value);
    busy.at(regIdx) = value;
}

void
RegisterFile::enqRegFreeEvent(uint32_t regIdx, uint64_t delay)
{
    DPRINTF(GPURF, "SIMD[%d] enqRegFreeEvent physReg[%d] at %llu\n",
            simdId, regIdx, curTick() + delay);
    schedule(new MarkRegFreeScbEvent(this, regIdx),
             curTick() + delay);
}

void
RegisterFile::enqRegBusyEvent(uint32_t regIdx, uint64_t delay)
{
    DPRINTF(GPURF, "SIMD[%d] enqRegBusyEvent physReg[%d] at %llu\n",
            simdId, regIdx, curTick() + delay);
    schedule(new MarkRegBusyScbEvent(this, regIdx),
             curTick() + delay);
}

// Schedule functions
bool
RegisterFile::canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}

void
RegisterFile::scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
{
}

bool
RegisterFile::canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}

void
RegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
}

bool
RegisterFile::canScheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}

void
RegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
{
}

bool
RegisterFile::operandReadComplete(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}

// Exec functions
void
RegisterFile::exec()
{
}

void
RegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
}

RegisterFile*
RegisterFileParams::create()
{
    return new RegisterFile(this);
}

// Events

// Mark a register as free in the scoreboard/busy vector
void
RegisterFile::MarkRegFreeScbEvent::process()
{
    rf->markReg(regIdx, false);
}

// Mark a register as busy in the scoreboard/busy vector
void
RegisterFile::MarkRegBusyScbEvent::process()
{
    rf->markReg(regIdx, true);
}

void
RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
{
}

void
RegisterFile::regStats()
{
    registerReads
        .name(name() + ".register_reads")
        .desc("Total number of DWORDs read from register file")
        ;

    registerWrites
        .name(name() + ".register_writes")
        .desc("Total number of DWORDS written to register file")
        ;

    sramReads
        .name(name() + ".sram_reads")
        .desc("Total number of register file bank SRAM activations for reads")
        ;

    sramWrites
        .name(name() + ".sram_writes")
        .desc("Total number of register file bank SRAM activations for writes")
        ;
}
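The free/busy events in register_file.cc implement a simple delayed scoreboard update: a scheduled event flips one register's busy bit after the modeled latency. A stand-alone sketch of that pattern (an ordered event map stands in for gem5's event queue; names here are illustrative):

#include <cassert>
#include <functional>
#include <map>
#include <vector>

struct ScoreboardSketch
{
    std::vector<bool> busy;
    std::multimap<unsigned long long, std::function<void()>> events;
    unsigned long long now = 0;

    explicit ScoreboardSketch(int nRegs) : busy(nRegs, false) { }

    // analogous to enqRegFreeEvent(): free the register after `delay` ticks
    void enqRegFreeEvent(int reg, unsigned long long delay)
    {
        events.emplace(now + delay, [this, reg] { busy.at(reg) = false; });
    }

    void tick()  // advance one tick, firing any events due now
    {
        ++now;
        auto range = events.equal_range(now);
        for (auto it = range.first; it != range.second; ++it)
            it->second();
        events.erase(range.first, range.second);
    }
};

int main()
{
    ScoreboardSketch sb(8);
    sb.busy[3] = true;
    sb.enqRegFreeEvent(3, 2);
    sb.tick();
    assert(sb.busy[3]);   // still busy one tick later
    sb.tick();
    assert(!sb.busy[3]);  // freed at the scheduled time
}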
171
src/gpu-compute/register_file.hh
Normal file
@@ -0,0 +1,171 @@
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos,
 *          Mark Wyse
 */

#ifndef __REGISTER_FILE_HH__
#define __REGISTER_FILE_HH__

#include <limits>
#include <vector>

#include "base/statistics.hh"
#include "base/types.hh"
#include "gpu-compute/misc.hh"
#include "sim/sim_object.hh"

class ComputeUnit;
class Shader;
class PoolManager;
class Wavefront;

struct RegisterFileParams;

// Abstract Register File
// This register file class can be inherited from to create both
// scalar and vector register files.
class RegisterFile : public SimObject
{
  public:
    RegisterFile(const RegisterFileParams *p);
    virtual ~RegisterFile();
    virtual void setParent(ComputeUnit *_computeUnit);
    int numRegs() const { return _numRegs; }
    virtual void regStats() override;

    // State functions

    // Scoreboard functions
    virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
    virtual bool regBusy(int idx) const;
    virtual void markReg(int regIdx, bool value);

    // Abstract Register Event
    class RegisterEvent : public Event
    {
      protected:
        RegisterFile *rf;
        int regIdx;

      public:
        RegisterEvent(RegisterFile *_rf, int _regIdx)
            : rf(_rf), regIdx(_regIdx) { setFlags(AutoDelete); }
    };

    // Register Event to mark a register as free in the scoreboard/busy vector
    class MarkRegFreeScbEvent : public RegisterEvent
    {
      public:
        MarkRegFreeScbEvent(RegisterFile *_rf, int _regIdx)
            : RegisterEvent(_rf, _regIdx) { }
        void process();
    };

    // Register Event to mark a register as busy in the scoreboard/busy vector
    class MarkRegBusyScbEvent : public RegisterEvent
    {
      public:
        MarkRegBusyScbEvent(RegisterFile *_rf, int _regIdx)
            : RegisterEvent(_rf, _regIdx) { }
        void process();
    };

    // Schedule an event to mark a register as free/busy in
    // the scoreboard/busy vector. Delay is already in Ticks
    virtual void enqRegFreeEvent(uint32_t regIdx, uint64_t delay);
    virtual void enqRegBusyEvent(uint32_t regIdx, uint64_t delay);

    // Schedule functions

    // The following functions are called by the SCH stage when attempting
    // to move a wave from the readyList to the schList.
    // canSchedule* checks if the RF is ready to provide operands for
    // the instruction, while schedule* requests the RF to begin reading
    // and writing of operands. Calling schedule* may only occur
    // immediately after canSchedule* was called and returned True
    virtual bool canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual bool canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual void scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual void scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);

    // The following function is called to check if all operands
    // have been read for the given instruction
    virtual bool operandReadComplete(Wavefront *w, GPUDynInstPtr ii);

    // The following two functions are only called by returning loads to
    // check if the register file can support the incoming writes
    virtual bool canScheduleWriteOperandsFromLoad(Wavefront *w,
                                                  GPUDynInstPtr ii);
    // Queue the register writes. Assumes canScheduleWriteOperandsFromLoad
    // was called immediately prior and returned True
    virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
                                               GPUDynInstPtr ii);

    // exec() is invoked every cycle by the compute unit and may be
    // used to model detailed timing of the register file.
    virtual void exec();

    // Called to inform RF that an instruction is executing
    // to schedule events for writeback, etc., as needed
    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);

    // Debug functions
    virtual std::string dump() const;

    virtual void dispatchInstruction(GPUDynInstPtr ii);

  protected:
    ComputeUnit* computeUnit;
    int simdId;

    // flag indicating if a register is busy
    std::vector<bool> busy;

    // number of registers in this register file
    int _numRegs;

    // Stats
    // Total number of register reads, incremented once per DWORD per thread
    Stats::Scalar registerReads;
    // Total number of register writes, incremented once per DWORD per thread
    Stats::Scalar registerWrites;

    // Number of register file SRAM activations for reads.
    // The register file may be implemented with multiple SRAMs. This stat
    // tracks how many times the SRAMs are accessed for reads.
    Stats::Scalar sramReads;
    // Number of register file SRAM activations for writes
    Stats::Scalar sramWrites;
};

#endif // __REGISTER_FILE_HH__
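The canSchedule*/schedule* pairing above implies a strict call protocol. The following is a minimal hedged sketch of how a scheduling stage might drive it; trySchedule is a hypothetical helper, not an actual gem5 call site:

// Hedged sketch of the RF scheduling protocol declared above.
// schedule* may only be called immediately after its canSchedule*
// counterpart returned true.
void
trySchedule(RegisterFile *rf, Wavefront *w, GPUDynInstPtr ii)
{
    if (rf->canScheduleReadOperands(w, ii)) {
        rf->scheduleReadOperands(w, ii);   // begin reading src operands
    }
    // Once operandReadComplete(w, ii) reports true, the wave becomes a
    // dispatch candidate; destination writes follow the same pattern:
    if (rf->canScheduleWriteOperands(w, ii)) {
        rf->scheduleWriteOperands(w, ii);  // reserve write resources
    }
}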
143
src/gpu-compute/register_manager.cc
Normal file
@@ -0,0 +1,143 @@
/*
 * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Mark Wyse
 */

#include "gpu-compute/register_manager.hh"

#include "config/the_gpu_isa.hh"
#include "debug/GPURename.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/static_register_manager_policy.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterManager.hh"

RegisterManager::RegisterManager(const RegisterManagerParams *p)
    : SimObject(p), srfPoolMgrs(p->srf_pool_managers),
      vrfPoolMgrs(p->vrf_pool_managers)
{
    if (p->policy == "static") {
        policy = new StaticRegisterManagerPolicy();
    } else {
        fatal("Unimplemented Register Manager Policy");
    }
}

RegisterManager::~RegisterManager()
{
    for (auto mgr : srfPoolMgrs) {
        delete mgr;
    }
    for (auto mgr : vrfPoolMgrs) {
        delete mgr;
    }
}

void
RegisterManager::exec()
{
    policy->exec();
}

void
RegisterManager::setParent(ComputeUnit *cu)
{
    computeUnit = cu;
    policy->setParent(computeUnit);
    for (int i = 0; i < srfPoolMgrs.size(); i++) {
        fatal_if(computeUnit->srf[i]->numRegs() %
                 srfPoolMgrs[i]->minAllocation(),
                 "Min SGPR allocation is not a multiple of SRF size\n");
    }
    for (int i = 0; i < vrfPoolMgrs.size(); i++) {
        fatal_if(computeUnit->vrf[i]->numRegs() %
                 vrfPoolMgrs[i]->minAllocation(),
                 "Min VGPR allocation is not a multiple of VRF size\n");
    }
}

// compute mapping for vector register
int
RegisterManager::mapVgpr(Wavefront* w, int vgprIndex)
{
    return policy->mapVgpr(w, vgprIndex);
}

// compute mapping for scalar register
int
RegisterManager::mapSgpr(Wavefront* w, int sgprIndex)
{
    return policy->mapSgpr(w, sgprIndex);
}

// check if we can allocate registers
bool
RegisterManager::canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
{
    return policy->canAllocateVgprs(simdId, nWfs, demandPerWf);
}

bool
RegisterManager::canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
{
    return policy->canAllocateSgprs(simdId, nWfs, demandPerWf);
}

// allocate registers
void
RegisterManager::allocateRegisters(Wavefront *w, int vectorDemand,
                                   int scalarDemand)
{
    policy->allocateRegisters(w, vectorDemand, scalarDemand);
}

void
RegisterManager::freeRegisters(Wavefront* w)
{
    policy->freeRegisters(w);
}

void
RegisterManager::regStats()
{
    policy->regStats();
}

RegisterManager*
RegisterManagerParams::create()
{
    return new RegisterManager(this);
}
94
src/gpu-compute/register_manager.hh
Normal file
@@ -0,0 +1,94 @@
/*
 * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Mark Wyse
 */

#ifndef __REGISTER_MANAGER_HH__
#define __REGISTER_MANAGER_HH__

#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "gpu-compute/pool_manager.hh"
#include "gpu-compute/register_manager_policy.hh"
#include "sim/sim_object.hh"
#include "sim/stats.hh"

class ComputeUnit;
class Wavefront;

struct RegisterManagerParams;

/*
 * Rename stage.
 */
class RegisterManager : public SimObject
{
  public:
    RegisterManager(const RegisterManagerParams* params);
    ~RegisterManager();
    void setParent(ComputeUnit *cu);
    void exec();

    // Stats related variables and methods
    void regStats();

    // lookup virtual to physical register translation
    int mapVgpr(Wavefront* w, int vgprIndex);
    int mapSgpr(Wavefront* w, int sgprIndex);

    // check if we can allocate registers
    bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf);
    bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf);

    // allocate registers
    void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand);

    // free all registers used by the WF
    void freeRegisters(Wavefront *w);

    std::vector<PoolManager*> srfPoolMgrs;
    std::vector<PoolManager*> vrfPoolMgrs;

  private:
    RegisterManagerPolicy *policy;

    ComputeUnit *computeUnit;

    std::string _name;
};

#endif // __REGISTER_MANAGER_HH__
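Taken together with the .cc file above, the expected dispatch-time flow is: check capacity, allocate, then translate virtual indices on each access, and free on completion. A rough sketch of a hypothetical caller (tryDispatch is illustrative only; real dispatch logic lives in the compute unit):

// Hedged sketch of the dispatch-time allocation flow.
bool
tryDispatch(RegisterManager *rm, Wavefront *w,
            int vregDemand, int sregDemand)
{
    // reject the wave if the CU lacks register space for it
    if (!rm->canAllocateVgprs(w->simdId, 1, vregDemand) ||
        !rm->canAllocateSgprs(w->simdId, 1, sregDemand)) {
        return false;
    }
    rm->allocateRegisters(w, vregDemand, sregDemand);

    // at execute time, virtual indices are translated per access
    int physV0 = rm->mapVgpr(w, 0);
    int physS0 = rm->mapSgpr(w, 0);
    (void)physV0; (void)physS0;

    // on wavefront completion: rm->freeRegisters(w);
    return true;
}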
86
src/gpu-compute/register_manager_policy.hh
Normal file
@@ -0,0 +1,86 @@
/*
 * Copyright (c) 2016 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Mark Wyse
 */

#ifndef __REGISTER_MANAGER_POLICY_HH__
#define __REGISTER_MANAGER_POLICY_HH__

#include <cstdint>

class ComputeUnit;
class HSAQueueEntry;
class Wavefront;

/**
 * Register Manager Policy abstract class
 *
 * A Register Manager Policy implements all of the functionality
 * of the Register Manager, including register mapping, allocation,
 * and freeing. Different policies may be implemented that support
 * different architectures or different methods of mapping and
 * allocation.
 */
class RegisterManagerPolicy
{
  public:
    virtual void setParent(ComputeUnit *_cu) { cu = _cu; }

    // Execute: called by RenameStage::execute()
    virtual void exec() = 0;

    // provide virtual to physical register mapping
    virtual int mapVgpr(Wavefront* w, int vgprIndex) = 0;
    virtual int mapSgpr(Wavefront* w, int sgprIndex) = 0;

    // check if requested number of vector registers can be allocated
    virtual bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) = 0;
    // check if requested number of scalar registers can be allocated
    // machine ISA only
    virtual bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) = 0;

    // allocate vector registers and reserve from register pool
    virtual void allocateRegisters(Wavefront *w, int vectorDemand,
                                   int scalarDemand) = 0;

    // free all remaining registers held by specified WF
    virtual void freeRegisters(Wavefront *w) = 0;

    // stats
    virtual void regStats() = 0;

  protected:
    ComputeUnit *cu;
};

#endif // __REGISTER_MANAGER_POLICY_HH__
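Since only the "static" policy is wired up in register_manager.cc, a new mapping scheme would subclass this interface. A bare-bones skeleton under that assumption (IdentityRegisterManagerPolicy is hypothetical, a trivial identity mapping, not an implemented policy):

// Hedged sketch: the minimum a new policy must override.
class IdentityRegisterManagerPolicy : public RegisterManagerPolicy
{
  public:
    void exec() override { }
    // identity mapping: virtual index == physical index
    int mapVgpr(Wavefront*, int vgprIndex) override { return vgprIndex; }
    int mapSgpr(Wavefront*, int sgprIndex) override { return sgprIndex; }
    // no capacity tracking in this toy example
    bool canAllocateVgprs(int, int, int) override { return true; }
    bool canAllocateSgprs(int, int, int) override { return true; }
    void allocateRegisters(Wavefront*, int, int) override { }
    void freeRegisters(Wavefront*) override { }
    void regStats() override { }
};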
@@ -36,6 +36,7 @@

#include <vector>

#include "base/logging.hh"
#include "gpu-compute/scheduling_policy.hh"
#include "gpu-compute/wavefront.hh"

153
src/gpu-compute/scalar_memory_pipeline.cc
Normal file
@@ -0,0 +1,153 @@
/*
 * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos
 */

#include "gpu-compute/scalar_memory_pipeline.hh"

#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), queueSize(p->scalar_mem_queue_size),
    inflightStores(0), inflightLoads(0)
{
}

void
ScalarMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".ScalarMemPipeline";
}

void
ScalarMemPipeline::exec()
{
    // find the oldest scalar request whose data has arrived
    GPUDynInstPtr m = !returnedLoads.empty() ? returnedLoads.front() :
        !returnedStores.empty() ? returnedStores.front() : nullptr;

    Wavefront *w = nullptr;

    bool accessSrf = true;
    // check the SRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessSrf =
            w->computeUnit->srf[w->simdId]->
                canScheduleWriteOperandsFromLoad(w, m);
    }

    if ((!returnedStores.empty() || !returnedLoads.empty()) &&
        m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
        accessSrf &&
        (computeUnit->shader->coissue_return ||
         computeUnit->scalarMemUnit.rdy())) {

        w = m->wavefront();

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->srf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        m->completeAcc(m);

        if (m->isLoad() || m->isAtomic()) {
            returnedLoads.pop();
            assert(inflightLoads > 0);
            --inflightLoads;
        } else {
            returnedStores.pop();
            assert(inflightStores > 0);
            --inflightStores;
        }

        // Decrement outstanding register count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);

        if (m->isStore() || m->isAtomic()) {
            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
                                             m->time, -1);
        }

        if (m->isLoad() || m->isAtomic()) {
            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
                                             m->time, -1);
        }

        // Mark write bus busy for appropriate amount of time
        computeUnit->scalarMemToSrfBus.set(m->time);
        if (!computeUnit->shader->coissue_return)
            w->computeUnit->scalarMemUnit.set(m->time);
    }

    // If pipeline has executed a global memory instruction
    // execute global memory packets and issue global
    // memory packets to DTLB
    if (!issuedRequests.empty()) {
        GPUDynInstPtr mp = issuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {

            if (inflightLoads >= queueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else {
            if (inflightStores >= queueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }
        mp->initiateAcc(mp);
        issuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
    }
}

void
ScalarMemPipeline::regStats()
{
}
114
src/gpu-compute/scalar_memory_pipeline.hh
Normal file
@@ -0,0 +1,114 @@
/*
 * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos
 */

#ifndef __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
#define __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__

#include <queue>
#include <string>

#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"

/*
 * @file scalar_memory_pipeline.hh
 *
 * The scalar memory pipeline issues global memory packets
 * from the scalar ALU to the DTLB and L1 Scalar Data Cache.
 * The exec() method issues a memory packet to the DTLB if there is
 * space available in the return fifo, and also retires previously
 * issued loads and stores that have returned from the memory
 * sub-system.
 */

class ComputeUnit;

class ScalarMemPipeline
{
  public:
    ScalarMemPipeline(const ComputeUnitParams *params);
    void init(ComputeUnit *cu);
    void exec();

    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
    std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return returnedStores; }
    std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return returnedLoads; }

    bool
    isGMLdRespFIFOWrRdy() const
    {
        return returnedLoads.size() < queueSize;
    }

    bool
    isGMStRespFIFOWrRdy() const
    {
        return returnedStores.size() < queueSize;
    }

    bool
    isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
    {
        return (issuedRequests.size() + pendReqs) < queueSize;
    }

    const std::string &name() const { return _name; }
    void regStats();

  private:
    ComputeUnit *computeUnit;
    std::string _name;
    int queueSize;

    // Counters to track and limit the inflight scalar loads and stores
    // generated by this memory pipeline.
    int inflightStores;
    int inflightLoads;

    // Scalar Memory Request FIFO: all global memory scalar requests
    // are issued to this FIFO from the scalar memory pipelines
    std::queue<GPUDynInstPtr> issuedRequests;

    // Scalar Store Response FIFO: all responses of global memory
    // scalar stores are sent to this FIFO from L1 Scalar Data Cache
    std::queue<GPUDynInstPtr> returnedStores;

    // Scalar Load Response FIFO: all responses of global memory
    // scalar loads are sent to this FIFO from L1 Scalar Data Cache
    std::queue<GPUDynInstPtr> returnedLoads;
};

#endif // __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
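The isGM*FIFOWrRdy() predicates are the back-pressure points: a producer must check capacity, including requests it already has in flight, before pushing. Roughly, under the assumption of a hypothetical caller (issueScalarMemOp is illustrative, not a gem5 function):

// Hedged sketch of the back-pressure check a producer would make.
void
issueScalarMemOp(ScalarMemPipeline &pipe, GPUDynInstPtr mp, int myPendReqs)
{
    // account for requests this wave already has in the pipe
    if (pipe.isGMReqFIFOWrRdy(myPendReqs)) {
        pipe.getGMReqFIFO().push(mp);   // safe: stays within queueSize
    }
    // otherwise stall this cycle and retry later
}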
164
src/gpu-compute/scalar_register_file.cc
Normal file
@@ -0,0 +1,164 @@
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos,
 *          Mark Wyse
 */

#include "gpu-compute/scalar_register_file.hh"

#include "base/logging.hh"
#include "debug/GPUSRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ScalarRegisterFile.hh"

ScalarRegisterFile::ScalarRegisterFile(const ScalarRegisterFileParams *p)
    : RegisterFile(p)
{
    regFile.resize(numRegs(), 0);
}

bool
ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {

            int sgprIdx = ii->getRegisterIndex(i, ii);
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;

            for (int j = 0; j < nRegs; ++j) {
                int pSgpr =
                    computeUnit->registerManager->mapSgpr(w, sgprIdx + j);

                if (regBusy(pSgpr)) {
                    if (ii->isDstOperand(i)) {
                        w->numTimesBlockedDueWAXDependencies++;
                    } else if (ii->isSrcOperand(i)) {
                        DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                                w->wfDynId, ii->disassemble(), pSgpr);
                        w->numTimesBlockedDueRAWDependencies++;
                    }
                    return false;
                }
            } // nRegs
        } // isScalar
    } // operand
    return true;
}

void
ScalarRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
    // iterate over all register destination operands
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {

            int sgprIdx = ii->getRegisterIndex(i, ii);
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;

            for (int j = 0; j < nRegs; ++j) {
                int physReg =
                    computeUnit->registerManager->mapSgpr(w, sgprIdx + j);

                // mark the destination scalar register as busy
                markReg(physReg, true);
            }
        }
    }
}

void
ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
    for (int i = 0; i < ii->getNumOperands(); i++) {
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
            int DWORDs = ii->getOperandSize(i) <= 4 ? 1
                : ii->getOperandSize(i) / 4;
            registerReads += DWORDs;
        }
    }

    if (!ii->isLoad() && !(ii->isAtomic() || ii->isMemSync())) {
        Cycles delay(computeUnit->scalarPipeLength());
        Tick tickDelay = computeUnit->cyclesToTicks(delay);

        for (int i = 0; i < ii->getNumOperands(); i++) {
            if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
                int sgprIdx = ii->getRegisterIndex(i, ii);
                int nRegs = ii->getOperandSize(i) <= 4 ? 1
                    : ii->getOperandSize(i) / 4;
                for (int j = 0; j < nRegs; j++) {
                    int physReg = computeUnit->registerManager->
                        mapSgpr(w, sgprIdx + j);
                    enqRegFreeEvent(physReg, tickDelay);
                }

                registerWrites += nRegs;
            }
        }
    }
}

void
ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
                                                  GPUDynInstPtr ii)
{
    assert(ii->isLoad() || ii->isAtomicRet());
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {

            int sgprIdx = ii->getRegisterIndex(i, ii);
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;

            for (int j = 0; j < nRegs; ++j) {
                int physReg = computeUnit->registerManager->
                    mapSgpr(w, sgprIdx + j);
                enqRegFreeEvent(physReg, computeUnit->clockPeriod());
            }

            registerWrites += nRegs;
        }
    }
}

ScalarRegisterFile*
ScalarRegisterFileParams::create()
{
    return new ScalarRegisterFile(this);
}
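The recurring `getOperandSize(i) <= 4 ? 1 : size / 4` idiom above counts 32-bit DWORDs: a 64-bit scalar operand (e.g., an address pair) spans two consecutive SGPRs. A self-contained restatement of just that arithmetic (illustrative only):

// Worked example of the DWORD-count idiom used above:
//   operand size 4 bytes  -> nRegs = 1 (one SGPR)
//   operand size 8 bytes  -> nRegs = 8 / 4 = 2 (an SGPR pair)
//   operand size 16 bytes -> nRegs = 4 (e.g., a 4-DWORD descriptor)
int
nRegsFor(int operandSizeBytes)
{
    return operandSizeBytes <= 4 ? 1 : operandSizeBytes / 4;
}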
104
src/gpu-compute/scalar_register_file.hh
Normal file
@@ -0,0 +1,104 @@
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos,
 *          Mark Wyse
 */

#ifndef __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
#define __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__

#include "arch/gpu_isa.hh"
#include "base/statistics.hh"
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/GPUSRF.hh"
#include "gpu-compute/register_file.hh"
#include "gpu-compute/wavefront.hh"

struct ScalarRegisterFileParams;

// Scalar Register File
class ScalarRegisterFile : public RegisterFile
{
  public:
    using ScalarRegU32 = TheGpuISA::ScalarRegU32;

    ScalarRegisterFile(const ScalarRegisterFileParams *p);
    ~ScalarRegisterFile() { }

    virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
    virtual void scheduleWriteOperands(Wavefront *w,
                                       GPUDynInstPtr ii) override;
    virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
                                               GPUDynInstPtr ii) override;
    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;

    void
    setParent(ComputeUnit *_computeUnit) override
    {
        RegisterFile::setParent(_computeUnit);
    }

    // Read a register that is writeable (e.g., a DST operand)
    ScalarRegU32&
    readWriteable(int regIdx)
    {
        return regFile[regIdx];
    }

    // Read a register that is not writeable (e.g., src operand)
    ScalarRegU32
    read(int regIdx) const
    {
        return regFile[regIdx];
    }

    // Write a register
    void
    write(int regIdx, ScalarRegU32 value)
    {
        regFile[regIdx] = value;
    }

    void
    printReg(Wavefront *wf, int regIdx) const
    {
        DPRINTF(GPUSRF, "WF[%d][%d]: Id%d s[%d] = %#x\n", wf->simdId,
                wf->wfSlotId, wf->wfDynId, regIdx, regFile[regIdx]);
    }

  private:
    std::vector<ScalarRegU32> regFile;
};

#endif // __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
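As a usage note, instruction implementations reach the SRF through these accessors after translating virtual indices via the register manager. A hedged sketch under that assumption (copySgpr is hypothetical; real GCN3 instruction code goes through operand abstractions rather than calling these directly):

// Hedged sketch: move a value between two scalar registers.
void
copySgpr(ScalarRegisterFile &srf, ComputeUnit *cu, Wavefront *w,
         int srcIdx, int dstIdx)
{
    // map virtual SGPR indices to physical register file slots
    int src = cu->registerManager->mapSgpr(w, srcIdx);
    int dst = cu->registerManager->mapSgpr(w, dstIdx);
    srf.write(dst, srf.read(src));   // read() for src, write() for dst
}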
@@ -33,24 +33,36 @@

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
    : numSIMDs(p->num_SIMDs),
      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
    : vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false)
{
    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
    for (int j = 0; j < cu->numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu->numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    waveStatusList.clear();
    wavesInSch.clear();
    schList.clear();
}

void
@@ -59,56 +71,597 @@ ScheduleStage::init(ComputeUnit *cu)
    computeUnit = cu;
    _name = computeUnit->name() + ".ScheduleStage";

    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
    fatal_if(scheduler.size() != computeUnit->readyList.size(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
        scheduler[j].bindList(&computeUnit->readyList[j]);
    }

    for (int j = 0; j < numSIMDs; ++j) {
        waveStatusList.push_back(&computeUnit->waveStatusList[j]);
    }

    dispatchList = &computeUnit->dispatchList;

    assert(computeUnit->numVectorGlobalMemUnits == 1);
    assert(computeUnit->numVectorSharedMemUnits == 1);
}

void
ScheduleStage::arbitrate()
ScheduleStage::exec()
{
    // iterate over all Memory pipelines
    for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
        if (dispatchList->at(j).first) {
            Wavefront *waveToMemPipe = dispatchList->at(j).first;
            // iterate over all execution pipelines
            for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
                if ((i != j) && (dispatchList->at(i).first)) {
                    Wavefront *waveToExePipe = dispatchList->at(i).first;
                    // if the two selected wavefronts are mapped to the same
                    // SIMD unit then they share the VRF
                    if (waveToMemPipe->simdId == waveToExePipe->simdId) {
                        int simdId = waveToMemPipe->simdId;
                        // Read VRF port arbitration:
                        // If there are read VRF port conflicts between
                        // a memory and another instruction we drop the other
                        // instruction. We don't need to check for write VRF
                        // port conflicts because the memory instruction either
                        // does not need to write to the VRF (store) or will
                        // write to the VRF when the data comes back (load) in
                        // which case the arbiter of the memory pipes will
                        // resolve any conflicts
                        if (computeUnit->vrf[simdId]->
                            isReadConflict(waveToMemPipe->wfSlotId,
                                           waveToExePipe->wfSlotId)) {
                            // FIXME: The "second" member variable is never
                            // used in the model. I am setting it to READY
                            // simply to follow the protocol of setting it
                            // when the WF has an instruction ready to issue
                            waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
                                .second = READY;
    // Update readyList
    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
        // delete all ready wavefronts whose instruction buffers are now
        // empty because the last instruction was executed
        computeUnit->updateReadyList(j);
        /**
         * Remove any wave that already has an instruction present in SCH
         * waiting for RF reads to complete. This prevents out of order
         * execution within a wave.
         */
        for (auto wIt = computeUnit->readyList.at(j).begin();
             wIt != computeUnit->readyList.at(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = computeUnit->readyList.at(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

                            dispatchList->at(i).first = nullptr;
                            dispatchList->at(i).second = EMPTY;
                            break;
                        }
    // Attempt to add another wave for each EXE type to schList queues
    // VMEM resources are iterated first, effectively giving priority
    // to VMEM over VALU for scheduling read of operands to the RFs.
    // Scalar Memory are iterated after VMEM

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit->firstMemUnit();
    int lastMemUnit = computeUnit->lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = computeUnit->readyList[j].size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *w = scheduler[j].chooseWave();
        if (!addToSchList(j, w)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            w->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // Iterate everything else
    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
        // skip the VMEM resources
        if (j >= firstMemUnit && j <= lastMemUnit) {
            continue;
        }
        int readyListSize = computeUnit->readyList[j].size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *w = scheduler[j].chooseWave();
        if (!addToSchList(j, w)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            w->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // At this point, the schList queue per EXE type may contain
    // multiple waves, in order of age (oldest to youngest).
    // Waves may be in RFBUSY, indicating they are waiting for registers
    // to be read, or in RFREADY, indicating they are candidates for
    // the dispatchList and execution

    // Iterate schList queues and check if any of the waves have finished
    // reading their operands, moving those waves to RFREADY status
    checkRfOperandReadComplete();

    // Fill the dispatch list with the oldest wave of each EXE type that
    // is ready to execute
    // Wave is picked if status in schList is RFREADY and it passes resource
    // ready checks similar to those currently in SCB
    fillDispatchList();

    // Resource arbitration on waves in dispatchList
    // Losing waves are re-inserted to the schList at a location determined
    // by wave age

    // Arbitrate access to the VRF->LDS bus
    arbitrateVrfToLdsBus();

    // Schedule write operations to the register files
    scheduleRfDestOperands();

    // Lastly, reserve resources for waves that are ready to execute.
    reserveResources();
}
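Condensed, the per-wave flow that the new exec() implements is a small state machine (informal summary, not code from the patch):

// Informal summary of the SCH-stage flow above:
//   readyList --chooseWave + addToSchList--> schList[RFBUSY]
//   schList[RFBUSY] --checkRfOperandReadComplete--> schList[RFREADY]
//   schList[RFREADY] --fillDispatchList, if dispatchReady--> dispatchList[EXREADY]
//   dispatchList --arbitration loss or RF write denial--> reinsertToSchList
//   (re-insertion is age-ordered, so a losing wave keeps its priority)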
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        Wavefront *w)
{
    dispatchList->at(unitId).first = w;
    dispatchList->at(unitId).second = s;
}

bool
ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
{
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrfWr = true;
    if (!ii->isScalar()) {
        accessVrfWr =
            computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
    }
    bool accessSrfWr =
        computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!ii->isScalar()) {
            computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
        }
        computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
        return true;
    } else {
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
    }
    return false;
}

void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
        if (!dispatchList->at(j).first) {
            continue;
        }
        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        Wavefront *w = dispatchList->at(j).first;
        if (!schedRfWrites(j, w)) {
            reinsertToSchList(j, w);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (w->instructionBuffer.front()->isFlat()) {
                assert(dispatchList->at(w->localMem).second == SKIP);
                doDispatchListTransition(w->localMem, EMPTY);
            }
        }
    }
}

bool
ScheduleStage::addToSchList(int exeType, Wavefront *w)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrf = true;
    if (!ii->isScalar()) {
        accessVrf =
            computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
    }
    bool accessSrf =
        computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());

        computeUnit->insertInPipeMap(w);
        wavesInSch.emplace(w->wfDynId);
        schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
        if (w->isOldestInstWaitcnt()) {
            w->setStatus(Wavefront::S_WAITCNT);
        }
        if (!ii->isScalar()) {
            computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
        }
        computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
    }
    return false;
}
void
ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
{
    // Insert wave w into schList for specified exeType.
    // Wave is inserted in age order, with oldest wave being at the
    // front of the schList
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < w->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
}

void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is a SRF->Global Memory bus available
    if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit->scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue a LDS instruction
    if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}
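reinsertToSchList keeps schList sorted by wfDynId, so a wave that loses arbitration does not jump ahead of older waves. A minimal standalone illustration of that invariant (hypothetical test, same insertion logic over plain ints):

// Hedged illustration: age-ordered insert keeps the oldest wave in front.
#include <cassert>
#include <list>

int main()
{
    std::list<int> ids = {3, 7, 9};     // wfDynIds already in schList
    int w = 5;                          // wave re-inserted after losing
    auto it = ids.begin();
    while (it != ids.end() && *it < w)
        ++it;                           // walk past older (smaller) ids
    ids.insert(it, w);                  // yields {3, 5, 7, 9}
    assert(ids == std::list<int>({3, 5, 7, 9}));
    return 0;
}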
bool
ScheduleStage::dispatchReady(Wavefront *w)
{
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }
    GPUDynInstPtr ii = w->instructionBuffer.front();

    if (ii->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isEndOfKernel()) {
        // EndPgm instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!ii->isScalar() && ii->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (ii->isScalar() && ii->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->scalarMemoryPipe.
                isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
                                 w->scalarWrGmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit->localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness", ii->disassemble());
        return false;
    }
    dispNrdyStalls[SCH_RDY]++;
    return true;
}
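Condensed, the readiness matrix dispatchReady() implements is (informal summary, not code from the patch):

// Informal summary of dispatchReady():
//   NOP/ALU/Branch/Barrier : scalar needs SALU ready; vector needs VALU ready
//   EndPgm                 : scalar form needs SALU ready
//   Vector global mem      : GM issue + GM bus + coalescer + req budget
//   Scalar global mem      : scalar issue + SRF bus + scalar GM FIFO space
//   Vector local mem (LDS) : LM issue + LM bus + LM FIFO space
//   Flat                   : union of the global and local requirements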
void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit->numExeUnits(); j++) {
        assert(dispatchList->at(j).second == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first->
                        instructionBuffer.front();
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            schListToDispListStalls[j]++;
        } else {
            schListToDispList[j]++;
        }
    }
}
void
|
||||
ScheduleStage::arbitrateVrfToLdsBus()
|
||||
{
|
||||
// Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
|
||||
// Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
|
||||
// and a VRF->LDS bus. In GFx9, this is not the case.
|
||||
|
||||
// iterate the GM pipelines
|
||||
for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
|
||||
// get the GM pipe index in the dispatchList
|
||||
int gm_exe_unit = computeUnit->firstMemUnit() + i;
|
||||
// get the wave in the dispatchList
|
||||
Wavefront *w = dispatchList->at(gm_exe_unit).first;
|
||||
// If the WF is valid, ready to execute, and the instruction
|
||||
// is a flat access, arbitrate with the WF's assigned LM pipe
|
||||
if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
|
||||
w->instructionBuffer.front()->isFlat()) {
|
||||
// If the associated LM pipe also has a wave selected, block
|
||||
// that wave and let the Flat instruction issue. The WF in the
|
||||
// LM pipe is added back to the schList for consideration next
|
||||
// cycle.
|
||||
if (dispatchList->at(w->localMem).second == EXREADY) {
|
||||
reinsertToSchList(w->localMem,
|
||||
dispatchList->at(w->localMem).first);
|
||||
// Increment stall stats for LDS-VRF arbitration
|
||||
ldsBusArbStalls++;
|
||||
dispatchList->at(w->localMem).first->schLdsArbStalls++;
|
||||
}
|
||||
// With arbitration of LM pipe complete, transition the
|
||||
// LM pipe to SKIP state in the dispatchList to inform EX stage
|
||||
// that a Flat instruction is executing next cycle
|
||||
doDispatchListTransition(w->localMem, SKIP, w);
|
||||
DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
|
||||
"EXREADY->SKIP\n", w->localMem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::checkRfOperandReadComplete()
|
||||
{
|
||||
// Iterate the schList queues and check if operand reads
|
||||
// have completed in the RFs. If so, mark the wave as ready for
|
||||
// selection for dispatchList
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
for (auto &p : schList.at(j)) {
|
||||
Wavefront *w = p.first;
|
||||
assert(w);
|
||||
|
||||
// Increment the number of cycles the wave spends in the
|
||||
// SCH stage, since this loop visits every wave in SCH.
|
||||
w->schCycles++;
|
||||
|
||||
GPUDynInstPtr ii = w->instructionBuffer.front();
|
||||
bool vrfRdy = true;
|
||||
if (!ii->isScalar()) {
|
||||
vrfRdy =
|
||||
computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
|
||||
}
|
||||
bool srfRdy =
|
||||
computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
|
||||
bool operandsReady = vrfRdy && srfRdy;
|
||||
if (operandsReady) {
|
||||
DPRINTF(GPUSched,
|
||||
"schList[%d]: WV[%d] operands ready for: %d: %s\n",
|
||||
j, w->wfDynId, ii->seqNum(), ii->disassemble());
|
||||
DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
|
||||
j, w->wfDynId);
|
||||
p.second = RFREADY;
|
||||
} else {
|
||||
DPRINTF(GPUSched,
|
||||
"schList[%d]: WV[%d] operands not ready for: %d: %s\n",
|
||||
j, w->wfDynId, ii->seqNum(), ii->disassemble());
|
||||
|
||||
// operands not ready yet, increment SCH stage stats
|
||||
// aggregate to all wavefronts on the CU
|
||||
p.second = RFBUSY;
|
||||
|
||||
// Increment stall stats
|
||||
w->schStalls++;
|
||||
w->schOpdNrdyStalls++;
|
||||
|
||||
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
|
||||
if (!vrfRdy) {
|
||||
opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
|
||||
}
|
||||
if (!srfRdy) {
|
||||
opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -116,33 +669,177 @@ ScheduleStage::arbitrate()
|
||||
}
|
||||
|
||||
void
|
||||
ScheduleStage::exec()
|
||||
ScheduleStage::reserveResources()
|
||||
{
|
||||
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
|
||||
uint32_t readyListSize = computeUnit->readyList[j].size();
|
||||
std::vector<bool> exeUnitReservations;
|
||||
exeUnitReservations.resize(computeUnit->numExeUnits(), false);
|
||||
|
||||
// If no wave is ready to be scheduled on the execution resource
|
||||
// then skip scheduling for this execution resource
|
||||
if (!readyListSize) {
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
|
||||
Wavefront *dispatchedWave = dispatchList->at(j).first;
|
||||
if (dispatchedWave) {
|
||||
DISPATCH_STATUS s = dispatchList->at(j).second;
|
||||
if (s == EMPTY) {
|
||||
continue;
|
||||
} else if (s == EXREADY) {
|
||||
// Wave is ready for execution
|
||||
std::vector<int> execUnitIds =
|
||||
dispatchedWave->reserveResources();
|
||||
GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
|
||||
|
||||
Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
|
||||
dispatchList->at(j).first = waveToBeDispatched;
|
||||
waveToBeDispatched->updateResources();
|
||||
dispatchList->at(j).second = FILLED;
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit->vrf[dispatchedWave->simdId]->
|
||||
dispatchInstruction(ii);
|
||||
}
|
||||
computeUnit->srf[dispatchedWave->simdId]->
|
||||
dispatchInstruction(ii);
|
||||
|
||||
waveStatusList[waveToBeDispatched->simdId]->at(
|
||||
waveToBeDispatched->wfSlotId).second = BLOCKED;
|
||||
std::stringstream ss;
|
||||
for (auto id : execUnitIds) {
|
||||
ss << id << " ";
|
||||
}
|
||||
DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
|
||||
" Reserving ExeRes[ %s]\n",
|
||||
j, dispatchedWave->simdId, dispatchedWave->wfDynId,
|
||||
ii->seqNum(), ii->disassemble(), ss.str());
|
||||
// mark the resources as reserved for this cycle
|
||||
for (auto execUnitId : execUnitIds) {
|
||||
panic_if(exeUnitReservations.at(execUnitId),
|
||||
"Execution unit %d is reserved!!!\n"
|
||||
"SIMD[%d] WV[%d]: %d: %s",
|
||||
execUnitId, dispatchedWave->simdId,
|
||||
dispatchedWave->wfDynId,
|
||||
ii->seqNum(), ii->disassemble());
|
||||
exeUnitReservations.at(execUnitId) = true;
|
||||
}
|
||||
|
||||
assert(computeUnit->readyList[j].size() == readyListSize - 1);
|
||||
// If wavefront::reserveResources reserved multiple resources,
|
||||
// then we're executing a flat memory instruction. This means
|
||||
// that we've reserved a global and local memory unit. Thus,
|
||||
// we need to mark the latter execution unit as not available.
|
||||
if (execUnitIds.size() > 1) {
|
||||
int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
|
||||
assert(dispatchList->at(lm_exec_unit).second == SKIP);
|
||||
}
|
||||
} else if (s == SKIP) {
|
||||
// Shared Memory pipe reserved for FLAT instruction.
|
||||
// Verify the GM pipe for this wave is ready to execute
|
||||
// and the wave in the GM pipe is the same as the wave
|
||||
// in the LM pipe
|
||||
int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
|
||||
assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
|
||||
dispatchedWave->wfDynId);
|
||||
assert(dispatchList->at(gm_exec_unit).second == EXREADY);
|
||||
}
|
||||
}
|
||||
}
|
||||
// arbitrate over all shared resources among instructions being issued
|
||||
// simultaneously
|
||||
arbitrate();
|
||||
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
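    // Remove this wave's dynamic id from the tracking set so its next
    // instruction may enter SCH; only one in-flight instruction per wave
    // is allowed in the schedule stage (see wavesInSch in the header).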
    wavesInSch.erase(w->wfDynId);
}

void
ScheduleStage::regStats()
{
    rdyListNotEmpty
        .init(computeUnit->numExeUnits())
        .name(name() + ".rdy_list_not_empty")
        .desc("number of cycles one or more wave on ready list per "
              "execution resource")
        ;

    rdyListEmpty
        .init(computeUnit->numExeUnits())
        .name(name() + ".rdy_list_empty")
        .desc("number of cycles no wave on ready list per "
              "execution resource")
        ;

    addToSchListStalls
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_add_stalls")
        .desc("number of cycles a wave is not added to schList per "
              "execution resource when ready list is not empty")
        ;

    schListToDispList
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_to_disp_list")
        .desc("number of cycles a wave is added to dispatchList per "
              "execution resource")
        ;

    schListToDispListStalls
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_to_disp_list_stalls")
        .desc("number of cycles no wave is added to dispatchList per "
              "execution resource")
        ;

    // Operand Readiness Stall Cycles
    opdNrdyStalls
        .init(SCH_RF_OPD_NRDY_CONDITIONS)
        .name(name() + ".opd_nrdy_stalls")
        .desc("number of stalls in SCH due to operands not ready")
        ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    // dispatchReady Stall Cycles
    dispNrdyStalls
        .init(SCH_NRDY_CONDITIONS)
        .name(name() + ".disp_nrdy_stalls")
        .desc("number of stalls in SCH due to resource not ready")
        ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    // RF Access Stall Cycles
    rfAccessStalls
        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
        .name(name() + ".rf_access_stalls")
        .desc("number of stalls due to RF access denied")
        ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));

    // Stall cycles due to wave losing LDS bus arbitration
    ldsBusArbStalls
        .name(name() + ".lds_bus_arb_stalls")
        .desc("number of stalls due to VRF->LDS bus conflicts")
        ;
}

@@ -34,6 +34,9 @@
#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__

#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

@@ -54,40 +57,169 @@ struct ComputeUnitParams;
class ScheduleStage
{
  public:
    ScheduleStage(const ComputeUnitParams *params);
    ScheduleStage(const ComputeUnitParams *params, ComputeUnit *cu);
    ~ScheduleStage();
    void init(ComputeUnit *cu);
    void exec();
    void arbitrate();

    // Stats related variables and methods
    std::string name() { return _name; }
    enum SchNonRdyType {
        SCH_SCALAR_ALU_NRDY,
        SCH_VECTOR_ALU_NRDY,
        SCH_VECTOR_MEM_ISSUE_NRDY,
        SCH_VECTOR_MEM_BUS_BUSY_NRDY,
        SCH_VECTOR_MEM_COALESCER_NRDY,
        SCH_VECTOR_MEM_REQS_NRDY,
        SCH_CEDE_SIMD_NRDY,
        SCH_SCALAR_MEM_ISSUE_NRDY,
        SCH_SCALAR_MEM_BUS_BUSY_NRDY,
        SCH_SCALAR_MEM_FIFO_NRDY,
        SCH_LOCAL_MEM_ISSUE_NRDY,
        SCH_LOCAL_MEM_BUS_BUSY_NRDY,
        SCH_LOCAL_MEM_FIFO_NRDY,
        SCH_FLAT_MEM_ISSUE_NRDY,
        SCH_FLAT_MEM_BUS_BUSY_NRDY,
        SCH_FLAT_MEM_COALESCER_NRDY,
        SCH_FLAT_MEM_REQS_NRDY,
        SCH_FLAT_MEM_FIFO_NRDY,
        SCH_RDY,
        SCH_NRDY_CONDITIONS
    };
    enum schopdnonrdytype_e {
        SCH_VRF_OPD_NRDY,
        SCH_SRF_OPD_NRDY,
        SCH_RF_OPD_NRDY,
        SCH_RF_OPD_NRDY_CONDITIONS
    };
    enum schrfaccessnonrdytype_e {
        SCH_VRF_RD_ACCESS_NRDY,
        SCH_VRF_WR_ACCESS_NRDY,
        SCH_SRF_RD_ACCESS_NRDY,
        SCH_SRF_WR_ACCESS_NRDY,
        SCH_RF_ACCESS_NRDY,
        SCH_RF_ACCESS_NRDY_CONDITIONS
    };

    void regStats();

    // Called by ExecStage to inform SCH of instruction execution
    void deleteFromSch(Wavefront *w);

    // Schedule List status
    enum SCH_STATUS
    {
        RFBUSY = 0, // RF busy reading operands
        RFREADY, // ready for exec
    };

  private:
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;
    uint32_t numMemUnits;

    // Each execution resource will have its own
    // scheduler and a dispatch list
    std::vector<Scheduler> scheduler;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
        waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
    // dispatch list is non-empty and
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // each execution resource.
    // Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;

    // Stats

    // Number of cycles with empty (or not empty) readyList, per execution
    // resource, when the CU is active (not sleeping)
    Stats::Vector rdyListEmpty;
    Stats::Vector rdyListNotEmpty;

    // Number of cycles, per execution resource, when at least one wave
    // was on the readyList and picked by scheduler, but was unable to be
    // added to the schList, when the CU is active (not sleeping)
    Stats::Vector addToSchListStalls;

    // Number of cycles, per execution resource, when a wave is selected
    // as candidate for dispatchList from schList
    // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
    Stats::Vector schListToDispList;

    // Per execution resource stat, incremented once per cycle if no wave
    // was selected as candidate for dispatch and moved to dispatchList
    Stats::Vector schListToDispListStalls;

    // Number of times a wave is selected by the scheduler but cannot
    // be added to the schList due to register files not being able to
    // support reads or writes of operands. RF_ACCESS_NRDY condition is always
    // incremented if at least one read/write not supported, other
    // conditions are incremented independently from each other.
    Stats::Vector rfAccessStalls;

    // Number of times a wave is executing FLAT instruction and
    // forces another wave occupying its required local memory resource
    // to be deselected for execution, and placed back on schList
    Stats::Scalar ldsBusArbStalls;

    // Count of times VRF and/or SRF blocks waves on schList from
    // performing RFBUSY->RFREADY transition
    Stats::Vector opdNrdyStalls;

    // Count of times resource required for dispatch is not ready and
    // blocks wave in RFREADY state on schList from potentially moving
    // to dispatchList
    Stats::Vector dispNrdyStalls;

    std::string _name;

    // called by exec() to add a wave to schList if the RFs can support it
    bool addToSchList(int exeType, Wavefront *w);
    // re-insert a wave to schList if wave lost arbitration
    // wave is inserted such that age order (oldest to youngest) is preserved
    void reinsertToSchList(int exeType, Wavefront *w);
    // check waves in schList to see if RF reads complete
    void checkRfOperandReadComplete();
    // check execution resources for readiness
    bool vectorAluRdy;
    bool scalarAluRdy;
    bool scalarMemBusRdy;
    bool scalarMemIssueRdy;
    bool glbMemBusRdy;
    bool glbMemIssueRdy;
    bool locMemBusRdy;
    bool locMemIssueRdy;
    // check status of memory pipes and RF to Mem buses
    void checkMemResources();
    // resource ready check called by fillDispatchList
    bool dispatchReady(Wavefront *w);
    // pick waves from schList and populate dispatchList with one wave
    // per EXE resource type
    void fillDispatchList();
    // arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList
    void arbitrateVrfToLdsBus();
    // schedule destination operand writes to register files for waves in
    // dispatchList
    void scheduleRfDestOperands();
    // invoked by scheduleRfDestOperands to schedule RF writes for a wave
    bool schedRfWrites(int exeType, Wavefront *w);
    // reserve resources for waves surviving arbitration in dispatchList
    void reserveResources();

    void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                  Wavefront *w = nullptr);

    // Set tracking wfDynId for each wave present in schedule stage
    // Used to allow only one instruction per wave in schedule
    std::unordered_set<uint64_t> wavesInSch;

    // List of waves (one list per exe resource) that are in schedule
    // stage. Waves are added to this list after selected by scheduler
    // from readyList. Waves are removed from this list and placed on
    // dispatchList when status reaches SCHREADY.
    // Waves are kept ordered by age for each resource, always favoring
    // forward progress for the oldest wave.
    // The maximum number of waves per resource can be determined by either
    // the VRF/SRF availability or limits imposed by parameters (to be added)
    // of the SCH stage or CU.
    std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
};

#endif // __SCHEDULE_STAGE_HH__

@@ -33,29 +33,23 @@

#include "gpu-compute/scoreboard_check_stage.hh"

#include "debug/GPUExec.hh"
#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ComputeUnit.hh"

ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
    : numSIMDs(p->num_SIMDs),
      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
      numShrMemPipes(p->num_shared_mem_pipes),
      vectorAluInstAvail(nullptr),
      lastGlbMemSimd(-1),
      lastShrMemSimd(-1), glbMemInstAvail(nullptr),
      shrMemInstAvail(nullptr)
{
}

ScoreboardCheckStage::~ScoreboardCheckStage()
{
    readyList.clear();
    waveStatusList.clear();
    shrMemInstAvail = nullptr;
    glbMemInstAvail = nullptr;
}

void
@@ -64,102 +58,212 @@ ScoreboardCheckStage::init(ComputeUnit *cu)
    computeUnit = cu;
    _name = computeUnit->name() + ".ScoreboardCheckStage";

    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
        readyList.push_back(&computeUnit->readyList[unitId]);
    }

    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
        waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
    }

    vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
    glbMemInstAvail = &computeUnit->glbMemInstAvail;
    shrMemInstAvail = &computeUnit->shrMemInstAvail;
}

void
ScoreboardCheckStage::initStatistics()
ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
{
    lastGlbMemSimd = -1;
    lastShrMemSimd = -1;
    *glbMemInstAvail = 0;
    *shrMemInstAvail = 0;

    for (int unitId = 0; unitId < numSIMDs; ++unitId)
        vectorAluInstAvail->at(unitId) = false;
    panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
             "Instruction ready status %d is illegal!!!", rdyStatus);
    stallCycles[rdyStatus]++;
}

void
ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
// It also returns the reason (in rdyStatus) if the instruction is not
// ready. Finally, it sets the execution resource type (in exeResType)
// of the instruction, but only if the instruction is ready.
bool
ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
                            int *exeResType, int wfSlot)
{
    if (curWave->instructionBuffer.empty())
        return;
    /**
     * The waitCnt checks have to be done BEFORE checking for Instruction
     * buffer empty condition. Otherwise, it will result into a deadlock if
     * the last instruction in the Instruction buffer is a waitCnt: after
     * executing the waitCnt, the Instruction buffer would be empty and the
     * ready check logic will exit BEFORE checking for wait counters being
     * satisfied.
     */

    // track which vector SIMD unit has at least one WV with a vector
    // ALU as the oldest instruction in its Instruction buffer
    vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
                                     curWave->isOldestInstALU();

    // track how many vector SIMD units have at least one WV with a
    // vector Global memory instruction as the oldest instruction
    // in its Instruction buffer
    if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
         curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
        *glbMemInstAvail <= 1) {
        (*glbMemInstAvail)++;
        lastGlbMemSimd = unitId;
    // waitCnt instruction has been dispatched or executed: next
    // instruction should be blocked until waitCnts are satisfied.
    if (w->getStatus() == Wavefront::S_WAITCNT) {
        if (!w->waitCntsSatisfied()) {
            *rdyStatus = NRDY_WAIT_CNT;
            return false;
        }
    }

    // track how many vector SIMD units have at least one WV with a
    // vector shared memory (LDS) instruction as the oldest instruction
    // in its Instruction buffer
    // TODO: parametrize the limit of the LDS units
    if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
        lastShrMemSimd != unitId) {
        (*shrMemInstAvail)++;
        lastShrMemSimd = unitId;
    // Is the wave waiting at a barrier? Check this condition BEFORE checking
    // for instruction buffer occupancy to avoid a deadlock when the barrier
    // is the last instruction in the instruction buffer.
    if (w->stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(w->barrierId, w->barrierCnt,
                computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
            // Are all threads at barrier?
            *rdyStatus = NRDY_BARRIER_WAIT;
            return false;
        }
        w->oldBarrierCnt = w->barrierCnt;
        w->stalledAtBarrier = false;
    }

    // Check WF status: it has to be running
    if (w->getStatus() == Wavefront::S_STOPPED ||
        w->getStatus() == Wavefront::S_RETURNING ||
        w->getStatus() == Wavefront::S_STALLED) {
        *rdyStatus = NRDY_WF_STOP;
        return false;
    }

    // is the Instruction buffer empty
    if (w->instructionBuffer.empty()) {
        *rdyStatus = NRDY_IB_EMPTY;
        return false;
    }

    // Check next instruction from instruction buffer
    GPUDynInstPtr ii = w->nextInstr();
    // The only instruction in the instruction buffer has already been
    // dispatched; no need to check it again for readiness
    if (!ii) {
        *rdyStatus = NRDY_IB_EMPTY;
        return false;
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
          ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
          ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
            computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());

    // Non-scalar (i.e., vector) instructions may use VGPRs
    if (!ii->isScalar()) {
        if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
            *rdyStatus = NRDY_VGPR_NRDY;
            return false;
        }
    }
    // Scalar and non-scalar instructions may use SGPR
    if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
        *rdyStatus = NRDY_SGPR_NRDY;
        return false;
    }

    // The hardware implicitly executes S_WAITCNT 0 before executing
    // the S_ENDPGM instruction. Implementing this implicit S_WAITCNT.
    // isEndOfKernel() is used to identify the S_ENDPGM instruction.
    // On identifying it, we do the following:
    // 1. Wait for all older instructions to execute
    // 2. Once all the older instructions are executed, we add a wait
    //    count for the executed instruction(s) to complete.
    if (ii->isEndOfKernel()) {
        // Waiting for older instructions to execute
        if (w->instructionBuffer.front()->seqNum() != ii->seqNum()) {
            *rdyStatus = NRDY_WAIT_CNT;
            return false;
        }
        // Older instructions have executed, adding implicit wait count
        w->setStatus(Wavefront::S_WAITCNT);
        w->setWaitCnts(0, 0, 0);
        if (!w->waitCntsSatisfied()) {
            *rdyStatus = NRDY_WAIT_CNT;
            return false;
        }
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            w->simdId, w->wfSlotId, ii->disassemble());
    *exeResType = mapWaveToExeUnit(w);
    *rdyStatus = INST_RDY;
    return true;
}

int
ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
{
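    // Map the wave's next instruction to the index of the execution
    // resource (vector/scalar ALU, GM, LM, or scalar memory pipe) that
    // the ready and dispatch lists use for it.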
    GPUDynInstPtr ii = w->nextInstr();
    assert(ii);
    if (ii->isFlat()) {
        /**
         * NOTE: Flat memory ops require both GM and LM resources.
         * The simulator models consumption of both GM and LM
         * resources in the schedule stage. At instruction execution time,
         * after the aperture check is performed, only the GM or LM pipe
         * is actually reserved by the timing model. The GM unit is returned
         * here since Flat ops occupy the GM slot in the ready and dispatch
         * lists. They also consume the LM slot in the dispatch list.
         */
        return w->globalMem;
    } else if (ii->isLocalMem()) {
        return w->localMem;
    } else if (ii->isGlobalMem()) {
        if (!ii->isScalar()) {
            return w->globalMem;
        } else {
            return w->scalarMem;
        }
    } else if (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg() ||
               ii->isReturn() ||
               ii->isEndOfKernel() ||
               ii->isNop() ||
               ii->isBarrier()) {
        if (!ii->isScalar()) {
            return w->simdId;
        } else {
            return w->scalarAluGlobalIdx;
        }
    }
    panic("%s: unmapped to an execution resource", ii->disassemble());
    return computeUnit->numExeUnits();
}

void
ScoreboardCheckStage::exec()
{
    initStatistics();

    // reset the ready list for all execution units; it will be
    // constructed every cycle since resource availability may change
    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
        // Reset wavefront pointers to nullptr so clear() on the vector
        // does not accidentally destruct the wavefront object
        for (int i = 0; i < readyList[unitId]->size(); i++) {
            readyList[unitId]->at(i) = nullptr;
        }
        readyList[unitId]->clear();
    }

    // iterate over the Wavefronts of all SIMD units
    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
        for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
    // iterate over all WF slots across all vector ALUs
    for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
        for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
            // reset the ready status of each wavefront
            waveStatusList[unitId]->at(wvId).second = BLOCKED;
            Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
            collectStatistics(curWave, unitId);

            if (curWave->ready(Wavefront::I_ALU)) {
                readyList[unitId]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_GLOBAL)) {
                if (computeUnit->cedeSIMD(unitId, wvId)) {
                    continue;
                }

                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_SHARED)) {
                readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_FLAT)) {
                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            } else if (curWave->ready(Wavefront::I_PRIVATE)) {
                readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
                waveStatusList[unitId]->at(wvId).second = READY;
            Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
            nonrdytype_e rdyStatus = NRDY_ILLEGAL;
            int exeResType = -1;
            // check WF readiness: If the WF's oldest
            // instruction is ready to issue then add the WF to the ready list
            if (ready(curWave, &rdyStatus, &exeResType, wfSlot)) {
                assert(curWave->simdId == simdId);
                DPRINTF(GPUSched,
                        "Adding to readyList[%d]: SIMD[%d] WV[%d]: %d: %s\n",
                        exeResType,
                        curWave->simdId, curWave->wfDynId,
                        curWave->nextInstr()->seqNum(),
                        curWave->nextInstr()->disassemble());
                readyList.at(exeResType)->push_back(curWave);
            }
            collectStatistics(rdyStatus);
        }
    }
}
@@ -167,4 +271,16 @@ ScoreboardCheckStage::exec()
void
ScoreboardCheckStage::regStats()
{
    stallCycles
        .init(NRDY_CONDITIONS)
        .name(name() + ".stall_cycles")
        .desc("number of cycles wave stalled in SCB")
        ;
    stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
    stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
    stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
    stallCycles.subname(NRDY_BARRIER_WAIT, csprintf("BarrierWait"));
    stallCycles.subname(NRDY_VGPR_NRDY, csprintf("VgprBusy"));
    stallCycles.subname(NRDY_SGPR_NRDY, csprintf("SgprBusy"));
    stallCycles.subname(INST_RDY, csprintf("InstrReady"));
}

@@ -36,20 +36,17 @@

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "sim/stats.hh"

class ComputeUnit;
class Wavefront;

struct ComputeUnitParams;

enum WAVE_STATUS
{
    BLOCKED = 0,
    READY
};

/*
 * Scoreboard check stage.
 * All wavefronts are analyzed to see if they are ready
@@ -61,6 +58,18 @@ enum WAVE_STATUS
class ScoreboardCheckStage
{
  public:
    enum nonrdytype_e {
        NRDY_ILLEGAL,
        NRDY_WF_STOP,
        NRDY_IB_EMPTY,
        NRDY_WAIT_CNT,
        NRDY_BARRIER_WAIT,
        NRDY_VGPR_NRDY,
        NRDY_SGPR_NRDY,
        INST_RDY,
        NRDY_CONDITIONS
    };

    ScoreboardCheckStage(const ComputeUnitParams* params);
    ~ScoreboardCheckStage();
    void init(ComputeUnit *cu);
@@ -71,31 +80,18 @@ class ScoreboardCheckStage
    void regStats();

  private:
    void collectStatistics(Wavefront *curWave, int unitId);
    void initStatistics();
    void collectStatistics(nonrdytype_e rdyStatus);
    int mapWaveToExeUnit(Wavefront *w);
    bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
               int *exeResType, int wfSlot);
    ComputeUnit *computeUnit;
    uint32_t numSIMDs;
    uint32_t numMemUnits;
    uint32_t numShrMemPipes;

    // flag per vector SIMD unit that is set when there is at least one
    // WF that has a vector ALU instruction as the oldest in its
    // Instruction Buffer
    std::vector<bool> *vectorAluInstAvail;
    int lastGlbMemSimd;
    int lastShrMemSimd;

    int *glbMemInstAvail;
    int *shrMemInstAvail;
    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list
    std::vector<std::vector<Wavefront*>*> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
        waveStatusList;
    // Stats
    Stats::Vector stallCycles;

    std::string _name;
};

@@ -39,37 +39,63 @@
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params *p)
    : ClockedObject(p), clock(p->clk_domain->clockPeriod()),
      cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
      tickEvent([this]{ processTick(); }, "Shader tick",
                false, Event::CPU_Tick_Pri),
      timingSim(p->timing), hsail_mode(SIMT),
      impl_kern_boundary_sync(p->impl_kern_boundary_sync),
      separate_acquire_release(p->separate_acquire_release), coissue_return(1),
      trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
      globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
      box_tick_cnt(0), start_tick_cnt(0)
Shader::Shader(const Params *p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p->cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
              false, Event::CPU_Tick_Pri),
    timingSim(p->timing), hsail_mode(SIMT),
    impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc),
    _dispatcher(*p->dispatcher),
    max_valu_insts(p->max_valu_insts), total_valu_insts(0)
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);
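
    // (Assumption: these constants mirror the ROCm/HSA aperture layout,
    // placing the GPUVM, LDS, and scratch apertures high in the canonical
    // address space; the exact base/limit values are device-specific.)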
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ((Addr)1 << 61) + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;

    cuList.resize(n_cu);

    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p->idlecu_timeout;
    }
}

GPUDispatcher&
Shader::dispatcher()
{
    return _dispatcher;
}

Addr
Shader::mmap(int length)
{
@@ -83,11 +109,11 @@ Shader::mmap(int length)
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

@@ -96,7 +122,7 @@ Shader::mmap(int length)
            mem_state->getMmapEnd());
    }

    DPRINTF(HSAIL, "Shader::mmap start= %#x, %#x\n", start, length);
    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

@@ -146,15 +172,15 @@ ShaderParams::create()
}

void
Shader::exec()
Shader::execScheduledAdds()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;
    assert(!sa_when.empty());
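
    // (Scheduled adds are deferred updates to wavefront counters: each
    // entry adds sa_x[i] to *sa_val[i] once curTick() reaches sa_when[i];
    // entries are created by Shader::ScheduleAdd below.)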

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
        if (sa_when[i] <= curTick()) {
            *sa_val[i] += sa_x[i];
            panic_if(*sa_val[i] < 0, "Negative counter value\n");
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
@@ -162,14 +188,62 @@ Shader::exec()
            --i;
        }
    }
    if (!sa_when.empty()) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
}

    // clock all of the cu's
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
/*
 * dispatcher/shader arranges invalidate requests to the CUs
 */
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
    // if invalidate has already started/finished, then do nothing
    if (task->isInvStarted()) return;

    // invalidate has never started; it can only perform once at kernel launch
    assert(task->outstandingInvs() == -1);
    int kernId = task->dispatchId();
    // counter value is 0 now, indicating the inv is about to start
    _dispatcher.updateInvCounter(kernId, +1);

    // iterate all cus managed by the shader, to perform invalidate.
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // create a request to hold INV info; the request's fields will
        // be updated in cu before use
        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->masterId(),
                                             0, -1);

        _dispatcher.updateInvCounter(kernId, +1);
        // all necessary INV flags are all set now, call cu to execute
        cuList[i_cu]->doInvalidate(req, task->dispatchId());
    }
}

/**
 * dispatcher/shader arranges flush requests to the CUs
 */
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst) {
    int kernId = gpuDynInst->kern_id;
    // flush has never been started, performed only once at kernel end
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);

    // iterate all cus, managed by the shader, to perform flush.
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        _dispatcher.updateWbCounter(kernId, +1);
        cuList[i_cu]->doFlush(gpuDynInst);
    }
}

bool
Shader::dispatch_workgroups(NDRange *ndr)
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
    bool scheduledSomething = false;
    int cuCount = 0;
@@ -182,32 +256,24 @@ Shader::dispatch_workgroups(NDRange *ndr)
        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cu cuList[i] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
        if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
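            // (Active-CU bookkeeping: if this CU had no event scheduled it
            // was idle, so record the tick at which the shader left the
            // all-idle state; shaderActiveTicks is closed out in
            // notifyCuSleep below.)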
            if (!cuList[curCu]->tickEvent.scheduled()) {
                if (!_activeCus)
                    _lastInactiveTick = curTick();
                _activeCus++;
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;
            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task);

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
            task->markWgDispatch();
        }

        ++cuCount;
@@ -218,9 +284,83 @@ Shader::dispatch_workgroups(NDRange *ndr)
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
Shader::regStats()
{
    dispatcher = _dispatcher;
    ClockedObject::regStats();

    shaderActiveTicks
        .name(name() + ".shader_active_ticks")
        .desc("Total ticks that any CU attached to this shader is active")
        ;
    allLatencyDist
        .init(0, 1600000, 10000)
        .name(name() + ".allLatencyDist")
        .desc("delay distribution for all")
        .flags(Stats::pdf | Stats::oneline);

    loadLatencyDist
        .init(0, 1600000, 10000)
        .name(name() + ".loadLatencyDist")
        .desc("delay distribution for loads")
        .flags(Stats::pdf | Stats::oneline);

    storeLatencyDist
        .init(0, 1600000, 10000)
        .name(name() + ".storeLatencyDist")
        .desc("delay distribution for stores")
        .flags(Stats::pdf | Stats::oneline);

    vectorInstSrcOperand
        .init(4)
        .name(name() + ".vec_inst_src_operand")
        .desc("vector instruction source operand distribution");

    vectorInstDstOperand
        .init(4)
        .name(name() + ".vec_inst_dst_operand")
        .desc("vector instruction destination operand distribution");

    initToCoalesceLatency
        .init(0, 1600000, 10000)
        .name(name() + ".initToCoalesceLatency")
        .desc("Ticks from vmem inst initiateAcc to coalescer issue")
        .flags(Stats::pdf | Stats::oneline);

    rubyNetworkLatency
        .init(0, 1600000, 10000)
        .name(name() + ".rubyNetworkLatency")
        .desc("Ticks from coalescer issue to coalescer hit callback")
        .flags(Stats::pdf | Stats::oneline);

    gmEnqueueLatency
        .init(0, 1600000, 10000)
        .name(name() + ".gmEnqueueLatency")
        .desc("Ticks from coalescer hit callback to GM pipe enqueue")
        .flags(Stats::pdf | Stats::oneline);

    gmToCompleteLatency
        .init(0, 1600000, 10000)
        .name(name() + ".gmToCompleteLatency")
        .desc("Ticks queued in GM pipes ordered response buffer")
        .flags(Stats::pdf | Stats::oneline);

    coalsrLineAddresses
        .init(0, 20, 1)
        .name(name() + ".coalsrLineAddresses")
        .desc("Number of cache lines for coalesced request")
        .flags(Stats::pdf | Stats::oneline);
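
    // (One round-trip distribution per possible cache block index: a vmem
    // instruction can touch at most wfSize distinct blocks, and
    // sampleLineRoundTrip samples the Nth-fastest block into the Nth
    // distribution. This assumes all CUs share the same wavefront size.)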
    int wfSize = cuList[0]->wfSize();
    cacheBlockRoundTrip = new Stats::Distribution[wfSize];
    for (int idx = 0; idx < wfSize; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(Stats::pdf | Stats::oneline);
    }
}

void
@@ -251,7 +391,6 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req2, cmd);
        PacketPtr pkt2 = new Packet(req1, cmd);

@@ -297,34 +436,22 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
    }
}

bool
Shader::busy()
{
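    // The shader is considered busy while any attached CU still has
    // work in flight.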
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
Shader::ScheduleAdd(int *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    when += curTick();
    sa_when.push_back(when);
    sa_x.push_back(x);
    ++sa_n;
}

void
Shader::processTick()
{
    if (busy()) {
        exec();
        schedule(tickEvent, curTick() + ticks(1));
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        assert(tickEvent.scheduled());
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
}

@@ -356,7 +483,8 @@ void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
@@ -385,15 +513,11 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // even when the perLaneTLB flag is turned on
        // it's ok to send all accesses through lane 0
        // since the lane # is not known here,
        // This isn't important since these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }
    // even when the perLaneTLB flag is turned on
    // it's ok to send all accesses through lane 0
    // since the lane # is not known here,
    // This isn't important since these are functional accesses.
    cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
@@ -402,3 +526,82 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
    delete sender_state->tlbEntry;
    delete pkt->senderState;
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleStore(const Tick accessTime)
{
    storeLatencyDist.sample(accessTime);
    allLatencyDist.sample(accessTime);
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleLoad(const Tick accessTime)
{
    loadLatencyDist.sample(accessTime);
    allLatencyDist.sample(accessTime);
}

void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
    // Only sample instructions that go all the way to main memory
    if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
        return;
    }

    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];
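
    // (t1..t5 are successive hops of a vector memory instruction; inferred
    // from the stat descriptions above: initiateAcc, coalescer issue,
    // coalescer hit callback, GM pipe enqueue, and completion.)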

    initToCoalesceLatency.sample(t2-t1);
    rubyNetworkLatency.sample(t3-t2);
    gmEnqueueLatency.sample(t4-t3);
    gmToCompleteLatency.sample(t5-t4);
}

void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
{
    coalsrLineAddresses.sample(lineMap.size());
    std::vector<Tick> netTimes;

    // For each cache block address generated by a vmem inst, calculate
    // the round-trip time for that cache block.
    for (auto& it : lineMap) {
        const std::vector<Tick>& timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // Sort the cache block round trip times so that the first
    // distribution is always measuring the fastest and the last
    // distribution is always measuring the slowest cache block.
    std::sort(netTimes.begin(), netTimes.end());

    // Sample the round trip time for each N cache blocks into the
    // Nth distribution.
    int idx = 0;
    for (auto& time : netTimes) {
        cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
}

void
Shader::notifyCuSleep() {
    // If all CUs attached to this shader are asleep, update shaderActiveTicks
    panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
             "Invalid activeCu size\n");
    _activeCus--;
    if (!_activeCus)
        shaderActiveTicks += curTick() - _lastInactiveTick;
}

@@ -14,9 +14,9 @@
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -30,7 +30,7 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 * Authors: Steve Reinhardt
 */

#ifndef __SHADER_HH__
@@ -47,11 +47,11 @@
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "enums/MemType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
@@ -61,7 +61,8 @@
#include "sim/sim_object.hh"

class BaseTLB;
class GpuDispatcher;
class GPUCommandProcessor;
class GPUDispatcher;

namespace TheISA
{
@@ -70,36 +71,144 @@ namespace TheISA

static const int LDS_SIZE = 65536;

// aperture (APE) registers define the base/limit
// pair for the ATC mapped memory space. currently
// the only APEs we consider are for GPUVM/LDS/scratch.
// the APEs are registered with unique values based
// on a per-device basis
struct ApertureRegister
{
    Addr base;
    Addr limit;
};

// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.

class Shader : public ClockedObject
{
  protected:
    // Shader's clock period in terms of number of ticks of curTime,
    // aka global simulation clock
    Tick clock;
  private:
    ApertureRegister _gpuVmApe;
    ApertureRegister _ldsApe;
    ApertureRegister _scratchApe;
    Addr shHiddenPrivateBaseVmid;

    // Number of active Cus attached to this shader
    int _activeCus;

    // Last tick that all CUs attached to this shader were inactive
    Tick _lastInactiveTick;

    // some stats for measuring latency
    Stats::Distribution allLatencyDist;
    Stats::Distribution loadLatencyDist;
    Stats::Distribution storeLatencyDist;

    // average ticks from vmem inst initiateAcc to coalescer issue,
    // average ticks from coalescer issue to coalescer hit callback,
    // average ticks from coalescer hit callback to GM pipe enqueue,
    // and average ticks spent in GM pipe's ordered resp buffer.
    Stats::Distribution initToCoalesceLatency;
    Stats::Distribution rubyNetworkLatency;
    Stats::Distribution gmEnqueueLatency;
    Stats::Distribution gmToCompleteLatency;

    // average number of cache blocks requested by vmem inst, and
    // average ticks for cache blocks to main memory for the Nth
    // cache block generated by a vmem inst.
    Stats::Distribution coalsrLineAddresses;
    Stats::Distribution *cacheBlockRoundTrip;

  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT, VECTOR_SCALAR};

    // clock related functions; maps to-and-from
    // simulation ticks and shader clocks.
    Tick frequency() const { return SimClock::Frequency / clock; }

    Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }

    Tick getClock() const { return clock; }
    Tick curCycle() const { return curTick() / clock; }
    Tick tickToCycles(Tick val) const { return val / clock; }

    GPUDispatcher &dispatcher();
    void sampleLoad(const Tick accessTime);
    void sampleStore(const Tick accessTime);
    void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
    void sampleLineRoundTrip(const std::map<Addr,
        std::vector<Tick>> &roundTripTime);

    SimpleThread *cpuThread;
    ThreadContext *gpuTc;
    BaseCPU *cpuPointer;

    void processTick();
    const ApertureRegister&
    gpuVmApe() const
    {
        return _gpuVmApe;
    }

    const ApertureRegister&
    ldsApe() const
    {
        return _ldsApe;
    }

    const ApertureRegister&
    scratchApe() const
    {
        return _scratchApe;
    }

    bool
    isGpuVmApe(Addr addr) const
    {
        bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;

        return is_gpu_vm;
    }

    bool
    isLdsApe(Addr addr) const
    {
        bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;

        return is_lds;
    }

    bool
    isScratchApe(Addr addr) const
    {
        bool is_scratch
            = addr >= _scratchApe.base && addr <= _scratchApe.limit;

        return is_scratch;
    }

    Addr
    getScratchBase()
    {
        return _scratchApe.base;
    }

    Addr
    getHiddenPrivateBase()
    {
        return shHiddenPrivateBaseVmid;
    }

    void
    initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
    {
        Addr sh_hidden_base_new = queueBase - offset;

        // We are initializing sh_hidden_private_base_vmid from the
        // amd queue descriptor from the first queue.
        // The sh_hidden_private_base_vmid is supposed to be same for
        // all the queues from the same process
        if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
|
||||
// Do not panic if shHiddenPrivateBaseVmid == 0,
|
||||
// that is if it is uninitialized. Panic only
|
||||
// if the value is initilized and we get
|
||||
// a differnt base later.
|
||||
panic_if(shHiddenPrivateBaseVmid != 0,
|
||||
"Currently we support only single process\n");
|
||||
}
|
||||
shHiddenPrivateBaseVmid = sh_hidden_base_new;
|
||||
}
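
Editor's note: the three APE accessors above are plain inclusive range tests against base/limit pairs. As a minimal sketch of how a caller might use them to classify an address (the MemSpace enum and classifyAddress helper are hypothetical, written only for this note; the member functions are the ones declared above):

// Hedged sketch: route an address using the Shader's APE checks.
// MemSpace and classifyAddress are illustrative, not part of gem5.
enum class MemSpace { GpuVm, Lds, Scratch, Global };

MemSpace
classifyAddress(const Shader &shader, Addr addr)
{
    if (shader.isGpuVmApe(addr))
        return MemSpace::GpuVm;      // ATC-mapped GPUVM aperture
    if (shader.isLdsApe(addr))
        return MemSpace::Lds;        // LDS aperture
    if (shader.isScratchApe(addr))
        return MemSpace::Scratch;    // scratch (private) aperture
    return MemSpace::Global;         // everything else
}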

EventFunctionWrapper tickEvent;

// is this simulation going to be timing mode in the memory?
@@ -108,30 +217,18 @@ class Shader : public ClockedObject

// If set, issue acq packet @ kernel launch
int impl_kern_boundary_sync;
// If set, generate a separate packet for acquire/release on
// ld_acquire/st_release/atomic operations
int separate_acquire_release;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace
int trace_vgpr_all;
// Number of CUs in the shader
int n_cu;
// Number of wavefront slots per cu
// Number of wavefront slots per SIMD per CU
int n_wf;

// The size of global memory
int globalMemSize;

/*
* Bytes/work-item for call instruction
* The number of arguments for an hsail function will
* vary. We simply determine the maximum # of arguments
* required by any hsail function up front before the
* simulation (during parsing of the Brig) and record
* that number here.
*/
int funcargs_size;

// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;

@@ -139,7 +236,7 @@ class Shader : public ClockedObject
uint32_t sa_n;

// Pointer to value to be incremented
std::vector<uint32_t*> sa_val;
std::vector<int*> sa_val;
// When to do the increment
std::vector<uint64_t> sa_when;
// Amount to increment by
@@ -148,24 +245,29 @@ class Shader : public ClockedObject
// List of Compute Units (CU's)
std::vector<ComputeUnit*> cuList;

uint64_t tick_cnt;
uint64_t box_tick_cnt;
uint64_t start_tick_cnt;
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;

GpuDispatcher *dispatcher;
/**
* Statistics
*/
Stats::Scalar shaderActiveTicks;
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
void regStats();

int max_valu_insts;
int total_valu_insts;

Shader(const Params *p);
~Shader();
virtual void init();

// Run shader
void exec();

// Check to see if shader is busy
bool busy();
// Run shader scheduled adds
void execScheduledAdds();

// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(uint32_t *val, Tick when, int x);
void ScheduleAdd(int *val, Tick when, int x);
bool processTimingPacket(PacketPtr pkt);

void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
@@ -190,12 +292,15 @@ class Shader : public ClockedObject
cuList[cu_id] = compute_unit;
}

void handshake(GpuDispatcher *dispatcher);
bool dispatch_workgroups(NDRange *ndr);
void prepareInvalidate(HSAQueueEntry *task);
void prepareFlush(GPUDynInstPtr gpuDynInst);

bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void hostWakeUp(BaseCPU *cpu);
void notifyCuSleep();
};

#endif // __SHADER_HH__
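
Editor's note: the clock helpers near the top of the class convert between global simulation ticks and shader cycles with plain integer arithmetic. Assuming gem5's 1 ps tick resolution and an illustrative 1 GHz shader clock (so clock = 1000 ticks), the conversions work out as below; this is a standalone sketch, not gem5 code:

// Hedged sketch of Shader's frequency()/ticks()/tickToCycles()
// arithmetic. Assumes a 1 ps global tick; the 1 GHz clock is an
// illustrative value, not a gem5 default.
#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t ticksPerSec = 1000000000000ULL; // 1 ps resolution
    const uint64_t clock = 1000;                   // 1 ns period = 1 GHz

    assert(ticksPerSec / clock == 1000000000ULL);  // frequency(): 1 GHz
    assert(clock * 5 == 5000);                     // ticks(5), in ticks
    assert(uint64_t(12345) / clock == 12);         // tickToCycles() truncates
    return 0;
}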

@@ -35,6 +35,12 @@

#include "base/logging.hh"

SimplePoolManager *
SimplePoolManagerParams::create()
{
return new SimplePoolManager(this);
}

// return the min number of elements that the manager can reserve given
// a request for "size" elements
uint32_t
@@ -64,8 +70,6 @@ SimplePoolManager::printRegion()
bool
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
{
assert(numRegions * minAllocatedElements(size) <= poolSize());

return _reservedGroups == 0;
}
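
Editor's note: with the assertion gone, canAllocate() reduces to a single-region admission rule: the pool can be handed out only while no group holds it, i.e. the request is granted exactly when _reservedGroups == 0. A one-line restatement of that invariant (standalone, for illustration only):

// Hedged restatement of SimplePoolManager's admission rule:
// one region per pool, granted only while nothing is reserved.
bool canAdmit(uint32_t reservedGroups) { return reservedGroups == 0; }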


@@ -38,14 +38,15 @@
#include <cstdint>

#include "gpu-compute/pool_manager.hh"
#include "params/SimplePoolManager.hh"

// Simple Pool Manager: allows one region per pool. No region merging is
// supported.
class SimplePoolManager : public PoolManager
{
public:
SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
: PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
SimplePoolManager(const PoolManagerParams *p)
: PoolManager(p), _regionSize(0), _nxtFreeIdx(0),
_reservedGroups(0)
{
}
@@ -62,7 +63,7 @@ class SimplePoolManager : public PoolManager
// be reserved)
uint32_t _regionSize;
// next index to allocate a region
uint8_t _nxtFreeIdx;
int _nxtFreeIdx;
// number of groups that reserve a region
uint32_t _reservedGroups;
};

188
src/gpu-compute/static_register_manager_policy.cc
Normal file
@@ -0,0 +1,188 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/

#include "gpu-compute/static_register_manager_policy.hh"

#include "config/the_gpu_isa.hh"
#include "debug/GPURename.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/pool_manager.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

StaticRegisterManagerPolicy::StaticRegisterManagerPolicy()
{
}

void
StaticRegisterManagerPolicy::exec()
{
}

int
StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex)
{
panic_if((vgprIndex >= w->reservedVectorRegs)
|| (w->reservedVectorRegs < 0),
"VGPR index %d is out of range: VGPR range=[0,%d]",
vgprIndex, w->reservedVectorRegs);

// add the offset from where the VGPRs of the wavefront have been assigned
int physicalVgprIndex = w->startVgprIndex + vgprIndex;

panic_if(!((w->startVgprIndex <= physicalVgprIndex) &&
(w->startVgprIndex + w->reservedVectorRegs - 1)
>= physicalVgprIndex),
"Invalid VGPR index %d\n", physicalVgprIndex);

// calculate physical VGPR index
return physicalVgprIndex % w->computeUnit->vrf[w->simdId]->numRegs();
}
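
Editor's note: to make the wrap-around mapping concrete, a wavefront placed at startVgprIndex = 120 in a 128-register bank maps virtual VGPR 10 to physical register (120 + 10) % 128 = 2. A standalone sketch of the same arithmetic (the constants are made up for illustration):

// Hedged sketch of the modulo mapping in mapVgpr()/mapSgpr() above.
#include <cassert>

int
mapVirtToPhys(int startIdx, int virtIdx, int numRegs)
{
    return (startIdx + virtIdx) % numRegs;   // wraps around the bank
}

int main()
{
    assert(mapVirtToPhys(120, 10, 128) == 2);  // wraps past the end
    assert(mapVirtToPhys(0, 10, 128) == 10);   // no wrap
    return 0;
}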

int
StaticRegisterManagerPolicy::mapSgpr(Wavefront* w, int sgprIndex)
{
panic_if(!((sgprIndex < w->reservedScalarRegs)
&& (w->reservedScalarRegs > 0)),
"SGPR index %d is out of range: SGPR range=[0,%d]\n",
sgprIndex, w->reservedScalarRegs);

// add the offset from where the SGPRs of the wavefront have been assigned
int physicalSgprIndex = w->startSgprIndex + sgprIndex;

panic_if(!((w->startSgprIndex <= physicalSgprIndex) &&
(w->startSgprIndex + w->reservedScalarRegs - 1)
>= physicalSgprIndex),
"Invalid SGPR index %d\n", physicalSgprIndex);

// calculate physical SGPR index
return physicalSgprIndex % w->computeUnit->srf[w->simdId]->numRegs();
}

bool
StaticRegisterManagerPolicy::canAllocateVgprs(int simdId, int nWfs,
int demandPerWf)
{
return cu->registerManager->vrfPoolMgrs[simdId]->
canAllocate(nWfs, demandPerWf);
}

bool
StaticRegisterManagerPolicy::canAllocateSgprs(int simdId, int nWfs,
int demandPerWf)
{
return cu->registerManager->srfPoolMgrs[simdId]->
canAllocate(nWfs, demandPerWf);
}

void
StaticRegisterManagerPolicy::allocateRegisters(Wavefront *w, int vectorDemand,
int scalarDemand)
{
uint32_t allocatedSize = 0;
w->startVgprIndex = cu->registerManager->vrfPoolMgrs[w->simdId]->
allocateRegion(vectorDemand, &allocatedSize);
w->reservedVectorRegs = allocatedSize;
cu->vectorRegsReserved[w->simdId] += w->reservedVectorRegs;
panic_if(cu->vectorRegsReserved[w->simdId] > cu->numVecRegsPerSimd,
"VRF[%d] has been overallocated %d > %d\n",
w->simdId, cu->vectorRegsReserved[w->simdId],
cu->numVecRegsPerSimd);

if (scalarDemand) {
w->startSgprIndex = cu->registerManager->srfPoolMgrs[w->simdId]->
allocateRegion(scalarDemand, &allocatedSize);
w->reservedScalarRegs = allocatedSize;
cu->scalarRegsReserved[w->simdId] += w->reservedScalarRegs;
panic_if(cu->scalarRegsReserved[w->simdId] > cu->numScalarRegsPerSimd,
"SRF[%d] has been overallocated %d > %d\n",
w->simdId, cu->scalarRegsReserved[w->simdId],
cu->numScalarRegsPerSimd);
}
}

void
StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
{
// free the vector registers of the completed wavefront
w->computeUnit->vectorRegsReserved[w->simdId] -= w->reservedVectorRegs;
// free the scalar registers of the completed wavefront
w->computeUnit->scalarRegsReserved[w->simdId] -= w->reservedScalarRegs;

panic_if(w->computeUnit->vectorRegsReserved[w->simdId] < 0,
"Freeing VRF[%d] registers left %d registers reserved\n",
w->simdId,
w->computeUnit->vectorRegsReserved[w->simdId]);
panic_if(w->computeUnit->scalarRegsReserved[w->simdId] < 0,
"Freeing SRF[%d] registers left %d registers reserved\n",
w->simdId,
w->computeUnit->scalarRegsReserved[w->simdId]);

int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
w->computeUnit->vrf[w->simdId]->numRegs();

w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
freeRegion(w->startVgprIndex, endIndex);

// mark/pre-mark all registers as not busy
for (int i = 0; i < w->reservedVectorRegs; i++) {
uint32_t physVgprIdx = mapVgpr(w, i);
w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
}

w->reservedVectorRegs = 0;
w->startVgprIndex = 0;

endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
w->computeUnit->srf[w->simdId]->numRegs();
w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
freeRegion(w->startSgprIndex, endIndex);

// mark/pre-mark all registers as not busy
for (int i = 0; i < w->reservedScalarRegs; i++) {
uint32_t physSgprIdx = mapSgpr(w, i);
w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);
}

w->reservedScalarRegs = 0;
w->startSgprIndex = 0;
}

void
StaticRegisterManagerPolicy::regStats()
{
}
65
src/gpu-compute/static_register_manager_policy.hh
Normal file
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/

#ifndef __STATIC_REGISTER_MANAGER_POLICY_HH__
#define __STATIC_REGISTER_MANAGER_POLICY_HH__

#include "gpu-compute/register_manager_policy.hh"

class HSAQueueEntry;

class StaticRegisterManagerPolicy : public RegisterManagerPolicy
{
public:

StaticRegisterManagerPolicy();

void exec() override;

int mapVgpr(Wavefront* w, int vgprIndex) override;
int mapSgpr(Wavefront* w, int sgprIndex) override;

bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override;
bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override;

void allocateRegisters(Wavefront *w, int vectorDemand,
int scalarDemand) override;

void freeRegisters(Wavefront *w) override;

void regStats() override;
};

#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
@@ -41,7 +41,6 @@

TLBCoalescer::TLBCoalescer(const Params *p)
: ClockedObject(p),
clock(p->clk_domain->clockPeriod()),
TLBProbesPerCycle(p->probesPerCycle),
coalescingWindow(p->coalescingWindow),
disableCoalescing(p->disableCoalescing),
@@ -317,7 +316,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
//coalesced requests to the TLB
if (!coalescer->probeTLBEvent.scheduled()) {
coalescer->schedule(coalescer->probeTLBEvent,
curTick() + coalescer->ticks(1));
curTick() + coalescer->clockPeriod());
}

return true;
@@ -380,7 +379,7 @@ TLBCoalescer::MemSidePort::recvReqRetry()
//we've received a retry. Schedule a probeTLBEvent
if (!coalescer->probeTLBEvent.scheduled())
coalescer->schedule(coalescer->probeTLBEvent,
curTick() + coalescer->ticks(1));
curTick() + coalescer->clockPeriod());
}
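
Editor's note: both retry paths now schedule the probe event one clock period ahead through the ClockedObject interface instead of the removed hand-rolled ticks() helper. The pattern, sketched standalone (the wrapper is illustrative; schedule() and clockPeriod() are the calls used above):

// Hedged sketch of the event-scheduling idiom adopted above.
void
scheduleProbe(TLBCoalescer *coalescer, Event &probeEvent)
{
    if (!probeEvent.scheduled()) {
        // one coalescer clock period from now, in ticks
        coalescer->schedule(probeEvent,
                            curTick() + coalescer->clockPeriod());
    }
}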

void
@@ -448,7 +447,7 @@ TLBCoalescer::processProbeTLBEvent()

// send the coalesced request for virt_page_addr
if (!memSidePort[0]->sendTimingReq(first_packet)) {
DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
virt_page_addr);

// No need for a retries queue since we are already buffering

@@ -65,13 +65,6 @@ class ThreadContext;
*/
class TLBCoalescer : public ClockedObject
{
protected:
// TLB clock: will inherit clock from shader's clock period in terms
// of number of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the
// python config files.
int clock;

public:
typedef TLBCoalescerParams Params;
TLBCoalescer(const Params *p);
@@ -105,7 +98,8 @@ class TLBCoalescer : public ClockedObject
* option is to change it to curTick(), so we coalesce based
* on the receive time.
*/
typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
CoalescingFIFO;

CoalescingFIFO coalescerFIFO;

@@ -143,13 +137,6 @@ class TLBCoalescer : public ClockedObject
void updatePhysAddresses(PacketPtr pkt);
void regStats() override;

// Clock related functions. Maps to-and-from
// Simulation ticks and object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}

class CpuSidePort : public SlavePort
{
public:
@@ -171,7 +158,8 @@ class TLBCoalescer : public ClockedObject
virtual void
recvRespRetry()
{
fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
fatal("recvRespRetry() is not implemented in the TLB "
"coalescer.\n");
}

virtual AddrRangeList getAddrRanges() const;

@@ -36,81 +36,21 @@
#include <string>

#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/wavefront.hh"
#include "params/VectorRegisterFile.hh"

VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
: SimObject(p),
manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
vgprState(new VecRegisterState())
: RegisterFile(p)
{
fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
regFile.resize(numRegs(), VecRegContainer());

fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
"multiple of VRF size\n");

busy.clear();
busy.resize(numRegsPerSimd, 0);
nxtBusy.clear();
nxtBusy.resize(numRegsPerSimd, 0);

vgprState->init(numRegsPerSimd, p->wfSize);
}

void
VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
{
computeUnit = _computeUnit;
vgprState->setParent(computeUnit);
}

uint8_t
VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
{
uint8_t status = nxtBusy.at(idx);

if (operandSize > 4) {
status = status | (nxtBusy.at((idx + 1) % numRegs()));
}

return status;
}

uint8_t
VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
{
uint8_t status = busy.at(idx);

if (operandSize > 4) {
status = status | (busy.at((idx + 1) % numRegs()));
}

return status;
}

void
VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
{
nxtBusy.at(regIdx) = value;

if (operandSize > 4) {
nxtBusy.at((regIdx + 1) % numRegs()) = value;
}
}

void
VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
{
busy.at(regIdx) = value;

if (operandSize > 4) {
busy.at((regIdx + 1) % numRegs()) = value;
for (auto &reg : regFile) {
reg.zero();
}
}

@@ -118,127 +58,154 @@ bool
VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i)) {
uint32_t vgprIdx = ii->getRegisterIndex(i, ii);
uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
if (ii->isVectorRegister(i) && ii->isSrcOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);

if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
w->numTimesBlockedDueRAWDependencies++;
// determine number of registers
int nRegs =
ii->getOperandSize(i) <= 4 ? 1 : ii->getOperandSize(i) / 4;
for (int j = 0; j < nRegs; j++) {
int pVgpr = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
if (regBusy(pVgpr)) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pVgpr);
w->numTimesBlockedDueRAWDependencies++;
}
return false;
}

return false;
}

if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
w->numTimesBlockedDueRAWDependencies++;
}

return false;
}
}
}

return true;
}
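
Editor's note: the rewritten readiness check converts each source operand into its equivalent number of 32-bit VGPRs before probing the busy bits: operands of at most 4 bytes occupy one register, larger ones occupy size/4 consecutive virtual registers. A standalone check of that rule (sizes in bytes):

// Hedged sketch of the operand-size to register-count rule above.
#include <cassert>

int
numRegsForOperand(int operandSizeBytes)
{
    return operandSizeBytes <= 4 ? 1 : operandSizeBytes / 4;
}

int main()
{
    assert(numRegsForOperand(4) == 1);   // 32-bit operand, one VGPR
    assert(numRegsForOperand(8) == 2);   // 64-bit operand, two VGPRs
    assert(numRegsForOperand(16) == 4);  // 128-bit operand, four VGPRs
    return 0;
}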

void
VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
VectorRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
bool loadInstr = ii->isLoad();
bool atomicInstr = ii->isAtomic() || ii->isMemFence();

bool loadNoArgInstr = loadInstr && !ii->isArgLoad();

// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
ii->getOperandSize(i), 1);
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
ii->getOperandSize(i) / 4;

// mark the destination vector register as busy
markReg(physReg, ii->getOperandSize(i), 1);
// clear the in-flight status of the destination vector register
preMarkReg(physReg, ii->getOperandSize(i), 0);
for (int j = 0; j < nRegs; ++j) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);

// FIXME: if we ever model correct timing behavior
// for load argument instructions then we should not
// set the destination register as busy now but when
// the data returns. Loads and Atomics should free
// their destination registers when the data returns,
// not now
if (!atomicInstr && !loadNoArgInstr) {
uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
computeUnit->spBypassLength() :
computeUnit->dpBypassLength();

// schedule an event for marking the register as ready
computeUnit->registerEvent(w->simdId, physReg,
ii->getOperandSize(i),
computeUnit->shader->tick_cnt +
computeUnit->shader->ticks(pipeLen),
0);
// If the instruction is an atomic instruction and
// the atomics do not return a value, then
// do not mark this reg as busy.
if (!(ii->isAtomic() && !ii->isAtomicRet())) {
/**
* if the instruction is a load with EXEC = 0, then
* we do not mark the reg. we do this to avoid a
* deadlock that can occur because a load reserves
* its destination regs before checking its exec mask,
* and in the case it is 0, it will not send/recv any
* packets, and therefore it will never free its dest
* reg(s).
*/
if (!ii->isLoad() || (ii->isLoad()
&& ii->exec_mask.any())) {
markReg(physReg, true);
}
}
}
}
}
}

int
VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
std::vector<uint32_t> &regVec, uint32_t operandSize,
uint64_t timestamp)
void
VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
int delay = 0;
// increment count of number of DWORDs read from VRF
int DWORDs = ii->numSrcVecDWORDs();
registerReads += (DWORDs * w->execMask().count());

panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
regVec.size());

for (int i = 0; i < regVec.size(); ++i) {
// mark the destination VGPR as free when the timestamp expires
computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
computeUnit->shader->tick_cnt + timestamp +
computeUnit->shader->ticks(delay), 0);
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramReads += DWORDs;
}
mask = mask >> 4;
}
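
Editor's note: the loop above charges one SRAM access per group of four active lanes: the execution mask is scanned a nibble at a time and every non-zero nibble adds DWORDs accesses. Sketched standalone (the 4-lanes-per-SRAM grouping follows the code above; everything else is illustrative):

// Hedged sketch of the nibble scan used for sramReads/sramWrites.
#include <cassert>
#include <cstdint>

int
sramAccesses(uint64_t execMask, int numLanes, int dwords)
{
    int accesses = 0;
    for (int i = 0; i < numLanes / 4; i++) {
        if (execMask & 0xF)      // any of this bank's four lanes active?
            accesses += dwords;
        execMask >>= 4;
    }
    return accesses;
}

int main()
{
    assert(sramAccesses(0x11, 64, 1) == 2);   // lanes 0 and 4: two banks
    assert(sramAccesses(~0ULL, 64, 1) == 16); // all lanes: 16 banks
    return 0;
}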

return delay;
}
if (!ii->isLoad()
&& !(ii->isAtomic() || ii->isMemSync())) {
int opSize = 4;
for (int i = 0; i < ii->getNumOperands(); i++) {
if (ii->getOperandSize(i) > opSize) {
opSize = ii->getOperandSize(i);
}
}
Cycles delay(opSize <= 4 ? computeUnit->spBypassLength()
: computeUnit->dpBypassLength());
Tick tickDelay = computeUnit->cyclesToTicks(delay);

void
VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
{
// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
ii->getOperandSize(i), 1);
// set the in-flight status of the destination vector register
preMarkReg(physReg, ii->getOperandSize(i), 1);
for (int i = 0; i < ii->getNumOperands(); i++) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1
: ii->getOperandSize(i) / 4;
for (int j = 0; j < nRegs; j++) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
enqRegFreeEvent(physReg, tickDelay);
}
}
}

// increment count of number of DWORDs written to VRF
DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * w->execMask().count());

mask = w->execMask().to_ullong();
srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
}

bool
VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
GPUDynInstPtr ii,
VrfAccessType accessType)
void
VectorRegisterFile::scheduleWriteOperandsFromLoad(
Wavefront *w, GPUDynInstPtr ii)
{
bool ready = true;
assert(ii->isLoad() || ii->isAtomicRet());
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
ii->getOperandSize(i) / 4;

return ready;
}
for (int j = 0; j < nRegs; ++j) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
}
}
}
// increment count of number of DWORDs written to VRF
int DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * ii->exec_mask.count());

bool
VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
VrfAccessType accessType)
{
bool ready = true;

return ready;
uint64_t mask = ii->exec_mask.to_ullong();
int srams = ii->exec_mask.size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
}
mask = mask >> 4;
}
}

VectorRegisterFile*

@@ -34,111 +34,76 @@
#ifndef __VECTOR_REGISTER_FILE_HH__
#define __VECTOR_REGISTER_FILE_HH__

#include <list>

#include "base/statistics.hh"
#include "base/trace.hh"
#include "base/types.hh"
#include "arch/gpu_isa.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/vector_register_state.hh"
#include "sim/sim_object.hh"

class ComputeUnit;
class Shader;
class SimplePoolManager;
class Wavefront;
#include "gpu-compute/register_file.hh"
#include "gpu-compute/wavefront.hh"

struct VectorRegisterFileParams;

enum class VrfAccessType : uint8_t
{
READ = 0x01,
WRITE = 0x02,
RD_WR = READ | WRITE
};

// Vector Register File
class VectorRegisterFile : public SimObject
class VectorRegisterFile : public RegisterFile
{
public:
using VecRegContainer = TheGpuISA::VecRegContainerU32;

VectorRegisterFile(const VectorRegisterFileParams *p);
~VectorRegisterFile() { }

void setParent(ComputeUnit *_computeUnit);
virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
virtual void scheduleWriteOperands(Wavefront *w,
GPUDynInstPtr ii) override;
virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
GPUDynInstPtr ii) override;
virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;

// Read a register
template<typename T>
T
read(int regIdx, int threadId=0)
void
setParent(ComputeUnit *_computeUnit) override
{
T p0 = vgprState->read<T>(regIdx, threadId);
DPRINTF(GPUVRF, "reading vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)p0);
RegisterFile::setParent(_computeUnit);
}

return p0;
// Read a register that is writeable (e.g., a DST operand)
VecRegContainer&
readWriteable(int regIdx)
{
return regFile[regIdx];
}

// Read a register that is not writeable (e.g., src operand)
const VecRegContainer&
read(int regIdx) const
{
return regFile[regIdx];
}

// Write a register
template<typename T>
void
write(int regIdx, T value, int threadId=0)
write(int regIdx, const VecRegContainer &value)
{
DPRINTF(GPUVRF, "writing vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)value);
vgprState->write<T>(regIdx, value, threadId);
regFile[regIdx] = value;
}

uint8_t regBusy(int idx, uint32_t operandSize) const;
uint8_t regNxtBusy(int idx, uint32_t operandSize) const;

int numRegs() const { return numRegsPerSimd; }

void markReg(int regIdx, uint32_t operandSize, uint8_t value);
void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);

virtual void exec(GPUDynInstPtr ii, Wavefront *w);

virtual int exec(uint64_t dynamic_id, Wavefront *w,
std::vector<uint32_t> &regVec, uint32_t operandSize,
uint64_t timestamp);

bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
virtual void updateEvents() { }
virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);

virtual bool
isReadConflict(int memWfId, int exeWfId) const
void
printReg(Wavefront *wf, int regIdx) const
{
return false;
#ifndef NDEBUG
const auto &vec_reg_cont = regFile[regIdx];
auto vgpr = vec_reg_cont.as<TheGpuISA::VecElemU32>();

for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
DPRINTF(GPUVRF, "WF[%d][%d]: WV[%d] v[%d][%d] = %#x\n",
wf->simdId, wf->wfSlotId, wf->wfDynId, regIdx, lane,
vgpr[lane]);
}
}
#endif
}

virtual bool
isWriteConflict(int memWfId, int exeWfId) const
{
return false;
}

virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
GPUDynInstPtr ii,
VrfAccessType accessType);

virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
VrfAccessType accessType);

SimplePoolManager *manager;

protected:
ComputeUnit* computeUnit;
int simdId;

// flag indicating if a register is busy
std::vector<uint8_t> busy;
// flag indicating if a register will be busy (by instructions
// in the SIMD pipeline)
std::vector<uint8_t> nxtBusy;

// number of registers (bank size) per simd unit (bank)
int numRegsPerSimd;

// vector register state
VecRegisterState *vgprState;
private:
std::vector<VecRegContainer> regFile;
};

#endif // __VECTOR_REGISTER_FILE_HH__

File diff suppressed because it is too large
@@ -31,161 +31,116 @@
* POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__

#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <stack>
#include <unordered_map>
#include <vector>

#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

static const int MAX_NUM_INSTS_PER_WF = 12;

/**
* A reconvergence stack entry conveys the necessary state to implement
* control flow divergence.
*/
struct ReconvergenceStackEntry {
/**
* PC of current instruction.
*/
uint32_t pc;
/**
* PC of the immediate post-dominator instruction, i.e., the value of
* @a pc for the first instruction that will be executed by the wavefront
* when a reconvergence point is reached.
*/
uint32_t rpc;
/**
* Execution mask.
*/
VectorMask execMask;
};

/*
* Arguments for the hsail opcode call are user defined and variable length.
* The hardware/finalizer can support arguments in hardware or use memory to
* pass arguments. For now, let's assume that an unlimited number of arguments
* are supported in hardware (the compiler inlines functions whenever it can
* anyways, so unless someone is interested in the implications of linking/
* library functions, I think this is a reasonable assumption given the typical
* size of an OpenCL kernel).
*
* Note that call args are different from kernel arguments:
* * All work-items in a kernel refer to the same set of kernel arguments
* * Each work-item has its own set of call args. So a call argument at
* address 0x4 is different for work-item 0 and work-item 1.
*
* Ok, the table below shows an example of how we organize the call arguments in
* the CallArgMem class.
*
* int foo(int arg1, double arg2)
* ___________________________________________________
* | 0: return.0 | 4: return.1 | ... | 252: return.63 |
* |---------------------------------------------------|
* | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
* |---------------------------------------------------|
* | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
* ___________________________________________________
*/
class CallArgMem
{
public:
// pointer to buffer for storing function arguments
uint8_t *mem;
int wfSize;
// size of function args
int funcArgsSizePerItem;

template<typename CType>
int
getLaneOffset(int lane, int addr)
{
return addr * wfSize + sizeof(CType) * lane;
}

CallArgMem(int func_args_size_per_item, int wf_size)
: wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
{
mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
}

~CallArgMem()
{
free(mem);
}

template<typename CType>
uint8_t*
getLaneAddr(int lane, int addr)
{
return mem + getLaneOffset<CType>(lane, addr);
}

template<typename CType>
void
setLaneAddr(int lane, int addr, CType val)
{
*((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
}
};
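
Editor's note: to connect getLaneOffset() to the table in the comment, lane 0 of each argument lands at addr * wfSize, and successive lanes are sizeof(CType) apart. With wfSize = 64 and logical addresses 0, 4, and 8 for the return value, arg1, and arg2, the offsets in the table fall out directly (a standalone check):

// Hedged check of the CallArgMem layout shown in the table above.
#include <cassert>

int
laneOffset(int addr, int wfSize, int elemSize, int lane)
{
    return addr * wfSize + elemSize * lane;  // CallArgMem::getLaneOffset
}

int main()
{
    assert(laneOffset(0, 64, 4, 1) == 4);    // return.1
    assert(laneOffset(4, 64, 4, 0) == 256);  // arg1.0
    assert(laneOffset(8, 64, 8, 1) == 520);  // arg2.1
    return 0;
}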

class Wavefront : public SimObject
{
public:
enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

// Base pointer for array of instruction pointers
uint64_t basePtr;
enum status_e {
// wavefront is stopped
S_STOPPED,
// wavefront is returning from a kernel
S_RETURNING,
// wavefront is running normally
S_RUNNING,
// wavefront is stalled
S_STALLED,
/**
* wavefront has unsatisfied wait counts
*
* while in this state the WF will only execute if
* the oldest instruction is the waitcnt. while in
* S_WAITCNT, the wavefront will not be ready until
* all of its waitcnts have been satisfied. the
* scoreboard ready() function will check the status
* of the waitcnts whenever the WF is in S_WAITCNT,
* and once they are satisfied, it will resume normal
* operation.
*/
S_WAITCNT
};

uint32_t oldBarrierCnt;
uint32_t barrierCnt;
uint32_t barrierId;
uint32_t barrierSlots;
status_e status;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
const int wfSlotId;
int kernId;
// SIMD unit where the WV has been scheduled
int simdId;
const int simdId;
// id of the execution unit (or pipeline) where the oldest instruction
// of the WF is scheduled
int execUnitId;
int flatLmUnitId;
int flatGmUnitId;
// pointer to parent CU
ComputeUnit *computeUnit;
int maxIbSize;

std::deque<GPUDynInstPtr> instructionBuffer;

bool pendingFetch;
bool dropFetch;
// last tick during which all WFs in the CU are not idle
Tick lastNonIdleTick;

// Condition Register State (for HSAIL simulations only)
class ConditionRegisterState *condRegState;
// number of single precision VGPRs required by WF
uint32_t maxSpVgprs;
// number of double precision VGPRs required by WF
uint32_t maxDpVgprs;
// map virtual to physical vector register
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Execution unit resource ID's associated with this WF
// These are static mappings set at WF slot construction and
// based off of the simdId and wfSlotId.

// Index to scalarALUs resource vector in CU
int scalarAlu;

// Indices into readyList/dispatchList of resources used by this
// wavefront
int scalarAluGlobalIdx;
int globalMem;
int localMem;
int scalarMem;

// number of VGPRs required by WF
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);
status_e getStatus() { return status; }
void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
bool isOldestInstALU();
bool isOldestInstVectorALU();
bool isOldestInstScalarALU();
bool isOldestInstScalarMem();
bool isOldestInstBarrier();

// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
std::vector<uint32_t> workItemId[3];
@@ -199,36 +154,44 @@ class Wavefront : public SimObject
/* the actual WG size can differ from the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
void computeActualWgSz(NDRange *ndr);
void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
// outstanding global+local memory requests
uint32_t outstandingReqs;
// memory requests between scoreboard
// and execute stage not yet executed
uint32_t memReqsInPipe;
// vector and scalar memory requests pending in memory system
int outstandingReqs;
// outstanding global memory write requests
uint32_t outstandingReqsWrGm;
int outstandingReqsWrGm;
// outstanding local memory write requests
uint32_t outstandingReqsWrLm;
int outstandingReqsWrLm;
// outstanding global memory read requests
uint32_t outstandingReqsRdGm;
int outstandingReqsRdGm;
// outstanding local memory read requests
uint32_t outstandingReqsRdLm;
uint32_t rdLmReqsInPipe;
uint32_t rdGmReqsInPipe;
uint32_t wrLmReqsInPipe;
uint32_t wrGmReqsInPipe;
int outstandingReqsRdLm;
// outstanding scalar memory read requests
int scalarOutstandingReqsRdGm;
// outstanding scalar memory write requests
int scalarOutstandingReqsWrGm;
int rdLmReqsInPipe;
int rdGmReqsInPipe;
int wrLmReqsInPipe;
int wrGmReqsInPipe;
int scalarRdGmReqsInPipe;
int scalarWrGmReqsInPipe;

int memTraceBusy;
uint64_t lastTrace;
// number of vector registers reserved by WF
// number of virtual vector registers reserved by WF
int reservedVectorRegs;
// number of virtual scalar registers reserved by WF
int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
// Index into the Scalar Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startSgprIndex;

// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
@@ -257,64 +220,63 @@ class Wavefront : public SimObject
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;

// A pointer to the spill area
Addr spillBase;
// The size of the spill area
uint32_t spillSizePerItem;
// The vector width of the spill area
uint32_t spillWidth;

// A pointer to the private memory area
Addr privBase;
// The size of the private memory area
uint32_t privSizePerItem;

// A pointer to the read-only memory area
Addr roBase;
// size of the read-only memory area
uint32_t roSize;

// pointer to buffer for storing kernel arguments
uint8_t *kernelArgs;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;

// number of times instruction issue for this wavefront is blocked
// due to VRF port availability
Stats::Scalar numTimesBlockedDueVrfPortAvail;
// Wavefront slot stats

// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;

// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;

// Number of stall cycles encountered by this WF in SCH stage
Stats::Scalar schStalls;

// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.

// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;

// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to RAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// distribution of executed instructions based on their register
// operands; this is used to highlight the load on the VRF
Stats::Distribution srcRegOpDist;
Stats::Distribution dstRegOpDist;

// Functions to operate on call argument memory
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
initCallArgMem(int func_args_size_per_item, int wf_size)
{
callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;

template<typename CType>
CType
readCallArgMem(int lane, int addr)
{
return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
}
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;

template<typename CType>
void
writeCallArgMem(int lane, int addr, CType val)
{
callArgMem->setLaneAddr<CType>(lane, addr, val);
}
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;

void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);

// context for save/restore
uint8_t *context;

typedef WavefrontParams Params;
Wavefront(const Params *p);
@@ -327,50 +289,31 @@ class Wavefront : public SimObject
computeUnit = cu;
}

void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
void updateResources();
int ready(itype_e type);
bool instructionBufferHasBranch();
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
VectorMask getPred() { return execMask() & initMask; }

bool waitingAtBarrier(int lane);

void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
const VectorMask& exec_mask);

void popFromReconvergenceStack();

uint32_t pc() const;

uint32_t rpc() const;

VectorMask execMask() const;
Addr pc() const;
void pc(Addr new_pc);

VectorMask& execMask();
bool execMask(int lane) const;

void pc(uint32_t new_pc);

void discardFetch();

/**
* Returns the size of the static hardware context of a particular wavefront
* This should be updated every time the context is changed
*/
uint32_t getStaticContextSize() const;
bool waitCntsSatisfied();
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();

/**
* Returns the hardware context as a stream of bytes
* This method is designed for HSAIL execution
*/
void getContext(const void *out);

/**
* Sets the hardware context from a stream of bytes
* This method is designed for HSAIL execution
*/
void setContext(const void *in);
/** Freeing VRF space */
void freeRegisterFile();

TheGpuISA::GPUISA&
gpuISA()
@@ -380,14 +323,32 @@ class Wavefront : public SimObject

private:
TheGpuISA::GPUISA _gpuISA;

void reserveGmResource(GPUDynInstPtr ii);
void reserveLmResource(GPUDynInstPtr ii);

/**
* Stack containing Control Flow Graph nodes (i.e., kernel instructions)
* to be visited by the wavefront, and the associated execution masks. The
* reconvergence stack grows every time the wavefront reaches a divergence
* point (branch instruction), and shrinks every time the wavefront
* reaches a reconvergence point (immediate post-dominator instruction).
* the following are used for waitcnt instructions
* vmWaitCnt: once set, we wait for the outstanding
* number of vector mem instructions to be
* at, or below vmWaitCnt.
*
* expWaitCnt: once set, we wait for the outstanding
* number of VM writes or EXP
* insts to be at, or below expWaitCnt.
*
* lgkmWaitCnt: once set, we wait for the outstanding
* number of LDS, GDS, scalar memory,
* and message instructions to be at, or
* below lgkmCount. we currently do not
* support GDS/message ops.
*/
std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
status_e status;
Addr _pc;
VectorMask _execMask;
};

#endif // __WAVEFRONT_HH__
#endif // __GPU_COMPUTE_WAVEFRONT_HH__
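
Editor's note: the three counters gate readiness exactly as the comment describes: a wavefront in S_WAITCNT becomes ready only when, for every counter that has been set, the outstanding-request count has dropped to that counter's value or below. A hedged sketch of the predicate (-1 models an unset counter; the outstanding_* arguments stand in for the wavefront's request bookkeeping):

// Hedged sketch of the waitcnt readiness test described above.
bool
waitCntsSatisfiedSketch(int vmWaitCnt, int expWaitCnt, int lgkmWaitCnt,
                        int outstandingVm, int outstandingExp,
                        int outstandingLgkm)
{
    if (vmWaitCnt != -1 && outstandingVm > vmWaitCnt)
        return false;   // vector memory ops still above threshold
    if (expWaitCnt != -1 && outstandingExp > expWaitCnt)
        return false;   // VM writes / EXP ops still above threshold
    if (lgkmWaitCnt != -1 && outstandingLgkm > lgkmWaitCnt)
        return false;   // LDS/GDS/scalar/message ops still above threshold
    return true;
}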

@@ -86,6 +86,14 @@ MemCmd::commandInfo[] =
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
/* WriteCompleteResp - The WriteCompleteResp command is needed
* because in the GPU memory model we use a WriteResp to indicate
* that a write has reached the cache controller so we can free
* resources at the coalescer. Later, when the write successfully
* completes we send a WriteCompleteResp to the CU so its wait
* counters can be updated. Wait counters in the CU are how memory
* dependences are handled in the GPU ISA. */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },
|
||||
|
||||
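The comment above describes a two-phase write acknowledgment. A small sketch of how a receiver might distinguish the two responses; the Cmd enum and counters here are illustrative assumptions, not the gem5 packet API:

#include <cassert>

enum class Cmd { WriteResp, WriteCompleteResp };

// Sketch of consuming the two-phase write responses described above:
// WriteResp only releases coalescer-side resources, while
// WriteCompleteResp is what allows the wavefront's wait counters to
// make progress.
struct WritePhaseTracker
{
    int pendingWriteResps = 0;      // resource-release acks expected
    int pendingWriteCompletes = 0;  // true completion acks expected

    void
    recvResp(Cmd cmd)
    {
        if (cmd == Cmd::WriteResp) {
            assert(pendingWriteResps > 0);
            pendingWriteResps--;       // coalescer slot can be reused
        } else {
            assert(pendingWriteCompletes > 0);
            pendingWriteCompletes--;   // safe to decrement a waitcnt
        }
    }
};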
@@ -83,6 +83,7 @@ class MemCmd
        ReadRespWithInvalidate,
        WriteReq,
        WriteResp,
        WriteCompleteResp,
        WritebackDirty,
        WritebackClean,
        WriteClean, // writes dirty data below without evicting

@@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
          trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
        } else {
          if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
            if (in_msg.segment == HSASegment:SPILL) {
              trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
            } else if (WB) {
            if (WB) {
              trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
            } else {
              trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
68
src/mem/ruby/protocol/GPU_VIPER-msg.sm
Normal file
@@ -0,0 +1,68 @@
/*
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

structure (GPUCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void evictionCallback(Addr);
  void recordCPReadCallBack(MachineID, MachineID);
  void recordCPWriteCallBack(MachineID, MachineID);
}

structure (VIPERCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void invCallback(Addr);
  void wbCallback(Addr);
  void evictionCallback(Addr);
}
@@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc";
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";

@@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") {
  CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
  WriteMask writeMask, desc="Write Through Data";
  MachineID WTRequestor, desc="Node who initiated the write through";
  HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
  int wfid, default="0", desc="wavefront id";
  bool NoWriteConflict, default="true", desc="write collided with CAB entry";
  int ProgramCounter, desc="PC that accesses this block";

@@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
  NotPresent, desc="block is NotPresent";
  Busy, desc="block is in a transient state, currently invalid";
}
// HSA scopes
enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
  UNSPECIFIED, desc="Unspecified scope";
  NOSCOPE, desc="Explicitly unscoped";
  WAVEFRONT, desc="Wavefront scope";
  WORKGROUP, desc="Workgroup scope";
  DEVICE, desc="Device scope";
  SYSTEM, desc="System scope";
}

// HSA segment types
enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
  GLOBAL, desc="Global segment";
  GROUP, desc="Group segment";
  PRIVATE, desc="Private segment";
  KERNARG, desc="Kernarg segment";
  READONLY, desc="Readonly segment";
  SPILL, desc="Spill segment";
  ARG, desc="Arg segment";
}

// TesterStatus
enumeration(TesterStatus, desc="...") {

@@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") {
  bool checkResourceAvailable(CacheResourceType, Addr);
}

structure (GPUCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void evictionCallback(Addr);
  void recordCPReadCallBack(MachineID, MachineID);
  void recordCPWriteCallBack(MachineID, MachineID);
}

structure (VIPERCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void invCallback(Addr);
  void wbCallback(Addr);
  void evictionCallback(Addr);
}

structure(RubyRequest, desc="...", interface="Message", external="yes") {
  Addr LineAddress, desc="Line address for this request";
  Addr PhysicalAddress, desc="Physical address for this request";
@@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
  WriteMask writeMask, desc="Writethrough mask";
  DataBlock WTData, desc="Writethrough data block";
  int wfid, desc="Writethrough wavefront";
  HSAScope scope, desc="HSA scope";
  HSASegment segment, desc="HSA segment";
  PacketPtr pkt, desc="Packet associated with this request";
}

@@ -43,7 +43,6 @@
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"

@@ -35,8 +35,6 @@
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"

@@ -61,58 +61,6 @@

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
@@ -152,6 +100,7 @@ UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        if (iter->second.empty()) {
            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
            instMap.erase(iter++);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
@@ -160,15 +109,27 @@ UncoalescedTable::updateResources()
    }
}

bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
    // Iterate over the instructions held in the UncoalescedTable to see
    // whether there are more requests to issue; if yes, not yet done;
    // otherwise, done.
    for (auto& inst : instMap) {
        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
                inst.first, inst.second.size());
        if (inst.first == instSeqNum) { return false; }
    }

    return true;
}

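Since instMap is keyed by the instruction sequence number, the scan above amounts to a membership test. An equivalent sketch under that assumption (the element type of the packet list is immaterial to the check):

#include <cstdint>
#include <list>
#include <map>

using PerInstPackets = std::list<int>; // stand-in for the packet list

// The instruction is "done" exactly when it no longer owns any pending
// packets in the table, i.e., when its key is absent.
bool
areRequestsDoneSketch(const std::map<uint64_t, PerInstPackets> &instMap,
                      uint64_t instSeqNum)
{
    return instMap.find(instSeqNum) == instMap.end();
}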
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "UncoalescedTable contains " << instMap.size()
       << " address entries." << std::endl;
    ss << "Listing pending packets from " << instMap.size() << " instructions";

    for (auto& inst : instMap) {
        ss << "Addr 0x" << std::hex << inst.first << std::dec
           << " with " << inst.second.size() << " packets"
           << std::endl;
        ss << "\tAddr: " << printAddress(inst.first) << " with "
           << inst.second.size() << " pending packets" << std::endl;
    }
}

@@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p)
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
@@ -254,18 +214,9 @@ GPUCoalescer::wakeup()
        if (current_time - req->getIssueTime() > m_deadlock_threshold) {
            std::stringstream ss;
            printRequestTable(ss);
            ss << "Outstanding requests: " << m_outstanding_count
               << std::endl;

            panic("Possible Deadlock detected. Aborting!\n"
                  "version: %d request.paddr: 0x%x coalescedTable: %d "
                  "current time: %u issue_time: %d difference: %d\n"
                  "Request Tables:\n %s", m_version,
                  req->getFirstPkt()->getAddr(),
                  coalescedTable.size(), cyclesToTicks(current_time),
                  cyclesToTicks(req->getIssueTime()),
                  cyclesToTicks(current_time - req->getIssueTime()),
                  ss.str());
            warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
                 m_version, ss.str());
            panic("Aborting due to deadlock!\n");
        }
    }
}
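The deadlock check above reduces to an age comparison against m_deadlock_threshold. A minimal sketch, with Cycles modeled as a plain integer:

#include <cstdint>

using Cycles = uint64_t; // stand-in for gem5's Cycles type

// A request is suspect once it has been outstanding longer than the
// configured threshold. Issue times are always in the past, so the
// subtraction cannot underflow.
bool
looksDeadlocked(Cycles now, Cycles issueTime, Cycles threshold)
{
    return (now - issueTime) > threshold;
}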
@@ -283,21 +234,27 @@ GPUCoalescer::wakeup()
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    uncoalescedTable.printRequestTable(ss);
    ss << "Printing out " << coalescedTable.size()
       << " outstanding requests in the coalesced table\n";

    ss << "CoalescedTable contains " << coalescedTable.size()
       << " address entries." << std::endl;
    for (auto& requestList : coalescedTable) {
        ss << "Addr 0x" << std::hex << requestList.first << std::dec
           << ": type-";
        for (auto& request : requestList.second) {
            ss << RubyRequestType_to_string(request->getRubyType())
               << " pkts-" << request->getPackets().size()
               << " issued-" << request->getIssueTime() << " seqNum-"
               << request->getSeqNum() << "; ";
            ss << "\tAddr: " << printAddress(requestList.first) << "\n"
               << "\tInstruction sequence number: "
               << request->getSeqNum() << "\n"
               << "\t\tType: "
               << RubyRequestType_to_string(request->getRubyType()) << "\n"
               << "\t\tNumber of associated packets: "
               << request->getPackets().size() << "\n"
               << "\t\tIssue time: "
               << request->getIssueTime() * clockPeriod() << "\n"
               << "\t\tDifference from current tick: "
               << (curCycle() - request->getIssueTime()) * clockPeriod();
        }
        ss << std::endl;
    }

    // print out packets waiting to be issued in the uncoalesced table
    uncoalescedTable.printRequestTable(ss);
}

void
@@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address,
    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    // remove this crequest from the coalescedTable
    delete crequest;
    coalescedTable.at(address).pop_front();

@@ -398,6 +356,36 @@ GPUCoalescer::writeCallback(Addr address,
    }
}

void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write
        // completion callbacks for its issued Ruby requests, we can now
        // respond to the requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}

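The retirement condition above is a conjunction: every issued Ruby request has acked completion, and nothing for the instruction is still parked in the uncoalesced table. A condensed sketch of just that handshake, with both fields as illustrative stand-ins:

// A write instruction retires only when (a) all completion acks have
// arrived and (b) no request for it is still waiting to be issued.
// Checking only (a) could retire the instruction while a late packet
// is still queued.
struct WriteInstProgress
{
    int pendingAcks = 0;     // completion acks still expected
    bool allIssued = false;  // uncoalesced table drained for this inst

    bool
    recvCompleteAck()
    {
        pendingAcks--;
        return pendingAcks == 0 && allIssued;
    }
};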
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
@@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);
    Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
              "%s\n",
              RubyRequestType_to_string(type));
    }

    // If using the RubyTester, update the RubyTester sender state's
    // subBlock with the received data. The tester will later access
    // this state.
    // Note: RubyPort will access its sender state before the
    // RubyTester.
    if (m_usingRubyTester) {
        RubyPort::SenderState *requestSenderState =
            safe_cast<RubyPort::SenderState*>(pkt->senderState);
        RubyTester::SenderState* testerSenderState =
            safe_cast<RubyTester::SenderState*>
                (requestSenderState->predecessor);
        testerSenderState->subBlock.mergeFrom(data);
    }
}

@@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        // Acquire and release packets will have been issued by
        // makeRequest, so we do not need to check for them here.
        panic("Unsupported ruby packet type\n");
    }

@@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here then we didn't call
            // a virtual version of this function,
            // so we will also schedule the callback
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
    // all packets must have valid instruction sequence numbers
    assert(pkt->req->hasInstSeqNum());

    if (pkt->cmd == MemCmd::MemSyncReq) {
        // issue mem_sync requests immediately to the cache system without
        // going through uncoalescedTable like normal LD/ST/Atomic requests
        issueMemSyncRequest(pkt);
    } else {
        // otherwise, this must be either a read or a write command
        assert(pkt->isRead() || pkt->isWrite());

        // the pkt is temporarily stored in the uncoalesced table until
        // it's picked for the coalescing process later in this cycle or in
        // a future cycle
        uncoalescedTable.insertPacket(pkt);
        DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                pkt->getAddr());

        // we schedule an issue event here to process the uncoalesced table
        // and try to issue Ruby requests to the cache system
        if (!issueEvent.scheduled()) {
            schedule(issueEvent, curTick());
        }
    }

    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
        (pkt->req->isRelease() || pkt->req->isAcquire())) {
        if (assumingRfOCoherence) {
            // If we reached here, this request must be a memFence
            // and the protocol implements RfO; the coalescer can
            // assume sequential consistency and schedule the callback
            // immediately.
            // Currently the code implements fence callbacks
            // by reusing the mechanism for kernel completions.
            // This should be fixed.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        } else {
            // If not RfO, return issued here and let the child coalescer
            // take care of it.
            return RequestStatus_Issued;
        }
    }

    uncoalescedTable.insertPacket(pkt);
    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());

    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    // we always return RequestStatus_Issued in this coalescer
    // b/c the coalescer's resources were checked earlier and the coalescer
    // is queueing up aliased requests in its coalesced table
    return RequestStatus_Issued;
}

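In the new path above, makeRequest() is a two-way dispatch: memory-sync packets go straight to the cache system, while everything else waits in the uncoalesced table for the issue event. A condensed sketch of that control flow, with the handlers reduced to function arguments (the names are placeholders, not gem5 API):

#include <cassert>
#include <functional>

enum class PktKind { MemSync, Read, Write };

// Mirrors the dispatch above: sync requests bypass coalescing, reads
// and writes are parked for a later coalescing pass. The request is
// always accepted because resources were checked earlier.
void
makeRequestSketch(PktKind kind,
                  const std::function<void()> &issueMemSync,
                  const std::function<void()> &enqueueUncoalesced)
{
    if (kind == PktKind::MemSync) {
        issueMemSync();
    } else {
        assert(kind == PktKind::Read || kind == PktKind::Write);
        enqueueUncoalesced();
    }
}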
/**
 * TODO: Figure out what to do with this code. This code may go away
 * and/or be merged into the VIPER coalescer once the VIPER
 * protocol is re-integrated with the GCN3 code.
 */
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
@@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest)
    }

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/

template <class KEY, class VALUE>
std::ostream &
@@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
        // be counted as outstanding requests.
        m_outstanding_count++;

        // We track all issued or to-be-issued Ruby requests associated with
        // write instructions. An instruction may have multiple Ruby
        // requests.
        if (pkt->cmd == MemCmd::WriteReq) {
            DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
                    " the pending write instruction list\n", seqNum,
                    line_addr);

            RubyPort::SenderState* ss =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);

            // we need to save this port because it will be used to call
            // back the requesting CU when we receive write
            // complete callbacks for all issued Ruby requests of this
            // instruction.
            RubyPort::MemSlavePort* mem_slave_port = ss->port;

            GPUDynInstPtr gpuDynInst = nullptr;

            if (!m_usingRubyTester) {
                // If this coalescer is connected to a real CU, we need
                // to save the corresponding gpu dynamic instruction.
                // The CU will use that instruction to decrement wait
                // counters in the issuing wavefront.
                // For the Ruby tester, gpuDynInst == nullptr
                ComputeUnit::DataPort::SenderState* cu_state =
                    safe_cast<ComputeUnit::DataPort::SenderState*>
                        (ss->predecessor);
                gpuDynInst = cu_state->_gpuDynInst;
            }

            PendingWriteInst& inst = pendingWriteInsts[seqNum];
            inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
        }

        return true;
}

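Each WriteReq packet for a given sequence number lands in pendingWriteInsts via operator[], which default-constructs the tracker on first use. A minimal sketch of that bookkeeping, with a simplified tracker type:

#include <cstdint>
#include <unordered_map>

struct PendingWriteSketch { int numPendingStores = 0; };

// pending[seqNum] inserts a fresh tracker the first time a sequence
// number is seen, exactly like pendingWriteInsts[seqNum] above; every
// subsequent packet for the same instruction bumps the count.
void
trackWriteReq(std::unordered_map<uint64_t, PendingWriteSketch> &pending,
              uint64_t seqNum)
{
    pending[seqNum].numPendingStores++;
}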
@@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address,
    }
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
@@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = crequest->getRubyType();
    Cycles issued_time = crequest->getIssueTime();
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}

void
@@ -1085,74 +962,4 @@ GPUCoalescer::regStats()
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}

@@ -38,11 +38,11 @@
#include <unordered_map>

#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -57,9 +57,6 @@ class CacheMemory;

class RubyGPUCoalescerParams;

HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);

// List of packets that belong to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;

@@ -78,6 +75,7 @@ class UncoalescedTable
    // instructions at the offset.
    PerInstPackets* getInstPackets(int offset);
    void updateResources();
    bool areRequestsDone(const uint64_t instSeqNum);

    // Check if a packet hasn't been removed from instMap in too long.
    // Panics if a deadlock is detected and returns nothing otherwise.
@@ -120,6 +118,86 @@ class CoalescedRequest
    std::vector<PacketPtr> pkts;
};

// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    void
    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }

        numPendingStores++;
    }

    // return true if no more acks are expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        return numPendingStores == 0;
    }

    // ack the original requester that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requester
        originalPort->sendTimingResp(pkt);
    }

    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of the packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need
    // only 1 of the ports to call back the CU. Therefore, here we keep
    // track of the port that sent the first packet of this instruction.
    RubyPort::MemSlavePort* originalPort;
    // similar to the originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};

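A runnable miniature of the handshake implemented by PendingWriteInst above, with the port and instruction bookkeeping stubbed out; only the counting logic is exercised, and the class name is a placeholder:

#include <cassert>

class PendingWriteSketch
{
  public:
    void addPendingReq() { numPendingStores++; }

    // True when no more completion acks are expected.
    bool receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        return --numPendingStores == 0;
    }

  private:
    int numPendingStores = 0;
};

int
main()
{
    PendingWriteSketch inst;
    inst.addPendingReq();                    // packet 1 of a store
    inst.addPendingReq();                    // packet 2 of the same store
    assert(!inst.receiveWriteCompleteAck()); // one ack still outstanding
    assert(inst.receiveWriteCompleteAck());  // now ack the CU exactly once
    return 0;
}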
class GPUCoalescer : public RubyPort
{
  public:
@@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort
    void collateStats();
    void regStats() override;

    // each store request needs two callbacks:
    // (1) writeCallback is called when the store is received and processed
    // by the TCP. This writeCallback does not guarantee the store is
    // actually completed at its destination cache or memory. writeCallback
    // helps release hardware resources (e.g., its entry in coalescedTable)
    // allocated for the store so that subsequent requests will not be
    // blocked unnecessarily due to hardware resource constraints.
    // (2) writeCompleteCallback is called when the store is fully completed
    // at its destination cache or memory. writeCompleteCallback
    // guarantees that the store is fully completed. This callback
    // will decrement hardware counters in the CU.
    void writeCallback(Addr address, DataBlock& data);

    void writeCallback(Addr address,
@@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime);

    void writeCompleteCallback(Addr address,
                               uint64_t instSeqNum,
                               MachineType mach);

    void readCallback(Addr address, DataBlock& data);

    void readCallback(Addr address,
@@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime,
                      bool isRegion);
    /* atomics need their own callback because the data
       might be const coming from SLICC */
    void atomicCallback(Addr address,
                        MachineType mach,
                        const DataBlock& data);

    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);

    // Alternate implementations in VIPER Coalescer
    virtual RequestStatus makeRequest(PacketPtr pkt) override;

    RequestStatus makeRequest(PacketPtr pkt) override;
    int outstandingCount() const override { return m_outstanding_count; }

    bool
@@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort

    GMTokenPort& getGMTokenPort() { return gmTokenPort; }

    void recordRequestType(SequencerRequestType requestType);
    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort
    getFirstResponseToCompletionDelayHist(const MachineType t) const
    { return *m_FirstResponseToCompletionDelayHist[t]; }

    // Changed to protected to enable inheritance by VIPER Coalescer
  protected:
    bool tryCacheAccess(Addr addr, RubyRequestType type,
                        Addr pc, RubyAccessMode access_mode,
                        int size, DataBlock*& data_ptr);
    // Alternate implementations in VIPER Coalescer
    virtual void issueRequest(CoalescedRequest* crequest);

    void kernelCallback(int wavfront_id);
    // since the two following issue functions are protocol-specific,
    // they must be implemented in a derived coalescer
    virtual void issueRequest(CoalescedRequest* crequest) = 0;
    virtual void issueMemSyncRequest(PacketPtr pkt) = 0;

    void kernelCallback(int wavefront_id);

    void hitCallback(CoalescedRequest* crequest,
                     MachineType mach,
@@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort
                     bool success, bool isRegion);
    void completeHitCallback(std::vector<PacketPtr> & mylist);

    virtual RubyRequestType getRequestType(PacketPtr pkt);

    // Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort

    EventFunctionWrapper issueEvent;

    // Changed to protected to enable inheritance by VIPER Coalescer
  protected:
    int m_max_outstanding_requests;
    Cycles m_deadlock_threshold;
@@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort
    // an address, they are serviced in age order.
    std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;

    // a map between an instruction sequence number and PendingWriteInst;
    // this is used to do a final callback for each write when it is
    // completely done in the memory system
    std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;

    // Global outstanding request count, across all request tables
    int m_outstanding_count;
    bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ class GPUCoalescer : public RubyPort
    EventFunctionWrapper deadlockCheckEvent;
    bool assumingRfOCoherence;

    // m5 style stats for TCP hit/miss counts
    Stats::Scalar GPU_TCPLdHits;
    Stats::Scalar GPU_TCPLdTransfers;
    Stats::Scalar GPU_TCCLdHits;
    Stats::Scalar GPU_LdMiss;

    Stats::Scalar GPU_TCPStHits;
    Stats::Scalar GPU_TCPStTransfers;
    Stats::Scalar GPU_TCCStHits;
    Stats::Scalar GPU_StMiss;

    Stats::Scalar CP_TCPLdHits;
    Stats::Scalar CP_TCPLdTransfers;
    Stats::Scalar CP_TCCLdHits;
    Stats::Scalar CP_LdMiss;

    Stats::Scalar CP_TCPStHits;
    Stats::Scalar CP_TCPStTransfers;
    Stats::Scalar CP_TCCStHits;
    Stats::Scalar CP_StMiss;
    // TODO - Need to update the following stats once the VIPER protocol
    // is re-integrated.
    // // m5 style stats for TCP hit/miss counts
    // Stats::Scalar GPU_TCPLdHits;
    // Stats::Scalar GPU_TCPLdTransfers;
    // Stats::Scalar GPU_TCCLdHits;
    // Stats::Scalar GPU_LdMiss;
    //
    // Stats::Scalar GPU_TCPStHits;
    // Stats::Scalar GPU_TCPStTransfers;
    // Stats::Scalar GPU_TCCStHits;
    // Stats::Scalar GPU_StMiss;
    //
    // Stats::Scalar CP_TCPLdHits;
    // Stats::Scalar CP_TCPLdTransfers;
    // Stats::Scalar CP_TCCLdHits;
    // Stats::Scalar CP_LdMiss;
    //
    // Stats::Scalar CP_TCPStHits;
    // Stats::Scalar CP_TCPStTransfers;
    // Stats::Scalar CP_TCCStHits;
    // Stats::Scalar CP_StMiss;

    //! Histogram for number of outstanding requests per cycle.
    Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort
    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;

    // TODO - Need to update the following stats once the VIPER protocol
    // is re-integrated.
    // Stats::Distribution numHopDelays;
    // Stats::Distribution tcpToTccDelay;
    // Stats::Distribution tccToSdDelay;
    // Stats::Distribution sdToSdDelay;
    // Stats::Distribution sdToTccDelay;
    // Stats::Distribution tccToTcpDelay;
    //
    // Stats::Average avgTcpToTcc;
    // Stats::Average avgTccToSd;
    // Stats::Average avgSdToSd;
    // Stats::Average avgSdToTcc;
    // Stats::Average avgTccToTcp;

  private:
    // Token port is used to send/receive tokens to/from GPU's global memory
    // pipeline across the port boundary. There is one per <wave size> data

@@ -36,6 +36,7 @@ from m5.objects.Sequencer import *

class RubyGPUCoalescer(RubyPort):
    type = 'RubyGPUCoalescer'
    abstract = True
    cxx_class = 'GPUCoalescer'
    cxx_header = "mem/ruby/system/GPUCoalescer.hh"

@@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort):
                   "max requests (incl. prefetches) outstanding")
    max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
                                        "coalesced in a single cycle")
    assume_rfo = Param.Bool(True, "assume protocol implements Read for "
                            "Ownership coherence")

    icache = Param.RubyCache("")
    dcache = Param.RubyCache("")

@@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer
    VIPERCoalescer(const Params *);
    ~VIPERCoalescer();

    void issueMemSyncRequest(PacketPtr pkt);
    void issueMemSyncRequest(PacketPtr pkt) override;
    void issueRequest(CoalescedRequest* crequest) override;
    void wbCallback(Addr address);
    void invCallback(Addr address);

@@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer):
    cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
    max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
    max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
    assume_rfo = False