gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

5
build_opts/GCN3_X86 Normal file
View File

@@ -0,0 +1,5 @@
PROTOCOL = 'GPU_VIPER'
TARGET_ISA = 'x86'
TARGET_GPU_ISA = 'gcn3'
BUILD_GPU = True
CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'

View File

@@ -48,7 +48,7 @@ def TLB_constructor(level):
maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
accessDistance = options.L%(level)dAccessDistanceStat,\
clk_domain = SrcClockDomain(\
clock = options.GPUClock,\
clock = options.gpu_clock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
@@ -60,23 +60,22 @@ def Coalescer_constructor(level):
coalescingWindow = options.L%(level)dCoalescingWindow,\
disableCoalescing = options.L%(level)dDisableCoalescing,\
clk_domain = SrcClockDomain(\
clock = options.GPUClock,\
clock = options.gpu_clock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
# arguments: options, TLB level, number of private structures for this Level,
# TLB name and Coalescer name
def create_TLB_Coalescer(options, my_level, my_index, tlb_name,
coalescer_name):
# arguments: options, TLB level, number of private structures for this
# Level, TLB name and Coalescer name
for i in range(my_index):
TLB_name.append(eval(TLB_constructor(my_level)))
Coalescer_name.append(eval(Coalescer_constructor(my_level)))
tlb_name.append(eval(TLB_constructor(my_level)))
coalescer_name.append(eval(Coalescer_constructor(my_level)))
def config_tlb_hierarchy(options, system, shader_idx):
n_cu = options.num_compute_units
# Make this configurable now, instead of the hard coded val. The dispatcher
# is always the last item in the system.cpu list.
dispatcher_idx = len(system.cpu) - 1
n_cu = options.cu_per_sa * options.sa_per_complex * \
options.num_gpu_complexes
if options.TLB_config == "perLane":
num_TLBs = 64 * n_cu
@@ -90,21 +89,26 @@ def config_tlb_hierarchy(options, system, shader_idx):
print("Bad option for TLB Configuration.")
sys.exit(1)
#----------------------------------------------------------------------------------------
#-------------------------------------------------------------------------
# A visual representation of the TLB hierarchy
# for ease of configuration
# < Modify here the width and the number of levels if you want a different configuration >
# width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level
L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
{'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
{'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]
# < Modify here the width and the number of levels if you want a different
# configuration >
# width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc)
# for this level
L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [],
'CoalescerArray': []},
{'name': 'scalar', 'width' : options.num_scalar_cache,
'TLBarray': [], 'CoalescerArray': []},
{'name': 'l1', 'width': num_TLBs, 'TLBarray': [],
'CoalescerArray': []}]
L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
TLB_hierarchy = [L1, L2, L3]
#----------------------------------------------------------------------------------------
#-------------------------------------------------------------------------
# Create the hiearchy
# Call the appropriate constructors and add objects to the system
@@ -164,17 +168,14 @@ def config_tlb_hierarchy(options, system, shader_idx):
for tlb in range(tlb_per_cu):
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
(shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
(shader_idx, cu_idx, tlb,
cu_idx*tlb_per_cu+tlb, 0))
else:
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
(shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))
elif name == 'dispatcher': # Dispatcher TLB
for index in range(TLB_type['width']):
exec('system.cpu[%d].translation_port = \
system.dispatcher_coalescer[%d].slave[0]' % \
(dispatcher_idx, index))
(shader_idx, cu_idx, tlb_per_cu,
cu_idx / (n_cu / num_TLBs),
cu_idx % (n_cu / num_TLBs)))
elif name == 'sqc': # I-TLB
for index in range(n_cu):
sqc_tlb_index = index / options.cu_per_sqc
@@ -182,7 +183,14 @@ def config_tlb_hierarchy(options, system, shader_idx):
exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
system.sqc_coalescer[%d].slave[%d]' % \
(shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))
elif name == 'scalar': # Scalar D-TLB
for index in range(n_cu):
scalar_tlb_index = index / options.cu_per_scalar_cache
scalar_tlb_port_id = index % options.cu_per_scalar_cache
exec('system.cpu[%d].CUs[%d].scalar_tlb_port = \
system.scalar_coalescer[%d].slave[%d]' % \
(shader_idx, index, scalar_tlb_index,
scalar_tlb_port_id))
# Connect the memSidePorts (masters) of all the TLBs with the
# cpuSidePorts (slaves) of the Coalescers of the next level

View File

@@ -3728,7 +3728,7 @@ namespace Gcn3ISA
DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
wf->computeUnit->cu_id, wf->wgId, refCount);
wf->computeUnit->registerManager.freeRegisters(wf);
wf->computeUnit->registerManager->freeRegisters(wf);
wf->computeUnit->completedWfs++;
wf->computeUnit->activeWaves--;

View File

@@ -192,7 +192,7 @@ namespace Gcn3ISA
*/
bool misaligned_acc = split_addr > vaddr;
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -208,7 +208,6 @@ namespace Gcn3ISA
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
delete req;
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
@@ -243,7 +242,7 @@ namespace Gcn3ISA
*/
bool misaligned_acc = split_addr > vaddr;
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -259,7 +258,6 @@ namespace Gcn3ISA
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
delete req;
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
@@ -574,7 +572,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -600,7 +599,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
@@ -619,7 +619,7 @@ namespace Gcn3ISA
{
// create request and set flags
gpuDynInst->statusBitVector = VectorMask(1);
Request *req = new Request(0, 0, 0, 0,
RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
masterId(), 0,
gpuDynInst->wfDynId);
@@ -777,7 +777,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -802,7 +803,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size,
0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
@@ -826,7 +828,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
@@ -851,7 +854,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, req_size, 0,
RequestPtr req = std::make_shared<Request>(vaddr, req_size,
0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
@@ -875,7 +879,8 @@ namespace Gcn3ISA
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
RequestPtr req = std::make_shared<Request>(vaddr,
sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId,
gpuDynInst->makeAtomicOpFunctor<T>(

View File

@@ -153,7 +153,7 @@ namespace Gcn3ISA
ComputeUnit *cu = _gpuDynInst->computeUnit();
for (auto i = 0; i < NumDwords; ++i) {
int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i);
int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
@@ -207,7 +207,7 @@ namespace Gcn3ISA
? _gpuDynInst->exec_mask : wf->execMask();
if (NumDwords == 1) {
int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx);
int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
assert(vrfData[0]);
auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
@@ -223,8 +223,8 @@ namespace Gcn3ISA
DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
} else if (NumDwords == 2) {
int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx);
int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1);
int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
assert(vrfData[0]);
@@ -605,16 +605,16 @@ namespace Gcn3ISA
if (_opIdx == REG_VCC_LO) {
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_HI) {
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_LO) {
assert(NumDwords == 1);
sgprIdx = cu->registerManager
.mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
} else {
sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword);
sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
}
assert(sgprIdx > -1);

View File

@@ -101,7 +101,7 @@ HSADevice::translateOrDie(Addr vaddr, Addr &paddr)
* with new extensions, it will likely be wrong to just arbitrarily
* grab context zero.
*/
auto process = sys->getThreadContext(0)->getProcessPtr();
auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr)) {
fatal("failed translation: vaddr 0x%x\n", vaddr);

View File

@@ -92,3 +92,28 @@ HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start);
return start;
}
/**
* Forward relevant parameters to packet processor; queueID
* is used to link doorbell. The queueIDs are not re-used
* in current implementation, and we allocate only one page
* (4096 bytes) for doorbells, so check if this queue ID can
* be mapped into that page.
*/
void
HSADriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
{
TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
args.copyIn(mem_proxy);
if (queueId >= 0x1000) {
fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
}
args->queue_id = queueId++;
auto &hsa_pp = device->hsaPacketProc();
hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
args->ring_base_address, args->queue_id,
args->ring_size);
args.copyOut(mem_proxy);
}

View File

@@ -56,7 +56,7 @@
struct HSADriverParams;
class HSADevice;
class SETranslatingPortProxy;
class PortProxy;
class ThreadContext;
class HSADriver : public EmulatedDriver
@@ -74,8 +74,7 @@ class HSADriver : public EmulatedDriver
HSADevice *device;
uint32_t queueId;
void allocateQueue(const SETranslatingPortProxy &mem_proxy,
Addr ioc_buf_addr);
void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf);
};
#endif // __DEV_HSA_HSA_DRIVER_HH__

View File

@@ -151,7 +151,7 @@ HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr)
// Grab the process and try to translate the virtual address with it; with
// new extensions, it will likely be wrong to just arbitrarily grab context
// zero.
auto process = sys->getThreadContext(0)->getProcessPtr();
auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr))
fatal("failed translation: vaddr 0x%x\n", vaddr);
@@ -393,7 +393,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
* The reason for this is that the DMASequencer does
* not support atomic operations.
*/
auto tc = sys->getThreadContext(0);
auto tc = sys->threads[0];
auto &virt_proxy = tc->getVirtProxy();
TypedBufferArg<uint64_t> prev_signal(signal_addr);
prev_signal.copyIn(virt_proxy);

View File

@@ -92,7 +92,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
// We use the same mapping function used by hsa runtime to do this mapping
//
// Originally
// #define VOID_PTR_ADD32(ptr,n) \
// #define VOID_PTR_ADD32(ptr,n)
// (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
// (Addr)VOID_PTR_ADD32(0, queue_id)
Addr db_offset = queue_id;
@@ -343,7 +343,7 @@ HWScheduler::unregisterQueue(uint64_t queue_id)
// `(Addr)(VOID_PRT_ADD32(0, queue_id))`
//
// Originally
// #define VOID_PTR_ADD32(ptr,n) \
// #define VOID_PTR_ADD32(ptr,n)
// (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
// (Addr)VOID_PTR_ADD32(0, queue_id)
Addr db_offset = queue_id;

View File

@@ -1,48 +1,48 @@
# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
# For use for simulation and test purposes only
#
# For use for simulation and test purposes only
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Steve Reinhardt
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
from m5.objects.Bridge import Bridge
from m5.objects.ClockedObject import ClockedObject
from m5.objects.Device import DmaDevice
from m5.objects.Process import EmulatedDriver
from m5.objects.Bridge import Bridge
from m5.objects.HSADevice import HSADevice
from m5.objects.HSADriver import HSADriver
from m5.objects.LdsState import LdsState
from m5.objects.Process import EmulatedDriver
class PrefetchType(Enum): vals = [
'PF_CU',
@@ -52,15 +52,48 @@ class PrefetchType(Enum): vals = [
'PF_END',
]
class VectorRegisterFile(SimObject):
class PoolManager(SimObject):
type = 'PoolManager'
abstract = True
cxx_header = "gpu-compute/pool_manager.hh"
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
pool_size = Param.Int(2048, 'number of vector registers per SIMD')
# The simple pool manage only allows one workgroup to
# be executing on a CU at any given time.
class SimplePoolManager(PoolManager):
type = 'SimplePoolManager'
cxx_class = 'SimplePoolManager'
cxx_header = "gpu-compute/simple_pool_manager.hh"
class RegisterFile(SimObject):
type = 'RegisterFile'
cxx_class = 'RegisterFile'
cxx_header = 'gpu-compute/register_file.hh'
simd_id = Param.Int(-1, 'SIMD ID associated with this Register File')
num_regs = Param.Int(2048, 'number of registers in this RF')
wf_size = Param.Int(64, 'Wavefront size (in work items)')
class ScalarRegisterFile(RegisterFile):
type = 'ScalarRegisterFile'
cxx_class = 'ScalarRegisterFile'
cxx_header = 'gpu-compute/scalar_register_file.hh'
class VectorRegisterFile(RegisterFile):
type = 'VectorRegisterFile'
cxx_class = 'VectorRegisterFile'
cxx_header = 'gpu-compute/vector_register_file.hh'
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
class RegisterManager(SimObject):
type = 'RegisterManager'
cxx_class = 'RegisterManager'
cxx_header = 'gpu-compute/register_manager.hh'
policy = Param.String("static", "Register Manager Policy")
vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers')
srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers')
class Wavefront(SimObject):
type = 'Wavefront'
@@ -69,45 +102,68 @@ class Wavefront(SimObject):
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
wf_size = Param.Int(64, 'Wavefront size (in work items)')
max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the '
'instruction buffer (IB).')
# Most of the default values here are obtained from the
# AMD Graphics Core Next (GCN) Architecture whitepaper.
class ComputeUnit(ClockedObject):
type = 'ComputeUnit'
cxx_class = 'ComputeUnit'
cxx_header = 'gpu-compute/compute_unit.hh'
wavefronts = VectorParam.Wavefront('Number of wavefronts')
wfSize = Param.Int(64, 'Wavefront size (in work items)')
# Wavefront size is 64. This is configurable, however changing
# this value to anything other than 64 will likely cause errors.
wf_size = Param.Int(64, 'Wavefront size (in work items)')
num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
'per CU')
simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit')
operand_network_length = Param.Int(1, 'number of pipe stages of operand '\
'network')
spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
'latency')
dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\
'latency')
scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU')
issue_period = Param.Int(4, 'number of cycles per issue period')
vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
'GM bus')
srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\
'to Scalar Mem bus')
vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
'LM bus')
num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
"Represents the pipeline to reach the TCP and "\
"specified in GPU clock cycles")
mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
"cu. Represents the pipeline between the TCP "\
"and cu as well as TCP data array access. "\
"Specified in GPU clock cycles")
n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\
"Represents the pipeline to reach the TCP "\
"and specified in GPU clock cycles")
mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\
"cu. Represents the pipeline between the "\
"TCP and cu as well as TCP data array "\
"access. Specified in GPU clock cycles")
system = Param.System(Parent.any, "system object")
cu_id = Param.Int('CU id')
vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
"in bytes")
coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
"in bytes")
vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\
"width in bytes")
coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\
"width in bytes")
memory_port = VectorMasterPort("Port to the memory system")
translation_port = VectorMasterPort('Port to the TLB hierarchy')
sqc_port = MasterPort("Port to the SQC (I-cache")
sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
scalar_port = MasterPort("Port to the scalar data cache")
scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache")
perLaneTLB = Param.Bool(False, "enable per-lane TLB")
prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
"(0 turns off prefetching)")
@@ -116,19 +172,22 @@ class ComputeUnit(ClockedObject):
"from last mem req in lane of "\
"CU|Phase|Wavefront")
execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
"kernel end")
countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
"and how many times")
countPages = Param.Bool(False, "Generate per-CU file of all pages "\
"touched and how many times")
scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\
"memory pipeline's queues")
global_mem_queue_size = Param.Int(256, "Number of entries in the global "
"memory pipeline's queues")
local_mem_queue_size = Param.Int(256, "Number of entries in the local "
"memory pipeline's queues")
max_wave_requests = Param.Int(64, "number of pending vector memory "\
"requests per wavefront")
max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
" of instructions that can be sent to coalescer")
ldsBus = Bridge() # the bridge between the CU and its LDS
@@ -137,72 +196,54 @@ class ComputeUnit(ClockedObject):
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
"file")
scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\
"file")
out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
" in the GM pipeline")
register_manager = Param.RegisterManager("Register Manager")
fetch_depth = Param.Int(2, 'number of i-cache lines that may be '
'buffered in the fetch unit.')
class Shader(ClockedObject):
type = 'Shader'
cxx_class = 'Shader'
cxx_header = 'gpu-compute/shader.hh'
CUs = VectorParam.ComputeUnit('Number of compute units')
n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
ruby at kernel boundaries""")
separate_acquire_release = Param.Bool(False,
"""Do ld_acquire/st_release generate separate requests for the
acquire and release?""")
ruby at kernel boundaries""")
globalmem = Param.MemorySize('64kB', 'Memory size')
timing = Param.Bool(False, 'timing memory accesses')
cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
translation = Param.Bool(False, "address translation");
timer_period = Param.Clock('10us', "system timer period")
idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
class ClDriver(EmulatedDriver):
type = 'ClDriver'
cxx_header = 'gpu-compute/cl_driver.hh'
codefile = VectorParam.String('code file name(s)')
class GPUComputeDriver(HSADriver):
type = 'GPUComputeDriver'
cxx_header = 'gpu-compute/gpu_compute_driver.hh'
class GpuDispatcher(DmaDevice):
type = 'GpuDispatcher'
class GPUDispatcher(SimObject):
type = 'GPUDispatcher'
cxx_header = 'gpu-compute/dispatcher.hh'
# put at 8GB line for now
pio_addr = Param.Addr(0x200000000, "Device Address")
pio_latency = Param.Latency('1ns', "Programmed IO latency")
shader_pointer = Param.Shader('pointer to shader')
translation_port = MasterPort('Port to the dispatcher TLB')
cpu = Param.BaseCPU("CPU to wake up on kernel completion")
cl_driver = Param.ClDriver('pointer to driver')
class MemType(Enum): vals = [
'M_U8',
'M_U16',
'M_U32',
'M_U64',
'M_S8',
'M_S16',
'M_S32',
'M_S64',
'M_F16',
'M_F32',
'M_F64',
]
class GPUCommandProcessor(HSADevice):
type = 'GPUCommandProcessor'
cxx_header = 'gpu-compute/gpu_command_processor.hh'
dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')
class StorageClassType(Enum): vals = [
'SC_SPILL',
'SC_GLOBAL',
'SC_SHARED',
'SC_GROUP',
'SC_PRIVATE',
'SC_READONLY',
'SC_KERNARG',
'SC_ARG',
'SC_NONE',
]
class RegisterType(Enum): vals = [
'RT_VECTOR',
'RT_SCALAR',
'RT_CONDITION',
'RT_HARDWARE',
'RT_NONE',
]

View File

@@ -13,9 +13,9 @@
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -40,15 +40,18 @@ class GPUStaticInstFlags(Enum):
# Op types
'ALU', # ALU op
'Branch', # Branch instruction
'CondBranch', # Conditinal Branch instruction
'Nop', # No-op (no effect at all)
'Return', # Return instruction
'Return', # Subroutine return instruction
'EndOfKernel', # Kernel termination instruction
'KernelLaunch', # Kernel launch inst
'UnconditionalJump', #
'SpecialOp', # Special op
'Waitcnt', # Is a waitcnt instruction
# Memory ops
'MemBarrier', # Barrier instruction
'MemFence', # Memory fence instruction
'MemSync', # Synchronizing instruction
'MemoryRef', # References memory (load, store, or atomic)
'Flat', # Flat memory op
'Load', # Reads from memory
@@ -64,6 +67,13 @@ class GPUStaticInstFlags(Enum):
'WritesSCC', # The instruction writes SCC
'ReadsVCC', # The instruction reads VCC
'WritesVCC', # The instruction writes VCC
'ReadsEXEC', # The instruction reads Exec Mask
'WritesEXEC', # The instruction writes Exec Mask
'ReadsMode', # The instruction reads Mode register
'WritesMode', # The instruction writes Mode register
'IgnoreExec', # The instruction ignores the Exec Mask
'IsSDWA', # The instruction is a SDWA instruction
'IsDPP', # The instruction is a DPP instruction
# Atomic OP types
'AtomicAnd',
@@ -78,13 +88,6 @@ class GPUStaticInstFlags(Enum):
'AtomicMax',
'AtomicMin',
# Memory order flags
'RelaxedOrder',
'Acquire', # Has acquire semantics
'Release', # Has release semantics
'AcquireRelease', # Has acquire and release semantics
'NoOrder', # Has no ordering restrictions
# Segment access flags
'ArgSegment', # Accesses the arg segment
'GlobalSegment', # Accesses global memory
@@ -95,15 +98,17 @@ class GPUStaticInstFlags(Enum):
'SpillSegment', # Accesses the spill segment
'NoSegment', # Does not have an associated segment
# Scope flags
'WorkitemScope',
'WavefrontScope',
'WorkgroupScope',
'DeviceScope',
'SystemScope',
'NoScope', # Does not have an associated scope
# Coherence flags
'GloballyCoherent', # Coherent with other workitems on same device
'SystemCoherent' # Coherent with a different device, or the host
'GloballyCoherent', # Coherent with other work-items on same device
'SystemCoherent', # Coherent with a different device, or the host
# Floating-point flags
'F16', # F16 operation
'F32', # F32 operation
'F64', # F64 operation
# MAC, MAD, FMA
'FMA', # FMA
'MAC', # MAC
'MAD' # MAD
]

View File

@@ -41,56 +41,62 @@ SimObject('GPUStaticInstFlags.py')
SimObject('LdsState.py')
SimObject('X86GPUTLB.py')
if env['TARGET_GPU_ISA'] == 'hsail':
Source('brig_object.cc')
Source('hsail_code.cc')
Source('cl_driver.cc')
Source('compute_unit.cc')
Source('condition_register_state.cc')
Source('dispatcher.cc')
Source('exec_stage.cc')
Source('fetch_stage.cc')
Source('fetch_unit.cc')
Source('global_memory_pipeline.cc')
Source('gpu_command_processor.cc')
Source('gpu_compute_driver.cc')
Source('gpu_dyn_inst.cc')
Source('gpu_exec_context.cc')
Source('gpu_static_inst.cc')
Source('gpu_tlb.cc')
Source('hsa_object.cc')
Source('kernel_cfg.cc')
Source('lds_state.cc')
Source('local_memory_pipeline.cc')
Source('pool_manager.cc')
Source('register_file.cc')
Source('register_manager.cc')
Source('scalar_memory_pipeline.cc')
Source('scalar_register_file.cc')
Source('schedule_stage.cc')
Source('scheduler.cc')
Source('scoreboard_check_stage.cc')
Source('shader.cc')
Source('simple_pool_manager.cc')
Source('static_register_manager_policy.cc')
Source('tlb_coalescer.cc')
Source('vector_register_file.cc')
Source('vector_register_state.cc')
Source('wavefront.cc')
DebugFlag('BRIG')
DebugFlag('GPUCoalescer')
DebugFlag('GPUCommandProc')
DebugFlag('GPUDriver')
DebugFlag('GPUInitAbi')
DebugFlag('GPUDisp')
DebugFlag('GPUExec')
DebugFlag('GPUFetch')
DebugFlag('GPUHsailCFInfo')
DebugFlag('GPUKernelInfo')
DebugFlag('GPUMem')
DebugFlag('GPUPort')
DebugFlag('GPUPrefetch')
DebugFlag('GPUReg')
DebugFlag('GPURename')
DebugFlag('GPURF')
DebugFlag('GPURfState')
DebugFlag('GPUSched')
DebugFlag('GPUShader')
DebugFlag('GPUSRF')
DebugFlag('GPUSync')
DebugFlag('GPUTLB')
DebugFlag('GPUVRF')
DebugFlag('HSALoader')
DebugFlag('HSAIL')
DebugFlag('HSAILObject')
DebugFlag('GPUVRFSched')
DebugFlag('GPUWgLatency')
DebugFlag('Predictor')
DebugFlag('WavefrontStack')
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL',
'GPUVRF'])
'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
'GPUInitAbi'])

File diff suppressed because it is too large Load Diff

View File

@@ -36,28 +36,30 @@
#include <deque>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;
class NDRange;
class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
@@ -69,18 +71,6 @@ enum EXEC_POLICY
RR
};
// List of execution units
enum EXEC_UNIT
{
SIMD0 = 0,
SIMD1,
SIMD2,
SIMD3,
GLBMEM_PIPE,
LDSMEM_PIPE,
NUM_UNITS
};
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
@@ -92,32 +82,100 @@ enum TLB_CACHE
class ComputeUnit : public ClockedObject
{
public:
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
// Execution resources
//
// The ordering of units is:
// Vector ALUs
// Scalar ALUs
// GM Pipe
// LM Pipe
// Scalar Mem Pipe
//
// Note: the ordering of units is important and the code assumes the
// above ordering. However, there may be more than one resource of
// each type (e.g., 4 VALUs or 2 SALUs)
int numVectorGlobalMemUnits;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for Vector Register File->Global Memory pipe buses
WaitClass vrfToGlobalMemPipeBus;
// Resource control for Vector Global Memory execution unit
WaitClass vectorGlobalMemUnit;
int numVectorSharedMemUnits;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
// Resource control for Vector Register File->Local Memory pipe buses
WaitClass vrfToLocalMemPipeBus;
// Resource control for Vector Shared/Local Memory execution unit
WaitClass vectorSharedMemUnit;
int numScalarMemUnits;
// Resource control for scalar memory to SRF data/address bus
WaitClass scalarMemToSrfBus;
// Resource control for Scalar Register File->Scalar Memory pipe buses
WaitClass srfToScalarMemPipeBus;
// Resource control for Scalar Memory execution unit
WaitClass scalarMemUnit;
// vector ALU execution resources
int numVectorALUs;
std::vector<WaitClass> vectorALUs;
// scalar ALU execution resources
int numScalarALUs;
std::vector<WaitClass> scalarALUs;
// Return total number of execution units on this CU
int numExeUnits() const;
// index into readyList of the first memory unit
int firstMemUnit() const;
// index into readyList of the last memory unit
int lastMemUnit() const;
// index into scalarALUs vector of SALU used by the wavefront
int mapWaveToScalarAlu(Wavefront *w) const;
// index into readyList of SALU used by wavefront
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
// index into readyList of Global Memory unit used by wavefront
int mapWaveToGlobalMem(Wavefront *w) const;
// index into readyList of Local Memory unit used by wavefront
int mapWaveToLocalMem(Wavefront *w) const;
// index into readyList of Scalar Memory unit used by wavefront
int mapWaveToScalarMem(Wavefront *w) const;
int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
int numCyclesPerStoreTransfer; // number of cycles per vector store
int numCyclesPerLoadTransfer; // number of cycles per vector load
// Buffers used to communicate between various pipeline stages
// At a high level, the following intra-/inter-stage communication occurs:
// SCB to SCH: readyList provides per exec resource list of waves that
// passed dependency and readiness checks. If selected by
// scheduler, attempt to add wave to schList conditional on
// RF support.
// SCH: schList holds waves that are gathering operands or waiting
// for execution resource availability. Once ready, waves are
// placed on the dispatchList as candidates for execution. A wave
// may spend multiple cycles in SCH stage, on the schList due to
// RF access conflicts or execution resource contention.
// SCH to EX: dispatchList holds waves that are ready to be executed.
// LM/FLAT arbitration may remove an LM wave and place it
// back on the schList. RF model may also force a wave back
// to the schList if using the detailed model.
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list. readyList is
// used to communicate between scoreboardCheck stage and
// schedule stage
// TODO: make enum to index readyList
std::vector<std::vector<Wavefront*>> readyList;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList. waveStatusList is
// used to communicate between scoreboardCheck stage and
// schedule stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
// each execution resource. An EXREADY implies
// dispatch list is non-empty and
// execution unit has something to execute
// this cycle. Currently, the dispatch list of
@@ -127,32 +185,67 @@ class ComputeUnit : public ClockedObject
// and exec stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
// track presence of dynamic instructions in the Schedule pipeline
// stage. This is used to check the readiness of the oldest,
// non-dispatched instruction of every WF in the Scoreboard stage.
std::unordered_set<uint64_t> pipeMap;
RegisterManager* registerManager;
FetchStage fetchStage;
ScoreboardCheckStage scoreboardCheckStage;
ScheduleStage scheduleStage;
ExecStage execStage;
GlobalMemPipeline globalMemoryPipe;
LocalMemPipeline localMemoryPipe;
ScalarMemPipeline scalarMemoryPipe;
EventFunctionWrapper tickEvent;
int rrNextMemID; // used by RR WF exec policy to cycle through WF's
int rrNextALUWp;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
// Number of vector ALU units (SIMDs) in CU
int numSIMDs;
// array of scalar register files, one per SIMD
std::vector<ScalarRegisterFile*> srf;
// Width per VALU/SIMD unit: number of work items that can be executed
// on the vector ALU simultaneously in a SIMD unit
int simdWidth;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of cycles per issue period
int issuePeriod;
// number of pipe stages for scalar ALU
int scalarPipeStages;
// number of pipe stages for operand collection & distribution network
int operandNetworkLength;
// number of cycles per instruction issue period
Cycles issuePeriod;
// VRF to GM Bus latency
Cycles vrf_gm_bus_latency;
// SRF to Scalar Mem Bus latency
Cycles srf_scm_bus_latency;
// VRF to LM Bus latency
Cycles vrf_lm_bus_latency;
// Number of global and local memory execution resources in CU
int numGlbMemUnits;
int numLocMemUnits;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
// Track the amount of interleaving between wavefronts on each SIMD.
// This stat is sampled using instExecPerSimd to compute the number of
// instructions that have been executed on a SIMD between a WF executing
// two successive instructions.
Stats::VectorDistribution instInterleave;
// tracks the number of dyn inst executed per SIMD
std::vector<uint64_t> instExecPerSimd;
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
@@ -166,8 +259,10 @@ class ComputeUnit : public ClockedObject
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
bool xact_cas_mode;
bool debugSegFault;
// Idle CU timeout in ticks
Tick idleCUTimeout;
int idleWfs;
bool functionalTLB;
bool localMemBarrier;
@@ -183,91 +278,67 @@ class ComputeUnit : public ClockedObject
Shader *shader;
uint32_t barrier_id;
// vector of Vector ALU (MACC) pipelines
std::vector<WaitClass> aluPipe;
// minimum issue period per SIMD unit (in cycles)
std::vector<WaitClass> wfWait;
// Resource control for Vector Register File->Global Memory pipe buses
std::vector<WaitClass> vrfToGlobalMemPipeBus;
// Resource control for Vector Register File->Local Memory pipe buses
std::vector<WaitClass> vrfToLocalMemPipeBus;
int nextGlbMemBus;
int nextLocMemBus;
// Resource control for global memory to VRF data/address bus
WaitClass glbMemToVrfBus;
// Resource control for local memory to VRF data/address bus
WaitClass locMemToVrfBus;
uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
Tick req_tick_latency;
Tick resp_tick_latency;
// number of vector registers being reserved for each SIMD unit
/**
* Number of WFs to schedule to each SIMD. This vector is populated
* by hasDispResources(), and consumed by the subsequent call to
* dispWorkgroup(), to schedule the specified number of WFs to the
* SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
*/
std::vector<int> numWfsToSched;
// number of currently reserved vector registers per SIMD unit
std::vector<int> vectorRegsReserved;
// number of currently reserved scalar registers per SIMD unit
std::vector<int> scalarRegsReserved;
// number of vector registers per SIMD unit
uint32_t numVecRegsPerSimd;
// Support for scheduling VGPR status update events
std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
std::vector<uint64_t> timestampVec;
std::vector<uint8_t> statusVec;
int numVecRegsPerSimd;
// number of available scalar registers per SIMD unit
int numScalarRegsPerSimd;
void
registerEvent(uint32_t simdId,
uint32_t regIdx,
uint32_t operandSize,
uint64_t when,
uint8_t newStatus) {
regIdxVec.push_back(std::make_pair(simdId, regIdx));
timestampVec.push_back(when);
statusVec.push_back(newStatus);
if (operandSize > 4) {
regIdxVec.push_back(std::make_pair(simdId,
((regIdx + 1) %
numVecRegsPerSimd)));
timestampVec.push_back(when);
statusVec.push_back(newStatus);
}
}
void updateEvents();
void updateReadyList(int unitId);
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
void insertInPipeMap(Wavefront *w);
void deleteFromPipeMap(Wavefront *w);
ComputeUnit(const Params *p);
~ComputeUnit();
int spBypassLength() { return spBypassPipeLength; };
int dpBypassLength() { return dpBypassPipeLength; };
int storeBusLength() { return numCyclesPerStoreTransfer; };
int loadBusLength() { return numCyclesPerLoadTransfer; };
int wfSize() const { return wavefrontSize; };
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Timing Functions
int oprNetPipeLength() const { return operandNetworkLength; }
int simdUnitWidth() const { return simdWidth; }
int spBypassLength() const { return spBypassPipeLength; }
int dpBypassLength() const { return dpBypassPipeLength; }
int scalarPipeLength() const { return scalarPipeStages; }
int storeBusLength() const { return numCyclesPerStoreTransfer; }
int loadBusLength() const { return numCyclesPerLoadTransfer; }
int wfSize() const { return wavefrontSize; }
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void fillKernelState(Wavefront *w, NDRange *ndr);
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
NDRange *ndr);
HSAQueueEntry *task, bool fetchContext=false);
void StartWorkgroup(NDRange *ndr);
int ReadyWorkgroup(NDRange *ndr);
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false);
bool hasDispResources(HSAQueueEntry *task);
int cacheLineSize() const { return _cacheLineSize; }
int getCacheLineBits() const { return cacheLineBits; }
bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
int GlbMemUnitId() { return GLBMEM_PIPE; }
int ShrMemUnitId() { return LDSMEM_PIPE; }
int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
/* This function cycles through all the wavefronts in all the phases to see
* if all of the wavefronts which should be associated with one barrier
* (denoted with _barrier_id), are all at the same barrier in the program
@@ -275,14 +346,15 @@ class ComputeUnit : public ClockedObject
* return true.
*/
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
bool cedeSIMD(int simdId, int wfSlotId);
template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
template<typename c0, typename c1>
void doSmReturn(GPUDynInstPtr gpuDynInst);
virtual void init() override;
void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
bool kernelLaunch=true,
bool kernelMemSync,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
@@ -292,7 +364,7 @@ class ComputeUnit : public ClockedObject
MasterID masterId() { return _masterId; }
bool isDone() const;
bool isSimdDone(uint32_t) const;
bool isVectorAluIdle(uint32_t simdId) const;
protected:
MasterID _masterId;
@@ -323,6 +395,44 @@ class ComputeUnit : public ClockedObject
Stats::Scalar scalarMemReads;
Stats::Formula scalarMemReadsPerWF;
Stats::Formula vectorMemReadsPerKiloInst;
Stats::Formula vectorMemWritesPerKiloInst;
Stats::Formula vectorMemInstsPerKiloInst;
Stats::Formula scalarMemReadsPerKiloInst;
Stats::Formula scalarMemWritesPerKiloInst;
Stats::Formula scalarMemInstsPerKiloInst;
// Cycles required to send register source (addr and data) from
// register files to memory pipeline, per SIMD.
Stats::Vector instCyclesVMemPerSimd;
Stats::Vector instCyclesScMemPerSimd;
Stats::Vector instCyclesLdsPerSimd;
Stats::Scalar globalReads;
Stats::Scalar globalWrites;
Stats::Formula globalMemInsts;
Stats::Scalar argReads;
Stats::Scalar argWrites;
Stats::Formula argMemInsts;
Stats::Scalar spillReads;
Stats::Scalar spillWrites;
Stats::Formula spillMemInsts;
Stats::Scalar groupReads;
Stats::Scalar groupWrites;
Stats::Formula groupMemInsts;
Stats::Scalar privReads;
Stats::Scalar privWrites;
Stats::Formula privMemInsts;
Stats::Scalar readonlyReads;
Stats::Scalar readonlyWrites;
Stats::Formula readonlyMemInsts;
Stats::Scalar kernargReads;
Stats::Scalar kernargWrites;
Stats::Formula kernargMemInsts;
int activeWaves;
Stats::Distribution waveLevelParallelism;
void updateInstStats(GPUDynInstPtr gpuDynInst);
// the following stats compute the avg. TLB accesslatency per
@@ -339,21 +449,48 @@ class ComputeUnit : public ClockedObject
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
Stats::Distribution pageDivergenceDist;
// count of non-flat global memory vector instructions executed
Stats::Scalar dynamicGMemInstrCnt;
// count of flat global memory vector instructions executed
Stats::Scalar dynamicFlatMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
Stats::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
// when the instruction is committed, this number is still incremented by 1
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
// incremented by 1
Stats::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
Stats::Distribution execRateDist;
// number of individual vector operations executed
Stats::Scalar numVecOpsExecuted;
// number of individual f16 vector operations executed
Stats::Scalar numVecOpsExecutedF16;
// number of individual f32 vector operations executed
Stats::Scalar numVecOpsExecutedF32;
// number of individual f64 vector operations executed
Stats::Scalar numVecOpsExecutedF64;
// number of individual FMA 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedFMA16;
Stats::Scalar numVecOpsExecutedFMA32;
Stats::Scalar numVecOpsExecutedFMA64;
// number of individual MAC 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAC16;
Stats::Scalar numVecOpsExecutedMAC32;
Stats::Scalar numVecOpsExecutedMAC64;
// number of individual MAD 16,32,64 vector operations executed
Stats::Scalar numVecOpsExecutedMAD16;
Stats::Scalar numVecOpsExecutedMAD32;
Stats::Scalar numVecOpsExecutedMAD64;
// total number of two op FP vector operations executed
Stats::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
Stats::Scalar totalCycles;
Stats::Formula vpc; // vector ops per cycle
Stats::Formula vpc_f16; // vector ops per cycle
Stats::Formula vpc_f32; // vector ops per cycle
Stats::Formula vpc_f64; // vector ops per cycle
Stats::Formula ipc; // vector instructions per cycle
Stats::Distribution controlFlowDivergenceDist;
Stats::Distribution activeLanesPerGMemInstrDist;
@@ -362,20 +499,16 @@ class ComputeUnit : public ClockedObject
Stats::Formula numALUInstsExecuted;
// number of times a WG can not start due to lack of free VGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
// number of times a WG can not start due to lack of free SGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueSgprAlloc;
Stats::Scalar numCASOps;
Stats::Scalar numFailedCASOps;
Stats::Scalar completedWfs;
// flag per vector SIMD unit that is set when there is at least one
// WV that has a vector ALU instruction as the oldest in its
// Instruction Buffer: Defined in the Scoreboard stage, consumed
// by the Execute stage.
std::vector<bool> vectorAluInstAvail;
// number of available (oldest) LDS instructions that could have
// been issued to the LDS at a specific issue slot
int shrMemInstAvail;
// number of available Global memory instructions that could have
// been issued to TCP at a specific issue slot
int glbMemInstAvail;
Stats::Scalar completedWGs;
// distrubtion in latency difference between first and last cache block
// arrival ticks
Stats::Distribution headTailLatency;
void
regStats() override;
@@ -389,8 +522,6 @@ class ComputeUnit : public ClockedObject
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
int cacheLineSize() const { return _cacheLineSize; }
bool
sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
@@ -486,6 +617,56 @@ class ComputeUnit : public ClockedObject
};
// Scalar data cache access port
class ScalarDataPort : public MasterPort
{
public:
ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
PortID _index)
: MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index)
{
(void)index;
}
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override;
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst,
Packet::SenderState *sender_state=nullptr)
: _gpuDynInst(gpuDynInst), saved(sender_state)
{
}
GPUDynInstPtr _gpuDynInst;
Packet::SenderState *saved;
};
class MemReqEvent : public Event
{
private:
ScalarDataPort *scalarDataPort;
PacketPtr pkt;
public:
MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
: Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
PortID index;
};
// Instruction cache access port
class SQCPort : public MasterPort
{
@@ -500,10 +681,13 @@ class ComputeUnit : public ClockedObject
{
Wavefront *wavefront;
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr)
: wavefront(_wavefront), saved(sender_state) { }
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
@@ -575,6 +759,34 @@ class ComputeUnit : public ClockedObject
virtual void recvReqRetry();
};
class ScalarDTLBPort : public MasterPort
{
public:
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
: MasterPort(_name, _cu), computeUnit(_cu), stalled(false)
{
}
struct SenderState : public Packet::SenderState
{
SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
GPUDynInstPtr _gpuDynInst;
};
bool recvTimingResp(PacketPtr pkt) override;
void recvReqRetry() override { assert(false); }
bool isStalled() const { return stalled; }
void stallPort() { stalled = true; }
void unstallPort() { stalled = false; }
std::deque<PacketPtr> retries;
private:
ComputeUnit *computeUnit;
bool stalled;
};
class ITLBPort : public MasterPort
{
public:
@@ -710,6 +922,10 @@ class ComputeUnit : public ClockedObject
std::vector<DataPort*> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort*> tlbPort;
// port to the scalar data cache
ScalarDataPort *scalarDataPort;
// port to the scalar data TLB
ScalarDTLBPort *scalarDTLBPort;
// port to the SQC (i.e. the I-cache)
SQCPort *sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
@@ -726,6 +942,14 @@ class ComputeUnit : public ClockedObject
tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
this, idx);
return *tlbPort[idx];
} else if (if_name == "scalar_port") {
scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
idx), this, idx);
return *scalarDataPort;
} else if (if_name == "scalar_tlb_port") {
scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
this);
return *scalarDTLBPort;
} else if (if_name == "sqc_port") {
sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
this, idx);
@@ -746,32 +970,18 @@ class ComputeUnit : public ClockedObject
}
}
// xact_cas_load()
class waveIdentifier
{
public:
waveIdentifier() { }
waveIdentifier(int _simdId, int _wfSlotId)
: simdId(_simdId), wfSlotId(_wfSlotId) { }
int simdId;
int wfSlotId;
};
class waveQueue
{
public:
std::list<waveIdentifier> waveIDQueue;
};
std::map<unsigned, waveQueue> xactCasLoadMap;
uint64_t getAndIncSeqNum() { return globalSeqNum++; }
InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
private:
const int _cacheLineSize;
uint64_t globalSeqNum;
int cacheLineBits;
InstSeqNum globalSeqNum;
int wavefrontSize;
GPUStaticInst *kernelLaunchInst;
// hold the time of the arrival of the first cache block related to
// a particular GPUDynInst. This is used to calculate the difference
// between the first and last chace block arrival times.
std::map<GPUDynInstPtr, Tick> headTailMap;
};
#endif // __COMPUTE_UNIT_HH__

View File

@@ -34,66 +34,76 @@
#include "gpu-compute/dispatcher.hh"
#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "debug/GPUKernelInfo.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"
#include "sim/syscall_emul_buf.hh"
#include "sim/system.hh"
GpuDispatcher *GpuDispatcher::instance = nullptr;
GpuDispatcher::GpuDispatcher(const Params *p)
: DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
dispatchCount(0), dispatchActive(false), cpu(p->cpu),
shader(p->shader_pointer), driver(p->cl_driver),
tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
false, Event::CPU_Tick_Pri)
GPUDispatcher::GPUDispatcher(const Params *p)
: SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
tickEvent([this]{ exec(); },
"GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
dispatchActive(false)
{
shader->handshake(this);
driver->handshake(this);
ndRange.wg_disp_rem = false;
ndRange.globalWgId = 0;
schedule(&tickEvent, 0);
// translation port for the dispatcher
tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
num_kernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
}
GpuDispatcher *GpuDispatcherParams::create()
GPUDispatcher::~GPUDispatcher()
{
GpuDispatcher *dispatcher = new GpuDispatcher(this);
GpuDispatcher::setInstance(dispatcher);
return GpuDispatcher::getInstance();
}
void
GpuDispatcher::serialize(CheckpointOut &cp) const
GPUDispatcher::regStats()
{
numKernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
cyclesWaitingForDispatch
.name(name() + ".cycles_wait_dispatch")
.desc("number of cycles with outstanding wavefronts "
"that are waiting to be dispatched")
;
}
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
return hsaQueueEntries[disp_id];
}
void
GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
gpuCmdProc = gpu_cmd_proc;
}
void
GPUDispatcher::setShader(Shader *new_shader)
{
shader = new_shader;
}
void
GPUDispatcher::serialize(CheckpointOut &cp) const
{
Tick event_tick = 0;
if (ndRange.wg_disp_rem)
fatal("Checkpointing not supported during active workgroup execution");
if (tickEvent.scheduled())
event_tick = tickEvent.when();
SERIALIZE_SCALAR(event_tick);
}
void
GpuDispatcher::unserialize(CheckpointIn &cp)
GPUDispatcher::unserialize(CheckpointIn &cp)
{
Tick event_tick;
@@ -102,288 +112,256 @@ GpuDispatcher::unserialize(CheckpointIn &cp)
UNSERIALIZE_SCALAR(event_tick);
if (event_tick)
if (event_tick) {
schedule(&tickEvent, event_tick);
}
}
AddrRangeList
GpuDispatcher::getAddrRanges() const
/**
* After all relevant HSA data structures have been traversed/extracted
* from memory by the CP, dispatch() is called on the dispatcher. This will
* schedule a dispatch event that, when triggered, will attempt to dispatch
* the WGs associated with the given task to the CUs.
*/
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
AddrRangeList ranges;
++numKernelLaunched;
DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
pioAddr, pioSize);
DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
task->kernelName(), task->dispatchId());
ranges.push_back(RangeSize(pioAddr, pioSize));
execIds.push(task->dispatchId());
dispatchActive = true;
hsaQueueEntries.emplace(task->dispatchId(), task);
return ranges;
}
Tick
GpuDispatcher::read(PacketPtr pkt)
{
assert(pkt->getAddr() >= pioAddr);
assert(pkt->getAddr() < pioAddr + pioSize);
int offset = pkt->getAddr() - pioAddr;
pkt->allocate();
DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
if (offset < 8) {
assert(!offset);
assert(pkt->getSize() == 8);
uint64_t retval = dispatchActive;
pkt->setLE(retval);
} else {
offset -= 8;
assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
char *curTaskPtr = (char*)&curTask;
memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
pkt->makeAtomicResponse();
return pioDelay;
}
Tick
GpuDispatcher::write(PacketPtr pkt)
{
assert(pkt->getAddr() >= pioAddr);
assert(pkt->getAddr() < pioAddr + pioSize);
int offset = pkt->getAddr() - pioAddr;
#if TRACING_ON
uint64_t data_val = 0;
switch (pkt->getSize()) {
case 1:
data_val = pkt->getLE<uint8_t>();
break;
case 2:
data_val = pkt->getLE<uint16_t>();
break;
case 4:
data_val = pkt->getLE<uint32_t>();
break;
case 8:
data_val = pkt->getLE<uint64_t>();
break;
default:
DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
}
DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
pkt->getSize());
#endif
if (!offset) {
static int nextId = 0;
// The depends field of the qstruct, which was previously unused, is
// used to communicate with simulated application.
if (curTask.depends) {
HostState hs;
shader->ReadMem((uint64_t)(curTask.depends), &hs,
sizeof(HostState), 0);
// update event start time (in nano-seconds)
uint64_t start = curTick() / 1000;
shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
&start, sizeof(uint64_t), 0);
}
// launch kernel
++num_kernelLaunched;
NDRange *ndr = &(ndRangeMap[nextId]);
// copy dispatch info
ndr->q = curTask;
// update the numDispTask polled by the runtime
accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
ndr->numWgTotal = 1;
for (int i = 0; i < 3; ++i) {
ndr->wgId[i] = 0;
ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
ndr->numWgTotal *= ndr->numWg[i];
}
ndr->numWgCompleted = 0;
ndr->globalWgId = 0;
ndr->wg_disp_rem = true;
ndr->execDone = false;
ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
ndr->dispatchId = nextId;
ndr->curCid = pkt->req->contextId();
DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
execIds.push(nextId);
++nextId;
dispatchActive = true;
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->ticks(1));
}
} else {
// populate current task struct
// first 64 bits are launch reg
offset -= 8;
assert(offset < sizeof(HsaQueueEntry));
char *curTaskPtr = (char*)&curTask;
memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
}
pkt->makeAtomicResponse();
return pioDelay;
}
Port &
GpuDispatcher::getPort(const std::string &if_name, PortID idx)
{
if (if_name == "translation_port") {
return *tlbPort;
}
return DmaDevice::getPort(if_name, idx);
}
void
GpuDispatcher::exec()
GPUDispatcher::exec()
{
int fail_count = 0;
int fail_count(0);
// There are potentially multiple outstanding kernel launches.
// It is possible that the workgroups in a different kernel
// can fit on the GPU even if another kernel's workgroups cannot
/**
* There are potentially multiple outstanding kernel launches.
* It is possible that the workgroups in a different kernel
* can fit on the GPU even if another kernel's workgroups cannot
*/
DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
if (execIds.size() > 0) {
++cyclesWaitingForDispatch;
}
/**
* dispatch work cannot start until the kernel's invalidate is
* completely finished; hence, kernel will always initiates
* invalidate first and keeps waiting until inv done
*/
while (execIds.size() > fail_count) {
int execId = execIds.front();
int exec_id = execIds.front();
auto task = hsaQueueEntries[exec_id];
bool launched(false);
while (ndRangeMap[execId].wg_disp_rem) {
//update the thread context
shader->updateContext(ndRangeMap[execId].curCid);
// invalidate is needed before starting dispatch
if (shader->impl_kern_boundary_sync) {
// try to invalidate cache
shader->prepareInvalidate(task);
} else {
// kern boundary sync is not set, skip invalidate
task->markInvDone();
}
// attempt to dispatch_workgroup
if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
// if we failed try the next kernel,
// it may have smaller workgroups.
// put it on the queue to rety latter
DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
execIds.push(execId);
/**
* invalidate is still ongoing, put the kernel on the queue to
* retry later
*/
if (!task->isInvDone()){
execIds.push(exec_id);
++fail_count;
DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
" invalidate requests\n", exec_id, task->outstandingInvs());
// try the next kernel_id
execIds.pop();
continue;
}
// kernel invalidate is done, start workgroup dispatch
while (!task->dispComplete()) {
// update the thread context
shader->updateContext(task->contextId());
// attempt to dispatch workgroup
DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
curTick(), exec_id);
if (!shader->dispatchWorkgroups(task)) {
/**
* if we failed try the next kernel,
* it may have smaller workgroups.
* put it on the queue to rety latter
*/
DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
execIds.push(exec_id);
++fail_count;
break;
} else if (!launched) {
launched = true;
DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
}
}
// let's try the next kernel_id
// try the next kernel_id
execIds.pop();
}
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
if (doneIds.size() && cpu) {
shader->hostWakeUp(cpu);
}
while (doneIds.size()) {
// wakeup the CPU if any Kernels completed this cycle
DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
doneIds.pop();
}
}
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
bool
GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
int kern_id = w->kernId;
DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
assert(ndRangeMap[kern_id].dispatchId == kern_id);
ndRangeMap[kern_id].numWgCompleted++;
int kern_id = wf->kernId;
assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
auto task = hsaQueueEntries[kern_id];
assert(task->dispatchId() == kern_id);
if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
ndRangeMap[kern_id].execDone = true;
doneIds.push(kern_id);
/**
* whether the next workgroup is the final one in the kernel,
* +1 as we check first before taking action
*/
return (task->numWgCompleted() + 1 == task->numWgTotal());
}
if (ndRangeMap[kern_id].addrToNotify) {
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
0);
/**
* update the counter of oustanding inv requests for the kernel
* kern_id: kernel id
* val: +1/-1, increment or decrement the counter (default: -1)
*/
void
GPUDispatcher::updateInvCounter(int kern_id, int val) {
assert(val == -1 || val == 1);
auto task = hsaQueueEntries[kern_id];
task->updateOutstandingInvs(val);
// kernel invalidate is done, schedule dispatch work
if (task->isInvDone() && !tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
/**
* update the counter of oustanding wb requests for the kernel
* kern_id: kernel id
* val: +1/-1, increment or decrement the counter (default: -1)
*
* return true if all wbs are done for the kernel
*/
bool
GPUDispatcher::updateWbCounter(int kern_id, int val) {
assert(val == -1 || val == 1);
auto task = hsaQueueEntries[kern_id];
task->updateOutstandingWbs(val);
// true: WB is done, false: WB is still ongoing
return (task->outstandingWbs() == 0);
}
/**
* get kernel's outstanding cache writeback requests
*/
int
GPUDispatcher::getOutstandingWbs(int kernId) {
auto task = hsaQueueEntries[kernId];
return task->outstandingWbs();
}
/**
* When an end program instruction detects that the last WF in
* a WG has completed it will call this method on the dispatcher.
* If we detect that this is the last WG for the given task, then
* we ring the completion signal, which is used by the CPU to
* synchronize with the GPU. The HSAPP is also notified that the
* task has completed so it can be removed from its task queues.
*/
void
GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
int kern_id = wf->kernId;
DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
auto task = hsaQueueEntries[kern_id];
assert(task->dispatchId() == kern_id);
task->notifyWgCompleted();
DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);
if (task->numWgCompleted() == task->numWgTotal()) {
// Notify the HSA PP that this kernel is complete
gpuCmdProc->hsaPacketProc()
.finishPkt(task->dispPktPtr(), task->queueId());
if (task->completionSignal()) {
// The signal value is aligned 8 bytes from
// the actual handle in the runtime
Addr signal_addr = task->completionSignal() + sizeof(Addr);
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
"completion signal: %x!\n", signal_addr);
/**
* HACK: The semantics of the HSA signal is to decrement
* the current signal value. We cheat here and read out
* he value from main memory using functional access and
* then just DMA the decremented value. This is because
* the DMA controller does not currently support GPU
* atomics.
*/
auto *tc = gpuCmdProc->system()->threads[0];
auto &virt_proxy = tc->getVirtProxy();
TypedBufferArg<Addr> prev_signal(signal_addr);
prev_signal.copyIn(virt_proxy);
Addr *new_signal = new Addr;
*new_signal = (Addr)*prev_signal - 1;
gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
new_signal, 0);
} else {
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
"signal\n");
}
accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
// update event end time (in nano-seconds)
if (ndRangeMap[kern_id].q.depends) {
HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
uint64_t event;
shader->ReadMem((uint64_t)(&host_state->event), &event,
sizeof(uint64_t), 0);
uint64_t end = curTick() / 1000;
shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
sizeof(uint64_t), 0);
}
DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
curTick(), kern_id);
DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
}
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->ticks(1));
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
void
GpuDispatcher::scheduleDispatch()
GPUDispatcher::scheduleDispatch()
{
if (!tickEvent.scheduled())
schedule(&tickEvent, curTick() + shader->ticks(1));
}
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
if (cpu) {
if (off) {
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
true);
val += off;
}
shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
} else {
panic("Cannot find host");
if (!tickEvent.scheduled()) {
schedule(&tickEvent, curTick() + shader->clockPeriod());
}
}
// helper functions for driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
GPUDispatcher *GPUDispatcherParams::create()
{
return shader->cuList.size();
}
int
GpuDispatcher::wfSize() const
{
return shader->cuList[0]->wfSize();
}
void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
shader->funcargs_size = funcargs_size;
}
uint32_t
GpuDispatcher::getStaticContextSize() const
{
return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
return new GPUDispatcher(this);
}

View File

@@ -31,125 +31,69 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_DISPATCHER_HH__
#define __GPU_DISPATCHER_HH__
/**
* @file
* The GPUDispatcher is the component of the shader that is responsible
* for creating and dispatching WGs to the compute units. If all WGs in
* a kernel cannot be dispatched simultaneously, then the dispatcher will
* keep track of all pending WGs and dispatch them as resources become
* available.
*/
#ifndef __GPU_COMPUTE_DISPATCHER_HH__
#define __GPU_COMPUTE_DISPATCHER_HH__
#include <queue>
#include <unordered_map>
#include <vector>
#include "base/statistics.hh"
#include "dev/dma_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/ndrange.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/port.hh"
#include "params/GpuDispatcher.hh"
#include "dev/hsa/hsa_packet.hh"
#include "params/GPUDispatcher.hh"
#include "sim/sim_object.hh"
class BaseCPU;
class GPUCommandProcessor;
class HSAQueueEntry;
class Shader;
class Wavefront;
class GpuDispatcher : public DmaDevice
class GPUDispatcher : public SimObject
{
public:
typedef GpuDispatcherParams Params;
public:
typedef GPUDispatcherParams Params;
MasterID masterId() { return _masterId; }
GPUDispatcher(const Params *p);
~GPUDispatcher();
protected:
MasterID _masterId;
void serialize(CheckpointOut &cp) const override;
void unserialize(CheckpointIn &cp) override;
void regStats() override;
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
void setShader(Shader *new_shader);
void exec();
bool isReachingKernelEnd(Wavefront *wf);
void updateInvCounter(int kern_id, int val=-1);
bool updateWbCounter(int kern_id, int val=-1);
int getOutstandingWbs(int kern_id);
void notifyWgCompl(Wavefront *wf);
void scheduleDispatch();
void dispatch(HSAQueueEntry *task);
HSAQueueEntry* hsaTask(int disp_id);
// Base and length of PIO register space
Addr pioAddr;
Addr pioSize;
Tick pioDelay;
HsaQueueEntry curTask;
std::unordered_map<int, NDRange> ndRangeMap;
NDRange ndRange;
// list of kernel_ids to launch
std::queue<int> execIds;
// list of kernel_ids that have finished
std::queue<int> doneIds;
uint64_t dispatchCount;
// is there a kernel in execution?
bool dispatchActive;
BaseCPU *cpu;
Shader *shader;
ClDriver *driver;
EventFunctionWrapper tickEvent;
static GpuDispatcher *instance;
// sycall emulation mode can have only 1 application running(?)
// else we have to do some pid based tagging
// unused
typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
TranslationBuffer tlb;
public:
/*statistics*/
Stats::Scalar num_kernelLaunched;
GpuDispatcher(const Params *p);
~GpuDispatcher() { }
void exec();
virtual void serialize(CheckpointOut &cp) const override;
virtual void unserialize(CheckpointIn &cp) override;
void notifyWgCompl(Wavefront *w);
void scheduleDispatch();
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
// using singleton so that glue code can pass pointer locations
// to the dispatcher. when there are multiple dispatchers, we can
// call something like getInstance(index)
static void
setInstance(GpuDispatcher *_instance)
{
instance = _instance;
}
static GpuDispatcher* getInstance() { return instance; }
class TLBPort : public MasterPort
{
public:
TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
: MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
protected:
GpuDispatcher *dispatcher;
virtual bool recvTimingResp(PacketPtr pkt) { return true; }
virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
virtual void recvFunctional(PacketPtr pkt) { }
virtual void recvRangeChange() { }
virtual void recvReqRetry() { }
};
TLBPort *tlbPort;
Port &getPort(const std::string &if_name,
PortID idx=InvalidPortID) override;
AddrRangeList getAddrRanges() const override;
Tick read(PacketPtr pkt) override;
Tick write(PacketPtr pkt) override;
// helper functions to retrieve/set GPU attributes
int getNumCUs();
int wfSize() const;
void setFuncargsSize(int funcargs_size);
/** Returns the size of the static hardware context of a wavefront */
uint32_t getStaticContextSize() const;
private:
Shader *shader;
GPUCommandProcessor *gpuCmdProc;
EventFunctionWrapper tickEvent;
std::unordered_map<int, HSAQueueEntry*> hsaQueueEntries;
// list of kernel_ids to launch
std::queue<int> execIds;
// list of kernel_ids that have finished
std::queue<int> doneIds;
// is there a kernel in execution?
bool dispatchActive;
/*statistics*/
Stats::Scalar numKernelLaunched;
Stats::Scalar cyclesWaitingForDispatch;
};
#endif // __GPU_DISPATCHER_HH__
#endif // __GPU_COMPUTE_DISPATCHER_HH__

View File

@@ -33,13 +33,15 @@
#include "gpu-compute/exec_stage.hh"
#include <sstream>
#include "base/trace.hh"
#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
executionResourcesUsed(0)
{
@@ -53,37 +55,18 @@ ExecStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".ExecStage";
dispatchList = &computeUnit->dispatchList;
vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
glbMemInstAvail= &(computeUnit->glbMemInstAvail);
shrMemInstAvail= &(computeUnit->shrMemInstAvail);
idle_dur = 0;
}
void
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
if (stage == IdleExec) {
// count cycles of no vector ALU instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
numCyclesWithNoInstrTypeIssued[unitId]++;
}
// count cycles of no global memory (vector) instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
numCyclesWithNoInstrTypeIssued[unitId]++;
(*glbMemInstAvail)--;
}
// count cycles of no shared memory (vector) instruction executed
// even if one was the oldest in a WV of that vector SIMD unit
if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
numCyclesWithNoInstrTypeIssued[unitId]++;
(*shrMemInstAvail)--;
}
// count cycles when no instruction to a specific execution resource
// is executed
numCyclesWithNoInstrTypeIssued[unitId]++;
} else if (stage == BusyExec) {
// count the number of cycles an instruction to a specific unit
// was issued
// count the number of cycles an instruction to a specific execution
// resource type was issued
numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
@@ -102,14 +85,13 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
}
lastTimeInstExecuted = thisTimeInstExecuted;
// track the number of cycles we either issued one vector instruction
// or issued no instructions at all
// track the number of cycles we either issued at least
// instruction or issued no instructions at all
if (instrExecuted) {
numCyclesWithInstrIssued++;
} else {
numCyclesWithNoIssue++;
}
spc.sample(executionResourcesUsed);
}
}
@@ -122,25 +104,86 @@ ExecStage::initStatistics()
thisTimeInstExecuted = false;
}
std::string
ExecStage::dispStatusToStr(int i)
{
std::string s("INVALID");
switch (i) {
case EMPTY:
s = "EMPTY";
break;
case SKIP:
s = "SKIP";
break;
case EXREADY:
s = "EXREADY";
break;
}
return s;
}
void
ExecStage::dumpDispList()
{
std::stringstream ss;
bool empty = true;
for (int i = 0; i < computeUnit->numExeUnits(); i++) {
DISPATCH_STATUS s = dispatchList->at(i).second;
ss << i << ": " << dispStatusToStr(s);
if (s != EMPTY) {
empty = false;
Wavefront *w = dispatchList->at(i).first;
ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
ss << (w->instructionBuffer.front())->seqNum() << ": ";
ss << (w->instructionBuffer.front())->disassemble();
}
ss << "\n";
}
if (!empty) {
DPRINTF(GPUSched, "Dispatch List:\n%s", ss.str());
}
}
void
ExecStage::exec()
{
initStatistics();
for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
// if dispatch list for this execution resource is empty,
// skip this execution resource this cycle
if (dispatchList->at(unitId).second == EMPTY) {
collectStatistics(IdleExec, unitId);
continue;
}
collectStatistics(BusyExec, unitId);
// execute an instruction for the WF
dispatchList->at(unitId).first->exec();
// clear the dispatch list entry
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first = (Wavefront*)nullptr;
if (Debug::GPUSched) {
dumpDispList();
}
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
DISPATCH_STATUS s = dispatchList->at(unitId).second;
switch (s) {
case EMPTY:
// Do not execute if empty, waiting for VRF reads,
// or LM tied to GM waiting for VRF reads
collectStatistics(IdleExec, unitId);
break;
case EXREADY:
{
collectStatistics(BusyExec, unitId);
Wavefront *w = dispatchList->at(unitId).first;
DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
unitId, w->simdId, w->wfDynId,
(w->instructionBuffer.front())->disassemble());
DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
dispatchList->at(unitId).first->exec();
(computeUnit->scheduleStage).deleteFromSch(w);
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first->freeResources();
dispatchList->at(unitId).first = nullptr;
break;
}
case SKIP:
collectStatistics(BusyExec, unitId);
DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first->freeResources();
dispatchList->at(unitId).first = nullptr;
break;
default:
panic("Unknown dispatch status in exec()\n");
}
}
collectStatistics(PostExec, 0);
@@ -165,7 +208,7 @@ ExecStage::regStats()
;
spc
.init(0, numSIMDs + numMemUnits, 1)
.init(0, computeUnit->numExeUnits(), 1)
.name(name() + ".spc")
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
;
@@ -177,25 +220,36 @@ ExecStage::regStats()
;
numCyclesWithInstrTypeIssued
.init(numSIMDs + numMemUnits)
.name(name() + ".num_cycles_with_instrtype_issue")
.desc("Number of cycles at least one instruction of specific type "
"issued")
.init(computeUnit->numExeUnits())
.name(name() + ".num_cycles_issue_exec_rsrc")
.desc("Number of cycles at least one instruction issued to "
"execution resource type")
;
numCyclesWithNoInstrTypeIssued
.init(numSIMDs + numMemUnits)
.name(name() + ".num_cycles_with_instr_type_no_issue")
.desc("Number of cycles no instruction of specific type issued")
.init(computeUnit->numExeUnits())
.name(name() + ".num_cycles_no_issue_exec_rsrc")
.desc("Number of clks no instructions issued to execution "
"resource type")
;
for (int i = 0; i < numSIMDs; ++i) {
numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
int c = 0;
for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) {
std::string s = "VectorALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) {
std::string s = "ScalarALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
numCyclesWithNoInstrTypeIssued.subname(c, "VectorMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "VectorMemPipe");
numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}

View File

@@ -35,6 +35,7 @@
#define __EXEC_STAGE_HH__
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -53,8 +54,9 @@ enum STAT_STATUS
enum DISPATCH_STATUS
{
EMPTY = 0,
FILLED
EMPTY = 0, // no wave present in dispatchList slot
EXREADY, // wave ready for execution
SKIP, // extra memory resource needed, Shared Mem. only
};
// Execution stage.
@@ -72,18 +74,21 @@ class ExecStage
void init(ComputeUnit *cu);
void exec();
std::string dispStatusToStr(int j);
void dumpDispList();
std::string name() { return _name; }
void regStats();
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
// number of busy cycles
Stats::Scalar numCyclesWithInstrIssued;
// number of cycles (per execution unit) during which at least one
// instruction was issued to that unit
// number of cycles during which at least one
// instruction was issued to an execution resource type
Stats::Vector numCyclesWithInstrTypeIssued;
// number of idle cycles (per execution unit) during which the unit issued
// no instruction targeting that unit, even though there is at least one
// Wavefront with such an instruction as the oldest
// number of idle cycles during which the scheduler
// issued no instructions targeting a specific
// execution resource type
Stats::Vector numCyclesWithNoInstrTypeIssued;
// SIMDs active per cycle
Stats::Distribution spc;
@@ -92,11 +97,6 @@ class ExecStage
void collectStatistics(enum STAT_STATUS stage, int unitId);
void initStatistics();
ComputeUnit *computeUnit;
uint32_t numSIMDs;
// Number of memory execution resources;
// both global and local memory execution resources in CU
uint32_t numMemUnits;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
@@ -108,18 +108,12 @@ class ExecStage
// dispatchList is used to communicate between schedule
// and exec stage
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
// flag per vector SIMD unit that is set when there is at least one
// WV that has a vector ALU instruction as the oldest in its
// Instruction Buffer
std::vector<bool> *vectorAluInstAvail;
int *glbMemInstAvail;
int *shrMemInstAvail;
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
Stats::Scalar numTransActiveIdle;
Stats::Distribution idleDur;
uint32_t executionResourcesUsed;
int executionResourcesUsed;
uint64_t idle_dur;
std::string _name;
};

View File

@@ -36,18 +36,18 @@
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"
FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
computeUnit(nullptr)
FetchStage::FetchStage(const ComputeUnitParams* p) :
numVectorALUs(p->num_SIMDs), computeUnit(nullptr)
{
for (int j = 0; j < numSIMDs; ++j) {
for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p);
fetchUnit.push_back(newFetchUnit);
_fetchUnit.push_back(newFetchUnit);
}
}
FetchStage::~FetchStage()
{
fetchUnit.clear();
_fetchUnit.clear();
}
void
@@ -56,17 +56,17 @@ FetchStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".FetchStage";
for (int j = 0; j < numSIMDs; ++j) {
fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
fetchUnit[j].init(computeUnit);
for (int j = 0; j < numVectorALUs; ++j) {
_fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
_fetchUnit[j].init(computeUnit);
}
}
void
FetchStage::exec()
{
for (int j = 0; j < numSIMDs; ++j) {
fetchUnit[j].exec();
for (int j = 0; j < numVectorALUs; ++j) {
_fetchUnit[j].exec();
}
}
@@ -83,13 +83,13 @@ FetchStage::processFetchReturn(PacketPtr pkt)
instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
fetchUnit[simdId].processFetchReturn(pkt);
_fetchUnit[simdId].processFetchReturn(pkt);
}
void
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
{
fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
_fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
void

View File

@@ -62,14 +62,15 @@ class FetchStage
std::string name() { return _name; }
void regStats();
Stats::Distribution instFetchInstReturned;
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
uint32_t numSIMDs;
int numVectorALUs;
ComputeUnit *computeUnit;
// List of fetch units. A fetch unit is
// instantiated per SIMD
std::vector<FetchUnit> fetchUnit;
// instantiated per VALU/SIMD
std::vector<FetchUnit> _fetchUnit;
std::string _name;
};

View File

@@ -45,11 +45,9 @@
uint32_t FetchUnit::globalFetchUnitID;
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
timingSim(true),
computeUnit(nullptr),
fetchScheduler(params),
waveList(nullptr)
FetchUnit::FetchUnit(const ComputeUnitParams* params)
: timingSim(true), computeUnit(nullptr), fetchScheduler(params),
waveList(nullptr), fetchDepth(params->fetch_depth)
{
}
@@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu)
timingSim = computeUnit->shader->timingSim;
fetchQueue.clear();
fetchStatusQueue.resize(computeUnit->shader->n_wf);
fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
Wavefront *wf = waveList->at(i);
assert(wf->wfSlotId == i);
fetchStatusQueue[i] = std::make_pair(wf, false);
fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
fetchBuf[i].decoder(&decoder);
}
fetchScheduler.bindList(&fetchQueue);
@@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu)
void
FetchUnit::exec()
{
/**
* now we check if any of the fetch buffers have
* buffered instruction data that can be decoded
* and sent to its wavefront's instruction buffer.
* then we check if any of the fetch buffer entries
* can be released. we only check if we can
* release a buffer
*/
for (auto &fetch_buf : fetchBuf) {
if (!fetch_buf.hasFreeSpace()) {
fetch_buf.checkWaveReleaseBuf();
}
if (fetch_buf.hasFetchDataToProcess()) {
fetch_buf.decodeInsts();
}
}
// re-evaluate waves which are marked as not ready for fetch
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
// Following code assumes 64-bit opertaion and all insts are
@@ -88,9 +108,10 @@ FetchUnit::exec()
// 4 or less instructions and it can not have any branches to
// prevent speculative instruction fetches
if (!fetchStatusQueue[j].second) {
if (curWave->status == Wavefront::S_RUNNING &&
curWave->instructionBuffer.size() <= 4 &&
!curWave->instructionBufferHasBranch() &&
if ((curWave->getStatus() == Wavefront::S_RUNNING ||
curWave->getStatus() == Wavefront::S_WAITCNT) &&
fetchBuf[j].hasFreeSpace() &&
!curWave->stopFetch() &&
!curWave->pendingFetch) {
fetchQueue.push_back(curWave);
fetchStatusQueue[j].second = true;
@@ -111,45 +132,38 @@ FetchUnit::exec()
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
// calculate the virtual address to fetch from the SQC
Addr vaddr = wavefront->pc();
assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());
/**
* the instruction buffer holds one instruction per entry, regardless
* of the underlying instruction's size. the PC, however, addresses
* instrutions on a 32b granularity so we must account for that here.
*/
for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
vaddr +=
wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
}
vaddr = wavefront->basePtr + vaddr;
* calculate the virtual address to fetch from the SQC. the fetch
* buffer holds a configurable number of cache lines. we start
* fetching at the address of the cache line immediately following
* the buffered line(s).
*/
Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();
// this should already be aligned to a cache line
assert(vaddr == makeLineAddress(vaddr,
computeUnit->getCacheLineBits()));
// shouldn't be fetching a line that is already buffered
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
"from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
// Since this is an instruction prefetch, if you're split then just finish
// out the current line.
int block_size = computeUnit->cacheLineSize();
// check for split accesses
Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
int size = block_size;
if (split_addr > vaddr) {
// misaligned access, just grab the rest of the line
size = split_addr - vaddr;
}
// set up virtual request
RequestPtr req = std::make_shared<Request>(
vaddr, size, Request::INST_FETCH,
vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
computeUnit->masterId(), 0, 0, nullptr);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
// This fetchBlock is kind of faux right now - because the translations so
// far don't actually return Data
uint64_t fetchBlock;
pkt->dataStatic(&fetchBlock);
if (timingSim) {
// SenderState needed on Return
@@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
// this is necessary because the GPU TLB receives packets instead of
// requests. when the translation is complete, all relevent fields in the
// request will be populated, but not in the packet. here we create the
// new packet so we can set the size, addr, and proper flags.
/**
* this is necessary because the GPU TLB receives packets instead of
* requests. when the translation is complete, all relevent fields in
* the request will be populated, but not in the packet. here we create
* the new packet so we can set the size, addr, and proper flags.
*/
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
delete oldPkt;
TheGpuISA::RawMachInst *data =
new TheGpuISA::RawMachInst[pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst)];
pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
/**
* we should have reserved an entry in the fetch buffer
* for this cache line. here we get the pointer to the
* entry used to buffer this request's line data.
*/
pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
.reservedBuf(pkt->req->getVaddr()));
// New SenderState for the memory access
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
@@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
"%d bytes, %d instructions!\n", computeUnit->cu_id,
wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
pkt->req->getSize(), pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst));
"%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
if (wavefront->dropFetch) {
assert(wavefront->instructionBuffer.empty());
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
TheGpuISA::RawMachInst *inst_index_ptr =
(TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
assert(wavefront->instructionBuffer.size() <= 4);
for (int i = 0; i < pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst); ++i) {
GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
assert(inst_ptr);
if (inst_ptr->instSize() == 8) {
/**
* this instruction occupies 2 consecutive
* entries in the instruction array, the
* second of which contains a nullptr. so if
* this inst is 8 bytes we advance two entries
* instead of 1
*/
++i;
}
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, inst_ptr->disassemble());
GPUDynInstPtr gpuDynInst =
std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
computeUnit->getAndIncSeqNum());
wavefront->instructionBuffer.push_back(gpuDynInst);
}
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
}
wavefront->pendingFetch = false;
@@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
delete pkt;
}
void
FetchUnit::flushBuf(int wfSlotId)
{
    // discard all buffered/in-flight fetch data for this wave slot
    auto &buf_desc = fetchBuf.at(wfSlotId);
    buf_desc.flushBuf();
}
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    // remember the list of waves dispatched on this SIMD unit so the
    // fetch unit can look up wavefront state by slot ID
    waveList = wave_list;
}
/** FetchBufDesc */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    /**
     * Set up this fetch buffer: record the owning WF, the number of
     * cache lines that may be buffered (the fetch depth), and the line
     * size, then allocate the raw backing storage and seed the free
     * list with one line-aligned entry per buffered line.
     */
    wavefront = wf;
    fetchDepth = fetch_depth;
    maxIbSize = wavefront->maxIbSize;
    cacheLineSize = cache_line_size;
    maxFbSize = cacheLineSize * fetchDepth;

    // the line-address math below requires a power-of-two line size
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    readPtr = bufStart;
    bufEnd = bufStart + maxFbSize;

    for (uint8_t *line = bufStart; line != bufEnd; line += cacheLineSize) {
        freeList.push_back(line);
    }
}
void
FetchUnit::FetchBufDesc::flushBuf()
{
restartFromBranch = true;
/**
* free list may have some entries
* so we clear it here to avoid duplicates
*/
freeList.clear();
bufferedPCs.clear();
reservedPCs.clear();
readPtr = bufStart;
for (int i = 0; i < fetchDepth; ++i) {
freeList.push_back(bufStart + i * cacheLineSize);
}
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
"buffer\n", wavefront->simdId, wavefront->wfSlotId,
wavefront->wfDynId);
}
Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    /**
     * Compute the line-aligned vaddr of the next cache line to fetch.
     * If any lines are already buffered or reserved we fetch the line
     * sequentially after the newest one; otherwise we (re)start from
     * the WF's current PC, adjusting readPtr for a mid-line branch
     * target.
     */
    Addr next_line = 0;

    if (bufferedAndReservedLines()) {
        Addr last_line_fetched = 0;
        if (!reservedLines()) {
            /**
             * get the PC of the most recently fetched cache line,
             * then return the address of the next line. the maps are
             * address-sorted and fetch is sequential, so rbegin() is
             * the newest line.
             */
            last_line_fetched = bufferedPCs.rbegin()->first;
        } else {
            last_line_fetched = reservedPCs.rbegin()->first;
        }

        next_line = last_line_fetched + cacheLineSize;

        /**
         * should not be trying to fetch a line that has already
         * been fetched.
         */
        assert(bufferedPCs.find(next_line) == bufferedPCs.end());
        assert(reservedPCs.find(next_line) == reservedPCs.end());
    } else {
        /**
         * we do not have any buffered cache lines yet, so we
         * assume this is the initial fetch, or the first fetch
         * after a branch, and get the PC directly from the WF.
         * in the case of a branch, we may not start at the
         * beginning of a cache line, so we adjust the readPtr by
         * the current PC's offset from the start of the line.
         */
        next_line = makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * if we are here we have no buffered lines. in the case we flushed
         * the buffer due to a branch, we may need to start fetching from
         * some offset from the start of the fetch buffer, so we adjust for
         * that here.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset
                = wavefront->pc() - makeLineAddress(wavefront->pc(),
                                                    cacheLineBits);
            readPtr += byte_offset;
        }
    }

    return next_line;
}
void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    /**
     * Claim a free fetch-buffer entry for the line at vaddr. The entry
     * is moved off the free list and tracked in reservedPCs; it does
     * not become a valid buffered line until the memory system's
     * response arrives and fetchDone() is called for this vaddr.
     */
    assert(hasFreeSpace());
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());
    assert(bufferedAndReservedLines() < fetchDepth);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    uint8_t *line_buf = freeList.front();
    freeList.pop_front();
    reservedPCs.emplace(vaddr, line_buf);
}
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    // promote the entry for this line from reserved to buffered, now
    // that its fetch data has returned from the memory system
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * this address should have an entry reserved in the
     * fetch buffer already, however it should be invalid
     * until the fetch completes.
     */
    auto reserved_pc = reservedPCs.find(vaddr);
    assert(reserved_pc != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_pc->second);

    // the fetch buffer is circular; wrap the read pointer back to the
    // start if it has run off the end
    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_pc->second = nullptr;
    reservedPCs.erase(reserved_pc);
}
bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
    // decodable data exists only if at least one full raw instruction's
    // worth of undecoded bytes remains in the buffer
    return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
}
void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    /**
     * Release the oldest buffered cache line back to the free list once
     * the WF's PC has advanced past it. If the line for the current PC
     * is still in flight (reserved), there is nothing to release yet.
     */
    Addr cur_wave_pc = roundDown(wavefront->pc(),
        wavefront->computeUnit->cacheLineSize());

    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
            "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
        "(PC = %#x) can be released.\n", wavefront->simdId,
        wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
        wavefront->pc());

#ifdef DEBUG
    // dump the full set of buffered line addresses for debugging
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
            "removing it from the fetch buffer.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId,
            oldest_buffered_pc->first);

        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            bufferedLines());
    }
}
void
FetchUnit::FetchBufDesc::decodeInsts()
{
    /**
     * Drain decodable bytes from the fetch buffer into the WF's
     * instruction buffer (IB), creating one GPUDynInst per decoded
     * instruction, until the IB is full or no complete raw instruction
     * remains. Instructions that straddle the circular buffer boundary
     * are handled by decodeSplitInst().
     */
    assert(readPtr);

    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            // decode directly out of the buffer; the decoder reports
            // the instruction's size, which is how far we advance
            TheGpuISA::MachInst mach_inst
                = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            readPtr += gpu_static_inst->instSize();

            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst
                = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                               wavefront, gpu_static_inst,
                                               wavefront->computeUnit->
                                                   getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                    "%d bytes remain.\n", wavefront->simdId,
                    wavefront->wfSlotId, wavefront->wfDynId,
                    gpu_static_inst->disassemble(),
                    gpu_static_inst->instSize(),
                    fetchBytesRemaining());
        }
    }
}
void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    /**
     * Decode a raw instruction that straddles the end/beginning of the
     * circular fetch buffer: its leading dword(s) sit at the end of the
     * buffer and the remainder wraps to the start. We assemble the raw
     * inst dword-by-dword into a temporary before decoding it.
     *
     * NOTE(review): the loop below does not advance readPtr between
     * dwords except when wrapping, which is only correct if at most
     * one dword precedes the wrap point -- confirm for raw inst sizes
     * larger than two dwords.
     */
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    for (int i = 0; i < num_dwords; ++i) {
        ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
        // wrap to the start of the buffer once a dword read reaches
        // the buffer's end
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    // a split inst by definition wraps, so the read pointer must now
    // be at the start of the buffer
    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst
        = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    // advance past the portion of the inst that wrapped to the start;
    // the dword consumed before the wrap is subtracted out here
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst
        = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                       wavefront, gpu_static_inst,
                                       wavefront->computeUnit->
                                           getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
            "(%d bytes). %d bytes remain in %d buffered lines.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            gpu_static_inst->disassemble(), split_inst,
            gpu_static_inst->instSize(), fetchBytesRemaining(),
            bufferedLines());
}
bool
FetchUnit::FetchBufDesc::splitDecode() const
{
    /**
     * A split decode is required when reading one full raw instruction
     * from the current read pointer would run past the end of the
     * fetch buffer (i.e., the inst wraps around the circular buffer).
     */
    return (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
}
int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    /**
     * Number of valid, not-yet-decoded bytes between the read pointer
     * and the end of the newest (highest-vaddr) buffered line,
     * accounting for wrap-around in the circular buffer.
     */
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            // no wrap: contiguous span from readPtr to the end of the
            // newest buffered line
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            // wrapped: byte_diff is negative here, so this subtracts
            // the already-consumed span from the total buffered bytes
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());

    return bytes_remaining;
}

View File

@@ -36,7 +36,6 @@
#include <string>
#include <utility>
#include <vector>
#include "arch/gpu_decoder.hh"
#include "base/statistics.hh"
@@ -58,9 +57,170 @@ class FetchUnit
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void processFetchReturn(PacketPtr pkt);
void flushBuf(int wfSlotId);
static uint32_t globalFetchUnitID;
private:
/**
* fetch buffer descriptor. holds buffered
* instruction data in the fetch unit.
*/
class FetchBufDesc
{
  public:
    // note: cacheLineBits was previously left uninitialized until
    // allocateBuf(); it is now zero-initialized with the rest
    FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr),
        readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0),
        cacheLineSize(0), cacheLineBits(0), restartFromBranch(false),
        wavefront(nullptr), _decoder(nullptr)
    {
    }

    ~FetchBufDesc()
    {
        delete[] bufStart;
    }

    /**
     * allocate the fetch buffer space, and set the fetch depth
     * (number of lines that may be buffered), fetch size
     * (cache line size), and parent WF for this fetch buffer.
     */
    void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf);

    // total lines currently tracked: valid (buffered) plus in-flight
    // (reserved)
    int
    bufferedAndReservedLines() const
    {
        return bufferedLines() + reservedLines();
    }

    int bufferedLines() const { return bufferedPCs.size(); }
    int bufferedBytes() const { return bufferedLines() * cacheLineSize; }
    int reservedLines() const { return reservedPCs.size(); }
    bool hasFreeSpace() const { return !freeList.empty(); }
    void flushBuf();
    Addr nextFetchAddr();

    /**
     * reserve an entry in the fetch buffer for PC = vaddr,
     */
    void reserveBuf(Addr vaddr);

    /**
     * return a pointer to the raw fetch buffer data.
     * this allows the fetch pkt to use this data directly
     * to avoid unnecessary memcpy and malloc/new.
     */
    uint8_t*
    reservedBuf(Addr vaddr) const
    {
        auto reserved_pc = reservedPCs.find(vaddr);
        assert(reserved_pc != reservedPCs.end());
        assert(reserved_pc == reservedPCs.begin());

        return reserved_pc->second;
    }

    void fetchDone(Addr vaddr);

    /**
     * checks if the buffer contains valid data. this essentially
     * tells fetch when there is data remaining that needs to be
     * decoded into the WF's IB.
     */
    bool hasFetchDataToProcess() const;

    /**
     * each time the fetch stage is ticked, we check if there
     * are any data in the fetch buffer that may be decoded and
     * sent to the IB. because we are modeling the fetch buffer
     * as a circular buffer, it is possible that an instruction
     * can straddle the end/beginning of the fetch buffer, so
     * decodeSplitInsts() handles that case.
     */
    void decodeInsts();

    /**
     * checks if the wavefront can release any of its fetch
     * buffer entries. this will occur when the WF's PC goes
     * beyond any of the currently buffered cache lines.
     */
    void checkWaveReleaseBuf();

    void
    decoder(TheGpuISA::Decoder *dec)
    {
        _decoder = dec;
    }

    /**
     * returns true if this buffer has an entry for the line at pc,
     * whether already valid (buffered) or still awaiting its fetch
     * data (reserved).
     */
    bool
    pcBuffered(Addr pc) const
    {
        /**
         * a PC is tracked in bufferedPCs or reservedPCs, but never
         * both (see reserveBuf()/fetchDone()), so membership must be
         * checked with OR; checking with AND would always be false.
         */
        bool buffered = bufferedPCs.find(pc) != bufferedPCs.end()
                        || reservedPCs.find(pc) != reservedPCs.end();

        return buffered;
    }

    /**
     * calculates the number of fetched bytes that have yet
     * to be decoded.
     */
    int fetchBytesRemaining() const;

  private:
    void decodeSplitInst();

    /**
     * check if the next instruction to be processed out of
     * the fetch buffer is split across the end/beginning of
     * the fetch buffer.
     */
    bool splitDecode() const;

    /**
     * the set of PCs (fetch addresses) that are currently
     * buffered. bufferedPCs are valid, reservedPCs are
     * waiting for their buffers to be filled with valid
     * fetch data.
     */
    std::map<Addr, uint8_t*> bufferedPCs;
    std::map<Addr, uint8_t*> reservedPCs;

    /**
     * represents the fetch buffer free list. holds buffer space
     * that is currently free. each pointer in this array must
     * have enough space to hold a cache line. in reality we
     * have one actual fetch buffer: 'bufStart', these pointers
     * point to addresses within bufStart that are aligned to the
     * cache line size.
     */
    std::deque<uint8_t*> freeList;

    /**
     * raw instruction buffer. holds cache line data associated with
     * the set of PCs (fetch addresses) that are buffered here.
     */
    uint8_t *bufStart;
    uint8_t *bufEnd;

    /**
     * pointer that points to the next chunk of inst data to be
     * decoded.
     */
    uint8_t *readPtr;

    // how many lines the fetch unit may buffer
    int fetchDepth;
    // maximum size (in number of insts) of the WF's IB
    int maxIbSize;
    // maximum size (in bytes) of this fetch buffer
    int maxFbSize;
    int cacheLineSize;
    // number of low-order address bits covered by one cache line
    int cacheLineBits;
    bool restartFromBranch;
    // wavefront whose IB is serviced by this fetch buffer
    Wavefront *wavefront;
    TheGpuISA::Decoder *_decoder;
};
bool timingSim;
ComputeUnit *computeUnit;
TheGpuISA::Decoder decoder;
@@ -82,6 +242,15 @@ class FetchUnit
// Pointer to list of waves dispatched on to this SIMD unit
std::vector<Wavefront*> *waveList;
// holds the fetch buffers. each wave has 1 entry.
std::vector<FetchBufDesc> fetchBuf;
/**
* number of cache lines we can fetch and buffer.
* this includes the currently fetched line (i.e., the
* line that corresponds to the WF's current PC), as
* well as any lines that may be prefetched.
*/
int fetchDepth;
};
#endif // __FETCH_UNIT_HH__

View File

@@ -31,12 +31,13 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/global_memory_pipeline.hh"
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
@@ -44,7 +45,7 @@
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
maxWaveRequests(p->max_wave_requests), inflightStores(0),
inflightLoads(0)
{
}
@@ -76,6 +77,31 @@ GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
return true;
}
void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // a single token from the coalescer's uncoalesced table is needed
    // before this request may proceed
    const int num_tokens = 1;
    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", num_tokens);
    auto *token_mgr = mp->computeUnit()->getTokenManager();
    assert(token_mgr->haveTokens(num_tokens));
    token_mgr->acquireTokens(num_tokens);
}
bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // a wavefront may not exceed maxWaveRequests outstanding vector
    // memory requests; reads and writes count against the same limit
    auto *wf = mp->wavefront();
    int wf_outstanding = wf->outstandingReqsRdGm + wf->outstandingReqsWrGm;

    return wf_outstanding < maxWaveRequests;
}
void
GlobalMemPipeline::exec()
{
@@ -87,42 +113,60 @@ GlobalMemPipeline::exec()
// check the VRF to see if the operands of a load (or load component
// of an atomic) are accessible
if ((m) && (m->isLoad() || m->isAtomicRet())) {
if (m && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
accessVrf =
w->computeUnit->vrf[w->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
accessVrf = w->computeUnit->vrf[w->simdId]->
canScheduleWriteOperandsFromLoad(w, m);
}
if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
accessVrf && m->statusBitVector == VectorMask(0) &&
(computeUnit->shader->coissue_return ||
computeUnit->wfWait.at(m->pipeId).rdy())) {
accessVrf && (computeUnit->shader->coissue_return ||
computeUnit->vectorGlobalMemUnit.rdy())) {
w = m->wavefront();
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
if (m->isLoad() || m->isAtomicRet()) {
w->computeUnit->vrf[w->simdId]->
scheduleWriteOperandsFromLoad(w, m);
}
completeRequest(m);
// Decrement outstanding register count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
Tick accessTime = curTick() - m->getAccessTime();
if (m->isStore() || m->isAtomic()) {
// Decrement outstanding requests count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
if (m->isStore() || m->isAtomic() || m->isMemSync()) {
computeUnit->shader->sampleStore(accessTime);
computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
m->time, -1);
}
if (m->isLoad() || m->isAtomic()) {
if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
computeUnit->shader->sampleLoad(accessTime);
computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
m->time, -1);
}
w->validateRequestCounters();
// Generate stats for round-trip time for vectory memory insts
// going all the way to memory and stats for individual cache
// blocks generated by the instruction.
m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
// Mark write bus busy for appropriate amount of time
computeUnit->glbMemToVrfBus.set(m->time);
if (!computeUnit->shader->coissue_return)
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
w->computeUnit->vectorGlobalMemUnit.set(m->time);
}
// If pipeline has executed a global memory instruction
@@ -148,13 +192,13 @@ GlobalMemPipeline::exec()
mp->disassemble(), mp->seqNum());
// Memfences will not return tokens and must be issued so we should
// not request one as this will deplete the token count until deadlock
if (!mp->isMemFence()) {
if (!mp->isMemSync()) {
assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
mp->computeUnit()->getTokenManager()->acquireTokens(1);
}
mp->initiateAcc(mp);
if (!outOfOrderDataDelivery && !mp->isMemFence()) {
if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
/**
* if we are not in out-of-order data delivery mode
* then we keep the responses sorted in program order.
@@ -178,19 +222,11 @@ GlobalMemPipeline::exec()
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
if (outOfOrderDataDelivery) {
if (!gmReturnedLoads.empty()) {
return gmReturnedLoads.front();
} else if (!gmReturnedStores.empty()) {
return gmReturnedStores.front();
}
} else {
if (!gmOrderedRespBuffer.empty()) {
auto mem_req = gmOrderedRespBuffer.begin();
if (!gmOrderedRespBuffer.empty()) {
auto mem_req = gmOrderedRespBuffer.begin();
if (mem_req->second.second) {
return mem_req->second.first;
}
if (mem_req->second.second) {
return mem_req->second.first;
}
}
@@ -208,51 +244,33 @@ GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
--inflightStores;
}
if (outOfOrderDataDelivery) {
if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
assert(!gmReturnedLoads.empty());
gmReturnedLoads.pop();
} else if (gpuDynInst->isStore()) {
assert(!gmReturnedStores.empty());
gmReturnedStores.pop();
}
} else {
// we should only pop the oldest requst, and it
// should be marked as done if we are here
assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
assert(gmOrderedRespBuffer.begin()->second.second);
// remove this instruction from the buffer by its
// unique seq ID
gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
// we should only pop the oldest requst, and it
// should be marked as done if we are here
assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
assert(gmOrderedRespBuffer.begin()->second.second);
// remove this instruction from the buffer by its
// unique seq ID
gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    // timestamp the request for round-trip profiling, then enqueue it
    // for issue to the memory system
    auto issue_tick = curTick();
    gpuDynInst->setAccessTime(issue_tick);
    gpuDynInst->profileRoundTripTime(issue_tick, InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
if (outOfOrderDataDelivery) {
if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
assert(isGMLdRespFIFOWrRdy());
gmReturnedLoads.push(gpuDynInst);
} else {
assert(isGMStRespFIFOWrRdy());
gmReturnedStores.push(gpuDynInst);
}
} else {
auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
// if we are getting a response for this mem request,
// then it ought to already be in the ordered response
// buffer
assert(mem_req != gmOrderedRespBuffer.end());
mem_req->second.second = true;
}
auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
// if we are getting a response for this mem request,
// then it ought to already be in the ordered response
// buffer
assert(mem_req != gmOrderedRespBuffer.end());
mem_req->second.second = true;
}
void

View File

@@ -60,52 +60,34 @@ class GlobalMemPipeline
void init(ComputeUnit *cu);
void exec();
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
/**
* find the next ready response to service. for OoO mode we
* simply pop the oldest (based on when the response was
* received) response in the response FIFOs. for in-order mode
* we pop the oldest (in program order) response, and only if
* it is marked as done.
* Find the next ready response to service. In order to ensure
* that no waitcnts are violated, we pop the oldest (in program order)
* response, and only if it is marked as done. This is because waitcnt
* values expect memory operations to complete and decrement their
* counter values in program order.
*/
GPUDynInstPtr getNextReadyResp();
/**
* once a memory request is finished we remove it from the
* buffer. this method determines which response buffer
* we're using based on the mode (in-order vs. OoO).
* buffer.
*/
void completeRequest(GPUDynInstPtr gpuDynInst);
/**
* issues a request to the pipeline - i.e., enqueue it
* in the request buffer.
* Issues a request to the pipeline (i.e., enqueue it
* in the request buffer).
*/
void issueRequest(GPUDynInstPtr gpuDynInst);
/**
* this method handles responses sent to this GM pipeline by the
* CU. in the case of in-order delivery it simply marks the reqeust
* as done in the ordered buffer to indicate that the requst is
* finished. for out-of-order data delivery, the requests are enqueued
* (in the order in which they are received) in the response FIFOs.
* This method handles responses sent to this GM pipeline by the
* CU. Simply marks the request as done in the ordered buffer to
* indicate that the request is finished.
*/
void handleResponse(GPUDynInstPtr gpuDynInst);
bool
isGMLdRespFIFOWrRdy() const
{
return gmReturnedLoads.size() < gmQueueSize;
}
bool
isGMStRespFIFOWrRdy() const
{
return gmReturnedStores.size() < gmQueueSize;
}
bool
isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
{
@@ -114,7 +96,6 @@ class GlobalMemPipeline
const std::string &name() const { return _name; }
void regStats();
void
incLoadVRFBankConflictCycles(int num_cycles)
{
@@ -122,12 +103,15 @@ class GlobalMemPipeline
}
bool coalescerReady(GPUDynInstPtr mp) const;
bool outstandingReqsCheck(GPUDynInstPtr mp) const;
void acqCoalescerToken(GPUDynInstPtr mp);
private:
ComputeUnit *computeUnit;
std::string _name;
int gmQueueSize;
bool outOfOrderDataDelivery;
int maxWaveRequests;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
@@ -143,12 +127,11 @@ class GlobalMemPipeline
int globalMemSize;
/*
* this buffer holds the memory responses when in-order data
* deilvery is used - the responses are ordered by their unique
* sequence number, which is monotonically increasing. when a
* memory request returns its "done" flag is set to true. during
* each tick the the GM pipeline will check if the oldest request
* is finished, and if so it will be removed from the queue.
* This buffer holds the memory responses in order data - the responses
* are ordered by their unique sequence number, which is monotonically
* increasing. When a memory request returns its "done" flag is set to
* true. During each tick the GM pipeline will check if the oldest
* request is finished, and if so it will be removed from the queue.
*
* key: memory instruction's sequence ID
*
@@ -161,14 +144,6 @@ class GlobalMemPipeline
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
// Globa Store Response FIFO: all responses of global memory
// stores are sent to this FIFO from TCP
std::queue<GPUDynInstPtr> gmReturnedStores;
// Global Load Response FIFO: all responses of global memory
// loads are sent to this FIFO from TCP
std::queue<GPUDynInstPtr> gmReturnedLoads;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__

View File

@@ -0,0 +1,215 @@
/*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
#include "gpu-compute/gpu_command_processor.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUKernelInfo.hh"
#include "gpu-compute/dispatcher.hh"
#include "params/GPUCommandProcessor.hh"
GPUCommandProcessor::GPUCommandProcessor(const Params *p)
    : HSADevice(p), dispatcher(*p->dispatcher)
{
    // register this CP with its dispatcher so the dispatcher can call
    // back into the CP (see dispatchPkt(), which sends tasks the other
    // way)
    dispatcher.setCommandProcessor(this);
}
/**
* submitDispatchPkt() is the entry point into the CP from the HSAPP
* and is only meant to be used with AQL kernel dispatch packets.
* After the HSAPP receives and extracts an AQL packet, it sends
* it to the CP, which is responsible for gathering all relevant
* information about a task, initializing CU state, and sending
* it to the dispatcher for WG creation and dispatch.
*
* First we need capture all information from the the AQL pkt and
* the code object, then store it in an HSAQueueEntry. Once the
* packet and code are extracted, we extract information from the
* queue descriptor that the CP needs to perform state initialization
* on the CU. Finally we call dispatch() to send the task to the
* dispatcher. When the task completely finishes, we call finishPkt()
* on the HSA packet processor in order to remove the packet from the
* queue, and notify the runtime that the task has completed.
*/
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    // monotonically increasing ID assigned to each dispatched task
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    /**
     * we need to read a pointer in the application's address
     * space to pull out the kernel code descriptor.
     */
    auto *tc = sys->threads[0];
    auto &virt_proxy = tc->getVirtProxy();

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary, and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field,
     * which provides the byte offset (positive or negative) from the
     * address of the amd_kernel_code_t to the start of the machine
     * instructions.
     */
    AMDKernelCode akc;
    virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
        sizeof(AMDKernelCode));

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
        "kernel object\n", akc.kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc.kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
        machine_code_addr);

    Addr kern_name_addr(0);
    /**
     * NOTE(review): the 0x10 and 0x8 constants below appear to walk
     * the runtime loader's kernel symbol record to find the pointer
     * to the kernel's name string -- confirm against the loader's
     * symbol layout for the supported ROCm runtime version.
     */
    virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
        (uint8_t*)&kern_name_addr, 0x8);

    std::string kernel_name;
    virt_proxy.readString(kernel_name, kern_name_addr);

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
        disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
        disp_pkt->grid_size_x, disp_pkt->grid_size_y,
        disp_pkt->grid_size_z, disp_pkt->kernarg_address,
        disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
        "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
        "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
        task->numScalarRegs(), task->codeAddr(), 0, 0);

    // begin gathering the remaining ABI state for this task; per the
    // comment above, dispatch happens once that setup is complete
    initABI(task);

    ++dynamic_task_id;
}
/**
* submitVendorPkt() is for accepting vendor-specific packets from
* the HSAPP. Vendor-specific packets may be used by the runtime to
* send commands to the HSA device that are specific to a particular
* vendor. The vendor-specific packets should be defined by the vendor
* in the runtime.
*/
/**
* TODO: For now we simply tell the HSAPP to finish the packet,
* however a future patch will update this method to provide
* the proper handling of any required vendor-specific packets.
* In the version of ROCm that is currently supported (1.6)
* the runtime will send packets that direct the CP to
* invalidate the GPUs caches. We do this automatically on
* each kernel launch in the CU, so this is safe for now.
*/
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    // vendor-specific packets are not interpreted yet (see the TODO
    // above); simply retire the packet so the queue can make progress.
    // host_pkt_addr is unused here.
    hsaPP->finishPkt(raw_pkt, queue_id);
}
/**
* Once the CP has finished extracting all relevant information about
* a task and has initialized the ABI state, we send a description of
* the task to the dispatcher. The dispatcher will create and dispatch
* WGs to the CUs.
*/
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    // hand the fully-initialized task off to the dispatcher for WG
    // creation and dispatch to the CUs
    dispatcher.dispatch(task);
}
/**
* The CP is responsible for traversing all HSA-ABI-related data
* structures from memory and initializing the ABI state.
* Information provided by the MQD, AQL packet, and code object
* metadata will be used to initialize register file state.
*/
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    // begin ABI state initialization by DMAing the dispatch ID offset
    // from host memory; the DMA completion event carries the setup
    // chain forward
    auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    // NOTE(review): the value is read one pointer-size past the host
    // read index pointer in the queue descriptor's memory -- confirm
    // this offset against the HSA queue scratch layout
    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
        sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
        &readDispIdOffEvent->readDispIdOffset);
}
System*
GPUCommandProcessor::system()
{
    // Accessor for the System this CP belongs to.
    return sys;
}
AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    // The CP exposes no memory-mapped register ranges of its own.
    return AddrRangeList();
}
void
GPUCommandProcessor::setShader(Shader *shader)
{
    // Bind the shader (CU array) this CP will dispatch kernels to.
    _shader = shader;
}
Shader*
GPUCommandProcessor::shader()
{
    // Accessor for the shader bound via setShader().
    return _shader;
}
// Standard gem5 SimObject factory method for GPUCommandProcessor.
GPUCommandProcessor*
GPUCommandProcessorParams::create()
{
    return new GPUCommandProcessor(this);
}

View File

@@ -0,0 +1,165 @@
/*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
/**
* @file
* The GPUCommandProcessor (CP) is responsible for accepting commands, in
* the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP
* works with several components, including the HSAPP and the dispatcher.
* When the HSAPP sends a ready task to the CP, it will perform the necessary
* operations to extract relevant data structures from memory, such as the
* AQL queue descriptor and AQL packet, and initializes register state for the
* task's wavefronts.
*/
#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
#include "dev/hsa/hsa_device.hh"
#include "gpu-compute/hsa_queue_entry.hh"
struct GPUCommandProcessorParams;
class GPUDispatcher;
class Shader;
class GPUCommandProcessor : public HSADevice
{
  public:
    typedef GPUCommandProcessorParams Params;

    GPUCommandProcessor() = delete;
    GPUCommandProcessor(const Params *p);

    /** Bind/query the shader (CU array) this CP dispatches work to. */
    void setShader(Shader *shader);
    Shader* shader();

    /** Accept a kernel dispatch AQL packet from the HSA packet processor. */
    void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                           Addr host_pkt_addr) override;
    /** Accept a vendor-specific AQL packet (currently finished as a no-op). */
    void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                         Addr host_pkt_addr) override;
    /** Hand a fully initialized task off to the dispatcher. */
    void dispatchPkt(HSAQueueEntry *task);

    // The CP exposes no MMIO registers, so reads/writes are no-ops and
    // it claims no address ranges.
    Tick write(PacketPtr pkt) override { return 0; }
    Tick read(PacketPtr pkt) override { return 0; }
    AddrRangeList getAddrRanges() const override;
    System *system();

  private:
    Shader *_shader;           // CU array that executes dispatched kernels
    GPUDispatcher &dispatcher; // creates and dispatches WGs for ready tasks

    /** Walk the HSA-ABI data structures and initialize ABI state. */
    void initABI(HSAQueueEntry *task);

    /**
     * Perform a DMA read of the read_dispatch_id_field_base_byte_offset
     * field, which follows directly after the read_dispatch_id (the read
     * pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor
     * (MQD)), to find the base address of the MQD. The MQD is the runtime's
     * soft representation of a HW queue descriptor (HQD).
     *
     * Any fields below the read dispatch ID in the amd_hsa_queue_t should
     * not change according to the HSA standard, therefore we should be able
     * to get them based on their known relative position to the read dispatch
     * ID.
     */
    class ReadDispIdOffsetDmaEvent : public DmaCallback
    {
      public:
        ReadDispIdOffsetDmaEvent(GPUCommandProcessor &gpu_cmd_proc,
                                 HSAQueueEntry *task)
            : DmaCallback(), readDispIdOffset(0), gpuCmdProc(gpu_cmd_proc),
              _task(task)
        {
        }

        void
        process() override
        {
            /**
             * Now that the read pointer's offset from the base of
             * the MQD is known, we can use that to calculate the
             * the address of the MQD itself, the dispatcher will
             * DMA that into the HSAQueueEntry when a kernel is
             * launched.
             */
            _task->hostAMDQueueAddr
                = gpuCmdProc.hsaPP->getQueueDesc(_task->queueId())
                    ->hostReadIndexPtr - readDispIdOffset;

            /**
             * DMA a copy of the MQD into the task. Some fields of
             * the MQD will be used to initialize register state.
             */
            auto *mqdDmaEvent = new MQDDmaEvent(gpuCmdProc, _task);
            gpuCmdProc.dmaReadVirt(_task->hostAMDQueueAddr,
                                   sizeof(_amd_queue_t), mqdDmaEvent,
                                   &_task->amdQueue);
        }

        // DMA destination buffer; valid once process() runs.
        uint32_t readDispIdOffset;

      private:
        GPUCommandProcessor &gpuCmdProc;
        HSAQueueEntry *_task;
    };

    /**
     * Perform a DMA read of the MQD that corresponds to a hardware
     * queue descriptor (HQD). We store a copy of the MQD in the
     * HSAQueueEntry object so we can send a copy of it along with
     * a dispatch packet, which is needed to initialize register
     * state.
     */
    class MQDDmaEvent : public DmaCallback
    {
      public:
        MQDDmaEvent(GPUCommandProcessor &gpu_cmd_proc, HSAQueueEntry *task)
            : DmaCallback(), gpuCmdProc(gpu_cmd_proc), _task(task)
        {
        }

        void
        process() override
        {
            // MQD is now resident in the task; dispatch it.
            gpuCmdProc.dispatchPkt(_task);
        }

      private:
        GPUCommandProcessor &gpuCmdProc;
        HSAQueueEntry *_task;
    };
};
#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__

View File

@@ -0,0 +1,417 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
* Anthony Gutierrez
*/
#include "gpu-compute/gpu_compute_driver.hh"
#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "dev/hsa/hsa_device.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_ioctl.h"
#include "params/GPUComputeDriver.hh"
#include "sim/syscall_emul_buf.hh"
// Construct the emulated KFD (Kernel Fusion Driver) device for the GPU
// agent; all interesting work happens in ioctl().
GPUComputeDriver::GPUComputeDriver(Params *p)
    : HSADriver(p)
{
    DPRINTF(GPUDriver, "Constructing KFD: device\n");
}
/**
 * Emulated KFD ioctl entry point. Decodes the ioctl request number and
 * services it against the simulated GPU device. Requests the supported
 * ROCm runtime does not strictly need are stubbed out with warnings.
 *
 * @param tc thread context of the calling (CPU) thread
 * @param req ioctl request number (AMDKFD_IOC_*)
 * @param ioc_buf guest virtual address of the ioctl argument buffer
 * @return 0; every serviced request reports success
 */
int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
    auto &virt_proxy = tc->getVirtProxy();

    switch (req) {
        case AMDKFD_IOC_GET_VERSION:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

            TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
            args->major_version = 1;
            args->minor_version = 0;

            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_CREATE_QUEUE:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
          }
          break;
        case AMDKFD_IOC_DESTROY_QUEUE:
          {
            TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
                    "queue offset %d\n", args->queue_id);
            device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id);
          }
          break;
        case AMDKFD_IOC_SET_MEMORY_POLICY:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
          }
          break;
        case AMDKFD_IOC_GET_CLOCK_COUNTERS:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

            TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            // Set nanosecond resolution
            args->system_clock_freq = 1000000000;

            /**
             * Derive all clock counters based on the tick. All
             * device clocks are identical and perfectly in sync.
             */
            uint64_t elapsed_nsec = curTick() / SimClock::Int::ns;
            args->gpu_clock_counter = elapsed_nsec;
            args->cpu_clock_counter = elapsed_nsec;
            args->system_clock_counter = elapsed_nsec;

            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_GET_PROCESS_APERTURES:
          {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_args> args(ioc_buf);
            args->num_of_nodes = 1;

            /**
             * Set the GPUVM/LDS/Scratch APEs exactly as they
             * are in the real driver, see the KFD driver
             * in the ROCm Linux kernel source:
             * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
             */
            for (int i = 0; i < args->num_of_nodes; ++i) {
                /**
                 * While the GPU node numbers start at 0, we add 1
                 * to force the count to start at 1. This is to
                 * ensure that the base/limit addresses are
                 * calculated correctly.
                 */
                args->process_apertures[i].scratch_base
                    = scratchApeBase(i + 1);
                args->process_apertures[i].scratch_limit =
                    scratchApeLimit(args->process_apertures[i].scratch_base);

                args->process_apertures[i].lds_base = ldsApeBase(i + 1);
                args->process_apertures[i].lds_limit =
                    ldsApeLimit(args->process_apertures[i].lds_base);

                args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1);
                args->process_apertures[i].gpuvm_limit =
                    gpuVmApeLimit(args->process_apertures[i].gpuvm_base);

                // NOTE: Must match ID populated by hsaTopology.py
                args->process_apertures[i].gpu_id = 2765;

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
                DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_limit);

                DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_base);
                DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_limit);

                DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_base);
                DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_limit);

                /**
                 * The CPU's 64b address space can only use the
                 * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0,
                 * therefore we must ensure that the apertures do not
                 * fall in the CPU's address space.
                 */
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
                       47) != 0);
            }

            args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_UPDATE_QUEUE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
          }
          break;
        case AMDKFD_IOC_CREATE_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n");
          }
          break;
        case AMDKFD_IOC_DESTROY_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
          }
          break;
        case AMDKFD_IOC_SET_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n");
          }
          break;
        case AMDKFD_IOC_RESET_EVENT:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
          }
          break;
        case AMDKFD_IOC_WAIT_EVENTS:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
          }
          break;
        case AMDKFD_IOC_DBG_REGISTER:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
          }
          break;
        case AMDKFD_IOC_DBG_UNREGISTER:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
          }
          break;
        case AMDKFD_IOC_DBG_ADDRESS_WATCH:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
          }
          break;
        case AMDKFD_IOC_DBG_WAVE_CONTROL:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
          }
          break;
        case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
          }
          break;
        case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
          }
          break;
        case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
          }
          break;
        case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
          }
          // FIX: a break was missing here, so this case fell through and
          // also emitted the ALLOC_MEMORY_OF_SCRATCH warning.
          break;
        case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n");
          }
          break;
        case AMDKFD_IOC_SET_CU_MASK:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
          }
          break;
        case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE"
                 "\n");
          }
          break;
        case AMDKFD_IOC_SET_TRAP_HANDLER:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
          }
          break;
        case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
          {
            DPRINTF(GPUDriver,
                    "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                ioc_args(ioc_buf);

            ioc_args.copyIn(virt_proxy);
            ioc_args->num_of_nodes = 1;

            for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                TypedBufferArg<kfd_process_device_apertures> ape_args
                    (ioc_args->kfd_process_device_apertures_ptr);

                ape_args->scratch_base = scratchApeBase(i + 1);
                ape_args->scratch_limit =
                    scratchApeLimit(ape_args->scratch_base);
                ape_args->lds_base = ldsApeBase(i + 1);
                ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);
                ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);

                // must match ID populated by hsaTopology.py
                ape_args->gpu_id = 2765;

                // the apertures must not overlap the CPU's address space
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0);

                ape_args.copyOut(virt_proxy);
            }

            ioc_args.copyOut(virt_proxy);
          }
          break;
        case AMDKFD_IOC_GET_DMABUF_INFO:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
          }
          break;
        case AMDKFD_IOC_IMPORT_DMABUF:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
          }
          break;
        case AMDKFD_IOC_GET_TILE_CONFIG:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
          }
          break;
        case AMDKFD_IOC_IPC_IMPORT_HANDLE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n");
          }
          break;
        case AMDKFD_IOC_IPC_EXPORT_HANDLE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n");
          }
          break;
        case AMDKFD_IOC_CROSS_MEMORY_COPY:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n");
          }
          break;
        case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE:
          {
            warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n");
          }
          break;
        default:
          // FIX: the format string has two specifiers (%s, %d) but only
          // one argument was passed; supply __func__ for the %s.
          fatal("%s: bad ioctl %d\n", __func__, req);
          break;
    }
    return 0;
}
// Base of the GPUVM aperture: the per-GPU slot selected by bits [63:61]
// of the VA, offset by 2^48 (matches the real KFD's aperture macros).
Addr
GPUComputeDriver::gpuVmApeBase(int gpuNum) const
{
    Addr gpu_slot = static_cast<Addr>(gpuNum) << 61;
    return gpu_slot + 0x1000000000000L;
}
// Limit of the GPUVM aperture: keep the base's upper bits and saturate
// the low 40 bits of the address.
Addr
GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
{
    const Addr low_mask = 0xFFFFFFFFFFL;
    return (apeBase & ~low_mask) | low_mask;
}
// Base of the scratch (private segment) aperture: per-GPU slot in bits
// [63:61], offset by 2^32.
Addr
GPUComputeDriver::scratchApeBase(int gpuNum) const
{
    Addr gpu_slot = static_cast<Addr>(gpuNum) << 61;
    return gpu_slot + 0x100000000L;
}
// Limit of the scratch aperture: keep the base's upper bits and saturate
// the low 32 bits.
Addr
GPUComputeDriver::scratchApeLimit(Addr apeBase) const
{
    const Addr low_mask = 0xFFFFFFFFUL;
    return (apeBase & ~low_mask) | low_mask;
}
// Base of the LDS (group segment) aperture: the per-GPU slot itself,
// with no additional offset.
Addr
GPUComputeDriver::ldsApeBase(int gpuNum) const
{
    return static_cast<Addr>(gpuNum) << 61;
}
// Limit of the LDS aperture: keep the base's upper bits and saturate the
// low 32 bits.
Addr
GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
    const Addr low_mask = 0xFFFFFFFFUL;
    return (apeBase & ~low_mask) | low_mask;
}
// Standard gem5 SimObject factory method for GPUComputeDriver.
GPUComputeDriver*
GPUComputeDriverParams::create()
{
    return new GPUComputeDriver(this);
}

View File

@@ -0,0 +1,83 @@
/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Sooraj Puthoor
* Anthony Gutierrez
*/
/**
* @file
* The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
* agent. Other GPU devices, or other HSA agents, should not derive
* from this class. Instead device-specific implementations of an
* HSADriver should be provided for each unique device.
*/
#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#include "dev/hsa/hsa_driver.hh"
struct GPUComputeDriverParams;
class GPUComputeDriver final : public HSADriver
{
  public:
    typedef GPUComputeDriverParams Params;
    GPUComputeDriver(Params *p);

    /** Emulated KFD ioctl interface for the GPU agent (AMDKFD_IOC_*). */
    int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

  private:
    /**
     * The aperture (APE) base/limit pairs are set
     * statically at startup by the real KFD. AMD
     * x86_64 CPUs only use the areas in the 64b
     * address space where VA[63:47] == 0x1ffff or
     * VA[63:47] = 0. These methods generate the APE
     * base/limit pairs in exactly the same way as
     * the real KFD does, which ensures these APEs do
     * not fall into the CPU's address space
     *
     * see the macros in the KFD driver in the ROCm
     * Linux kernel source:
     *
     * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
     */
    Addr gpuVmApeBase(int gpuNum) const;
    Addr gpuVmApeLimit(Addr apeBase) const;
    Addr scratchApeBase(int gpuNum) const;
    Addr scratchApeLimit(Addr apeBase) const;
    Addr ldsApeBase(int gpuNum) const;
    Addr ldsApeLimit(Addr apeBase) const;
};
#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__

View File

@@ -35,26 +35,50 @@
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *static_inst, uint64_t instSeqNum)
: GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
n_reg(0), useContinuation(false),
statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum)
GPUStaticInst *static_inst, InstSeqNum instSeqNum)
: GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
(Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
_staticInst(static_inst), _seqNum(instSeqNum)
{
tlbHitLevel.assign(computeUnit()->wfSize(), -1);
d_data = new uint8_t[computeUnit()->wfSize() * 16];
// vector instructions can have up to 4 source/destination operands
d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
a_data = new uint8_t[computeUnit()->wfSize() * 8];
x_data = new uint8_t[computeUnit()->wfSize() * 8];
// scalar loads can read up to 16 Dwords of data (see publicly
// available GCN3 ISA manual)
scalar_data = new uint8_t[16 * sizeof(uint32_t)];
for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
scalar_data[i] = 0;
}
for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
a_data[i] = 0;
x_data[i] = 0;
}
for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
d_data[i] = 0;
}
time = 0;
cu_id = _cu->cu_id;
if (_wf) {
simdId = _wf->simdId;
wfDynId = _wf->wfDynId;
kern_id = _wf->kernId;
wg_id = _wf->wgId;
wfSlotId = _wf->wfSlotId;
} else {
simdId = -1;
wfDynId = -1;
kern_id = -1;
wg_id = -1;
wfSlotId = -1;
}
}
GPUDynInst::~GPUDynInst()
@@ -62,6 +86,8 @@ GPUDynInst::~GPUDynInst()
delete[] d_data;
delete[] a_data;
delete[] x_data;
delete[] scalar_data;
delete _staticInst;
}
void
@@ -82,6 +108,36 @@ GPUDynInst::numDstRegOperands()
return _staticInst->numDstRegOperands();
}
// Number of vector-register source operands; forwarded to the decoded
// static instruction.
int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}
// Number of vector-register destination operands; forwarded to the
// decoded static instruction.
int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}
// Total DWORDs read from vector-register sources; forwarded to the
// decoded static instruction.
int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}
// Total DWORDs written to vector-register destinations; forwarded to
// the decoded static instruction.
int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}
// DWORD width of a single operand, by operand index; forwarded to the
// decoded static instruction.
int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}
int
GPUDynInst::getNumOperands()
{
@@ -100,12 +156,6 @@ GPUDynInst::isScalarRegister(int operandIdx)
return _staticInst->isScalarRegister(operandIdx);
}
bool
GPUDynInst::isCondRegister(int operandIdx)
{
return _staticInst->isCondRegister(operandIdx);
}
int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
@@ -130,13 +180,82 @@ GPUDynInst::isSrcOperand(int operandIdx)
return _staticInst->isSrcOperand(operandIdx);
}
bool
GPUDynInst::hasSourceSgpr() const
{
    // Scan the operand list for any scalar register used as a source.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isSrcOperand(opd))
            continue;
        if (_staticInst->isScalarRegister(opd))
            return true;
    }
    return false;
}
bool
GPUDynInst::hasSourceVgpr() const
{
    // Scan the operand list for any vector register used as a source.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isSrcOperand(opd))
            continue;
        if (_staticInst->isVectorRegister(opd))
            return true;
    }
    return false;
}
bool
GPUDynInst::hasDestinationSgpr() const
{
    // Scan the operand list for any scalar register written as a
    // destination.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isDstOperand(opd))
            continue;
        if (_staticInst->isScalarRegister(opd))
            return true;
    }
    return false;
}
bool
GPUDynInst::srcIsVgpr(int index) const
{
    // True when the given operand slot is a vector register used as a
    // source.
    assert(index >= 0 && index < _staticInst->getNumOperands());
    return _staticInst->isVectorRegister(index)
        && _staticInst->isSrcOperand(index);
}
bool
GPUDynInst::hasDestinationVgpr() const
{
    // Scan the operand list for any vector register written as a
    // destination.
    const int num_opds = _staticInst->getNumOperands();
    for (int opd = 0; opd < num_opds; ++opd) {
        if (!_staticInst->isDstOperand(opd))
            continue;
        if (_staticInst->isVectorRegister(opd))
            return true;
    }
    return false;
}
// Substring match: true when the disassembled opcode contains BOTH the
// opcode string and the extension string anywhere within it.
bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
        _staticInst->opcode().find(extStr) != std::string::npos;
}
// Substring match: true when the disassembled opcode contains the given
// string anywhere within it.
bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}
// Human-readable disassembly of the underlying static instruction.
const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}
uint64_t
InstSeqNum
GPUDynInst::seqNum() const
{
return _seqNum;
@@ -148,6 +267,40 @@ GPUDynInst::executedAs()
return _staticInst->executed_as;
}
// Report a RAW hazard on vector registers between this instruction (the
// reader) and instruction s (the writer).
//
// NOTE(review): the inner check compares operand SLOT indices (i == j),
// not register numbers via getRegisterIndex() — this only detects a
// dependence when the source and destination happen to occupy the same
// operand position. Looks suspicious; confirm intent before relying on it.
bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}
// Report a RAW hazard on scalar registers between this instruction (the
// reader) and instruction s (the writer).
//
// NOTE(review): same caveat as hasVgprRawDependence — operand slot
// indices are compared (i == j), not register numbers; confirm intent.
bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}
// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
@@ -156,12 +309,15 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
cu->cu_id, simdId, wfSlotId, exec_mask);
_staticInst->initiateAcc(gpuDynInst);
time = 0;
}
void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
"%#x\n complete",
cu->cu_id, simdId, wfSlotId, exec_mask);
_staticInst->completeAcc(gpuDynInst);
}
@@ -181,12 +337,42 @@ GPUDynInst::isBranch() const
return _staticInst->isBranch();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}
// Instruction-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}
bool
GPUDynInst::isReturn() const
{
@@ -218,9 +404,9 @@ GPUDynInst::isBarrier() const
}
bool
GPUDynInst::isMemFence() const
GPUDynInst::isMemSync() const
{
return _staticInst->isMemFence();
return _staticInst->isMemSync();
}
bool
@@ -265,6 +451,12 @@ GPUDynInst::isAtomicRet() const
return _staticInst->isAtomicRet();
}
// A dynamic instruction is a vector instruction iff its static
// instruction is not scalar.
bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}
bool
GPUDynInst::isScalar() const
{
@@ -295,6 +487,78 @@ GPUDynInst::writesVCC() const
return _staticInst->writesVCC();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}
// Register-access predicate; forwarded to the static instruction.
bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}
// True when the instruction executes regardless of the EXEC mask;
// forwarded to the static instruction.
bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}
/**
 * True when any destination operand is the EXEC-mask register.
 *
 * FIX: the original loop body contained an unconditional return, so only
 * operand 0 was ever examined and an EXEC-mask destination in any later
 * operand slot was missed. Now all operands are scanned.
 */
bool
GPUDynInst::writesExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isDstOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}
/**
 * True when any source operand is the EXEC-mask register.
 *
 * FIX: the original loop body contained an unconditional return, so only
 * operand 0 was ever examined and an EXEC-mask source in any later
 * operand slot was missed. Now all operands are scanned.
 */
bool
GPUDynInst::readsExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isSrcOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}
/**
 * True when any scalar destination operand is the flat-scratch register.
 *
 * FIX: the original returned the verdict of the FIRST scalar destination
 * operand encountered, so a flat-scratch write in a later scalar operand
 * slot was reported as false. Now all operands are scanned.
 */
bool
GPUDynInst::writesFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) &&
            _staticInst->isDstOperand(i) &&
            _staticInst->isFlatScratchRegister(i)) {
            return true;
        }
    }
    return false;
}
/**
 * True when any scalar source operand is the flat-scratch register.
 *
 * FIX: the original returned the verdict of the FIRST scalar source
 * operand encountered, so a flat-scratch read in a later scalar operand
 * slot was reported as false. Now all operands are scanned.
 */
bool
GPUDynInst::readsFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) &&
            _staticInst->isSrcOperand(i) &&
            _staticInst->isFlatScratchRegister(i)) {
            return true;
        }
    }
    return false;
}
bool
GPUDynInst::isAtomicAnd() const
{
@@ -420,72 +684,6 @@ GPUDynInst::isSpillSeg() const
return _staticInst->isSpillSeg();
}
bool
GPUDynInst::isWorkitemScope() const
{
return _staticInst->isWorkitemScope();
}
bool
GPUDynInst::isWavefrontScope() const
{
return _staticInst->isWavefrontScope();
}
bool
GPUDynInst::isWorkgroupScope() const
{
return _staticInst->isWorkgroupScope();
}
bool
GPUDynInst::isDeviceScope() const
{
return _staticInst->isDeviceScope();
}
bool
GPUDynInst::isSystemScope() const
{
return _staticInst->isSystemScope();
}
bool
GPUDynInst::isNoScope() const
{
return _staticInst->isNoScope();
}
bool
GPUDynInst::isRelaxedOrder() const
{
return _staticInst->isRelaxedOrder();
}
bool
GPUDynInst::isAcquire() const
{
return _staticInst->isAcquire();
}
bool
GPUDynInst::isRelease() const
{
return _staticInst->isRelease();
}
bool
GPUDynInst::isAcquireRelease() const
{
return _staticInst->isAcquireRelease();
}
bool
GPUDynInst::isNoOrder() const
{
return _staticInst->isNoOrder();
}
bool
GPUDynInst::isGloballyCoherent() const
{
@@ -498,12 +696,240 @@ GPUDynInst::isSystemCoherent() const
return _staticInst->isSystemCoherent();
}
// Data-type predicate; forwarded to the static instruction.
bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}
// Data-type predicate; forwarded to the static instruction.
bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}
// Data-type predicate; forwarded to the static instruction.
bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}
// Arithmetic-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}
// Arithmetic-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}
// Arithmetic-class predicate; forwarded to the static instruction.
bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}
/**
 * Determine which memory segment (group/private/global) a flat access
 * resolves to by checking each active lane's address against the LDS,
 * scratch, and GPUVM apertures, recording the result in the static
 * instruction's executed_as field, then verifying that every active
 * lane's address falls in the same segment.
 */
void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());

    // find the segment of the first active address, after
    // that we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // we are in the "hole", this is a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. because we don't have an
                // explicit range of the global segment, we just make
                // sure that the address fall in no other APE and that
                // it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                       && bits(addr[lane], 63, 47)));
            }
        }
    }
}
void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);

    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below. Flat instructions
    //    reserve slots in both the global-memory (Gm) and local-memory
    //    (Lm) pipelines, so the pipeline that ends up unused must have
    //    its reservations released here.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        // resolved as global: release the unused Lm reservations
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        // resolved as group (LDS): release the unused Gm reservations
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures - registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped. The value of which is set by the kernel mode driver.
         * These apertures use addresses that are not used by x86 CPUs.
         * When the address of a Flat operation falls into one of the
         * apertures, the Flat operation is redirected to either LDS or
         * to the private memory segment.
         *
         * For private memory the SW runtime will allocate some space in
         * the VA space for each AQL queue. The base address of which is
         * stored in scalar registers per the AMD GPU ABI. The amd_queue_t
         * scratch_backing_memory_location provides the base address in
         * memory for the queue's private segment. Various other fields
         * loaded into register state during kernel launch specify per-WF
         * and per-work-item offsets so that individual lanes may access
         * their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */
        uint32_t numSgprs = wavefront()->maxSgprs;
        // the wave's scratch offset and per-lane scratch size are read
        // from the last SGPRs allocated to the wavefront (per the ABI
        // described above)
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // rebase the flat address into the scratch backing
                // store: per-lane offset within the wave's scratch
                // allocation, plus the hidden private base, minus the
                // scratch aperture base
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        // release the unused Gm reservations
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        // doApertureCheck() should have resolved the segment or
        // fataled; any other segment value here is a bug
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}
TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    // Forward to the static instruction, which owns the decoded
    // literal constant from the instruction encoding.
    const auto literal = _staticInst->srcLiteral();
    return literal;
}
void
GPUDynInst::updateStats()
{
    // Bump the compute unit's dynamic memory-instruction counters based
    // on the memory space this instruction targets. Flat instructions
    // are counted separately because their segment is only resolved at
    // execution time. (A stray diff-hunk marker line that had been
    // embedded in this function body was removed.)
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory
        cu->dynamicGMemInstrCnt++;
    }
}
void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Record the timestamp for this hop unless one was already taken;
    // coalesced requests keep only the first measurement per hop.
    const bool alreadyRecorded = roundTripTime.size() > hopId;
    if (!alreadyRecorded) {
        roundTripTime.push_back(currentTime);
    }
}
void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    // Record, per cache-line address, the tick at which the request
    // reaches each hop. Only the first measurement per hop is kept
    // (coalesced requests may revisit a hop), and a new entry is only
    // created at the first hop (hopId == 0).
    //
    // Use a single find() instead of count() followed by repeated
    // operator[] lookups; same behavior, one map traversal.
    auto iter = lineAddressTime.find(addr);
    if (iter != lineAddressTime.end()) {
        auto &hopTimes = iter->second;
        if (hopTimes.size() > hopId) {
            // this hop already has a measurement
            return;
        }
        hopTimes.push_back(currentTime);
    } else if (hopId == 0) {
        // first sighting of this line address; start its timing vector
        lineAddressTime.emplace(addr, std::vector<Tick>{ currentTime });
    }
}

View File

@@ -39,7 +39,6 @@
#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
@@ -68,20 +67,10 @@ class AtomicOpCAS : public TypedAtomicOpFunctor<T>
} else {
computeUnit->numFailedCASOps++;
}
if (computeUnit->xact_cas_mode) {
computeUnit->xactCasLoadMap.clear();
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
typedef enum
{
VT_32,
VT_64,
} vgpr_type;
class GPUDynInst : public GPUExecContext
{
public:
@@ -91,27 +80,51 @@ class GPUDynInst : public GPUExecContext
void execute(GPUDynInstPtr gpuDynInst);
int numSrcRegOperands();
int numDstRegOperands();
int numDstVecOperands();
int numSrcVecOperands();
int numSrcVecDWORDs();
int numDstVecDWORDs();
int numOpdDWORDs(int operandIdx);
int getNumOperands();
bool isVectorRegister(int operandIdx);
bool isScalarRegister(int operandIdx);
bool isCondRegister(int operandIdx);
int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
int getOperandSize(int operandIdx);
bool isDstOperand(int operandIdx);
bool isSrcOperand(int operandIdx);
bool hasDestinationSgpr() const;
bool hasSourceSgpr() const;
bool hasDestinationVgpr() const;
bool hasSourceVgpr() const;
bool hasSgprRawDependence(GPUDynInstPtr s);
bool hasVgprRawDependence(GPUDynInstPtr s);
// returns true if the string "opcodeStr" is found in the
// opcode of the instruction
bool isOpcode(const std::string& opcodeStr) const;
bool isOpcode(const std::string& opcodeStr,
const std::string& extStr) const;
// returns true if source operand at "index" is a vector register
bool srcIsVgpr(int index) const;
const std::string &disassemble() const;
uint64_t seqNum() const;
InstSeqNum seqNum() const;
Enums::StorageClassType executedAs();
// The address of the memory operation
// virtual address for scalar memory operations
Addr scalarAddr;
// virtual addressies for vector memory operations
std::vector<Addr> addr;
Addr pAddr;
// The data to get written
// vector data to get written
uint8_t *d_data;
// scalar data to be transferred
uint8_t *scalar_data;
// Additional data (for atomics)
uint8_t *a_data;
// Additional data (for atomics)
@@ -119,19 +132,6 @@ class GPUDynInst : public GPUExecContext
// The execution mask
VectorMask exec_mask;
// The memory type (M_U32, M_S32, ...)
Enums::MemType m_type;
// The equivalency class
int equiv;
// The return VGPR type (VT_32 or VT_64)
vgpr_type v_type;
// Number of VGPR's accessed (1, 2, or 4)
int n_reg;
// The return VGPR index
int dst_reg;
// There can be max 4 dest regs>
int dst_reg_vec[4];
// SIMD where the WF of the memory instruction has been mapped to
int simdId;
// unique id of the WF where the memory instruction belongs to
@@ -140,21 +140,16 @@ class GPUDynInst : public GPUExecContext
int kern_id;
// The CU id of the requesting wf
int cu_id;
// The workgroup id of the requesting wf
int wg_id;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
int pipeId;
int execUnitId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
// A list of bank conflicts for the 4 cycles.
uint32_t bc[4];
// A pointer to ROM
uint8_t *rom;
// The size of the READONLY segment
int sz_rom;
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
@@ -168,16 +163,23 @@ class GPUDynInst : public GPUExecContext
GPUStaticInst* staticInstruction() { return _staticInst; }
TheGpuISA::ScalarRegU32 srcLiteral() const;
bool isALU() const;
bool isBranch() const;
bool isCondBranch() const;
bool isNop() const;
bool isReturn() const;
bool isEndOfKernel() const;
bool isKernelLaunch() const;
bool isSDWAInst() const;
bool isDPPInst() const;
bool isUnconditionalJump() const;
bool isSpecialOp() const;
bool isWaitcnt() const;
bool isBarrier() const;
bool isMemFence() const;
bool isMemSync() const;
bool isMemRef() const;
bool isFlat() const;
bool isLoad() const;
@@ -188,10 +190,20 @@ class GPUDynInst : public GPUExecContext
bool isAtomicRet() const;
bool isScalar() const;
bool isVector() const;
bool readsSCC() const;
bool writesSCC() const;
bool readsVCC() const;
bool writesVCC() const;
bool readsEXEC() const;
bool writesEXEC() const;
bool readsMode() const;
bool writesMode() const;
bool ignoreExec() const;
bool readsFlatScratch() const;
bool writesFlatScratch() const;
bool readsExecMask() const;
bool writesExecMask() const;
bool isAtomicAnd() const;
bool isAtomicOr() const;
@@ -217,39 +229,25 @@ class GPUDynInst : public GPUExecContext
bool isReadOnlySeg() const;
bool isSpillSeg() const;
bool isWorkitemScope() const;
bool isWavefrontScope() const;
bool isWorkgroupScope() const;
bool isDeviceScope() const;
bool isSystemScope() const;
bool isNoScope() const;
bool isRelaxedOrder() const;
bool isAcquire() const;
bool isRelease() const;
bool isAcquireRelease() const;
bool isNoOrder() const;
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
/*
* Loads/stores/atomics may have acquire/release semantics associated
* withthem. Some protocols want to see the acquire/release as separate
* requests from the load/store/atomic. We implement that separation
* using continuations (i.e., a function pointer with an object associated
* with it). When, for example, the front-end generates a store with
* release semantics, we will first issue a normal store and set the
* continuation in the GPUDynInst to a function that generate a
* release request. That continuation will be called when the normal
* store completes (in ComputeUnit::DataPort::recvTimingResponse). The
* continuation will be called in the context of the same GPUDynInst
* that generated the initial store.
*/
std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
bool isF16() const;
bool isF32() const;
bool isF64() const;
// when true, call execContinuation when response arrives
bool useContinuation;
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
// for FLAT memory ops. check the segment address
// against the APE registers to see if it falls
// within one of the APE ranges for LDS/SCRATCH/GPUVM.
// if it does not fall into one of the three APEs, it
// will be a regular global access.
void doApertureCheck(const VectorMask &mask);
// Function to resolve a flat accesses during execution stage.
void resolveFlatSegment(const VectorMask &mask);
template<typename c0> AtomicOpFunctorPtr
makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
@@ -282,62 +280,31 @@ class GPUDynInst : public GPUExecContext
}
void
setRequestFlags(RequestPtr req, bool setMemOrder=true)
setRequestFlags(RequestPtr req) const
{
// currently these are the easy scopes to deduce
if (isPrivateSeg()) {
req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
} else if (isSpillSeg()) {
req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
} else if (isGlobalSeg()) {
req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
} else if (isReadOnlySeg()) {
req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
} else if (isGroupSeg()) {
req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
} else if (isFlat()) {
panic("TODO: translate to correct scope");
} else {
fatal("%s has bad segment type\n", disassemble());
if (isGloballyCoherent()) {
req->setCacheCoherenceFlags(Request::GLC_BIT);
}
if (isWavefrontScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::WAVEFRONT_SCOPE);
} else if (isWorkgroupScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::WORKGROUP_SCOPE);
} else if (isDeviceScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::DEVICE_SCOPE);
} else if (isSystemScope()) {
req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
Request::SYSTEM_SCOPE);
} else if (!isNoScope() && !isWorkitemScope()) {
fatal("%s has bad scope type\n", disassemble());
if (isSystemCoherent()) {
req->setCacheCoherenceFlags(Request::SLC_BIT);
}
if (setMemOrder) {
// set acquire and release flags
if (isAcquire()) {
req->setFlags(Request::ACQUIRE);
} else if (isRelease()) {
req->setFlags(Request::RELEASE);
} else if (isAcquireRelease()) {
req->setFlags(Request::ACQUIRE | Request::RELEASE);
} else if (!isNoOrder()) {
fatal("%s has bad memory order\n", disassemble());
}
}
// set atomic type
// currently, the instruction genenerator only produces atomic return
// but a magic instruction can produce atomic no return
if (isAtomicRet()) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (isAtomicNoRet()) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
if (isMemSync()) {
// the path for kernel launch and kernel end is different
// from non-kernel mem sync.
assert(!isKernelLaunch());
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
req->setCacheCoherenceFlags(Request::ACQUIRE);
}
}
// Map returned packets and the addresses they satisfy with which lane they
@@ -348,12 +315,39 @@ class GPUDynInst : public GPUExecContext
// Track the status of memory requests per lane, a bit per lane
VectorMask statusBitVector;
// for ld_v# or st_v#
std::vector<int> statusVector;
std::vector<int> tlbHitLevel;
// for misaligned scalar ops we track the number
// of outstanding reqs here
int numScalarReqs;
Tick getAccessTime() const { return accessTime; }
void setAccessTime(Tick currentTime) { accessTime = currentTime; }
void profileRoundTripTime(Tick currentTime, int hopId);
std::vector<Tick> getRoundTripTime() const { return roundTripTime; }
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
{ return lineAddressTime; }
// inst used to save/restore a wavefront context
bool isSaveRestore;
private:
GPUStaticInst *_staticInst;
uint64_t _seqNum;
const InstSeqNum _seqNum;
// the time the request was started
Tick accessTime = -1;
// hold the tick when the instruction arrives at certain hop points
// on it's way to main memory
std::vector<Tick> roundTripTime;
// hold each cache block address for the instruction and a vector
// to hold the tick when the block arrives at certain hop points
std::map<Addr, std::vector<Tick>> lineAddressTime;
};
#endif // __GPU_DYN_INST_HH__

View File

@@ -59,8 +59,8 @@ GPUExecContext::readMiscReg(int opIdx) const
}
void
GPUExecContext::writeMiscReg(int opIdx, RegVal operandVal)
GPUExecContext::writeMiscReg(int opIdx, RegVal val)
{
assert(gpuISA);
gpuISA->writeMiscReg(opIdx, operandVal);
gpuISA->writeMiscReg(opIdx, val);
}

View File

@@ -34,10 +34,10 @@
#include "gpu-compute/gpu_static_inst.hh"
GPUStaticInst::GPUStaticInst(const std::string &opcode)
: executed_as(Enums::SC_NONE), opcode(opcode),
_instNum(0), _instAddr(0)
: executed_as(Enums::SC_NONE), _opcode(opcode),
_instNum(0), _instAddr(0), srcVecOperands(-1), dstVecOperands(-1),
srcVecDWORDs(-1), dstVecDWORDs(-1)
{
setFlag(NoOrder);
}
const std::string&
@@ -50,3 +50,80 @@ GPUStaticInst::disassemble()
return disassembly;
}
int
GPUStaticInst::numSrcVecOperands()
{
    // Lazily compute and cache the number of vector-register source
    // operands; a negative cached value means "not yet computed".
    // Scalar instructions have no vector operands by definition.
    if (srcVecOperands < 0) {
        int count = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isSrcOperand(opIdx)) {
                    ++count;
                }
            }
        }
        srcVecOperands = count;
    }
    return srcVecOperands;
}
int
GPUStaticInst::numDstVecOperands()
{
    // Lazily compute and cache the number of vector-register destination
    // operands; a negative cached value means "not yet computed".
    // Scalar instructions have no vector operands by definition.
    if (dstVecOperands < 0) {
        int count = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isDstOperand(opIdx)) {
                    ++count;
                }
            }
        }
        dstVecOperands = count;
    }
    return dstVecOperands;
}
int
GPUStaticInst::numSrcVecDWORDs()
{
    // Lazily compute and cache the total DWORD footprint of all
    // vector-register source operands; a negative cached value means
    // "not yet computed". Scalar instructions contribute nothing.
    if (srcVecDWORDs < 0) {
        int totalDWORDs = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isSrcOperand(opIdx)) {
                    totalDWORDs += numOpdDWORDs(opIdx);
                }
            }
        }
        srcVecDWORDs = totalDWORDs;
    }
    return srcVecDWORDs;
}
int
GPUStaticInst::numDstVecDWORDs()
{
    // Lazily compute and cache the total DWORD footprint of all
    // vector-register destination operands; a negative cached value
    // means "not yet computed". Scalar instructions contribute nothing.
    if (dstVecDWORDs < 0) {
        int totalDWORDs = 0;
        if (!isScalar()) {
            for (int opIdx = 0; opIdx < getNumOperands(); ++opIdx) {
                if (isVectorRegister(opIdx) && isDstOperand(opIdx)) {
                    totalDWORDs += numOpdDWORDs(opIdx);
                }
            }
        }
        dstVecDWORDs = totalDWORDs;
    }
    return dstVecDWORDs;
}
int
GPUStaticInst::numOpdDWORDs(int operandIdx)
{
    // Number of 4-byte DWORDs the operand occupies; sub-DWORD operands
    // still consume a full DWORD. Hoist the (virtual) getOperandSize()
    // call into a local so it is evaluated only once.
    const int opSize = getOperandSize(operandIdx);
    return opSize <= 4 ? 1 : opSize / 4;
}

View File

@@ -59,6 +59,7 @@ class GPUStaticInst : public GPUStaticInstFlags
{
public:
GPUStaticInst(const std::string &opcode);
virtual ~GPUStaticInst() { }
void instAddr(int inst_addr) { _instAddr = inst_addr; }
int instAddr() const { return _instAddr; }
int nextInstAddr() const { return _instAddr + instSize(); }
@@ -71,15 +72,18 @@ class GPUStaticInst : public GPUStaticInstFlags
int ipdInstNum() const { return _ipdInstNum; }
virtual TheGpuISA::ScalarRegU32 srcLiteral() const { return 0; }
virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
virtual void generateDisassembly() = 0;
const std::string& disassemble();
virtual int getNumOperands() = 0;
virtual bool isCondRegister(int operandIndex) = 0;
virtual bool isScalarRegister(int operandIndex) = 0;
virtual bool isVectorRegister(int operandIndex) = 0;
virtual bool isSrcOperand(int operandIndex) = 0;
virtual bool isDstOperand(int operandIndex) = 0;
virtual bool isFlatScratchRegister(int opIdx) = 0;
virtual bool isExecMaskRegister(int opIdx) = 0;
virtual int getOperandSize(int operandIndex) = 0;
virtual int getRegisterIndex(int operandIndex,
@@ -88,12 +92,24 @@ class GPUStaticInst : public GPUStaticInstFlags
virtual int numDstRegOperands() = 0;
virtual int numSrcRegOperands() = 0;
virtual bool isValid() const = 0;
virtual int coalescerTokenCount() const { return 0; }
int numDstVecOperands();
int numSrcVecOperands();
int numDstVecDWORDs();
int numSrcVecDWORDs();
int numOpdDWORDs(int operandIdx);
bool isALU() const { return _flags[ALU]; }
bool isBranch() const { return _flags[Branch]; }
bool isCondBranch() const { return _flags[CondBranch]; }
bool isNop() const { return _flags[Nop]; }
bool isReturn() const { return _flags[Return]; }
bool isEndOfKernel() const { return _flags[EndOfKernel]; }
bool isKernelLaunch() const { return _flags[KernelLaunch]; }
bool isSDWAInst() const { return _flags[IsSDWA]; }
bool isDPPInst() const { return _flags[IsDPP]; }
bool
isUnconditionalJump() const
@@ -105,7 +121,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isWaitcnt() const { return _flags[Waitcnt]; }
bool isBarrier() const { return _flags[MemBarrier]; }
bool isMemFence() const { return _flags[MemFence]; }
bool isMemSync() const { return _flags[MemSync]; }
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isLoad() const { return _flags[Load]; }
@@ -125,6 +141,13 @@ class GPUStaticInst : public GPUStaticInstFlags
bool writesSCC() const { return _flags[WritesSCC]; }
bool readsVCC() const { return _flags[ReadsVCC]; }
bool writesVCC() const { return _flags[WritesVCC]; }
// Identify instructions that implicitly read the Execute mask
// as a source operand but not to dictate which threads execute.
bool readsEXEC() const { return _flags[ReadsEXEC]; }
bool writesEXEC() const { return _flags[WritesEXEC]; }
bool readsMode() const { return _flags[ReadsMode]; }
bool writesMode() const { return _flags[WritesMode]; }
bool ignoreExec() const { return _flags[IgnoreExec]; }
bool isAtomicAnd() const { return _flags[AtomicAnd]; }
bool isAtomicOr() const { return _flags[AtomicOr]; }
@@ -166,34 +189,29 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; }
bool isSpillSeg() const { return _flags[SpillSegment]; }
bool isWorkitemScope() const { return _flags[WorkitemScope]; }
bool isWavefrontScope() const { return _flags[WavefrontScope]; }
bool isWorkgroupScope() const { return _flags[WorkgroupScope]; }
bool isDeviceScope() const { return _flags[DeviceScope]; }
bool isSystemScope() const { return _flags[SystemScope]; }
bool isNoScope() const { return _flags[NoScope]; }
bool isRelaxedOrder() const { return _flags[RelaxedOrder]; }
bool isAcquire() const { return _flags[Acquire]; }
bool isRelease() const { return _flags[Release]; }
bool isAcquireRelease() const { return _flags[AcquireRelease]; }
bool isNoOrder() const { return _flags[NoOrder]; }
/**
* Coherence domain of a memory instruction. Only valid for
* machine ISA. The coherence domain specifies where it is
* possible to perform memory synchronization, e.g., acquire
* or release, from the shader kernel.
* Coherence domain of a memory instruction. The coherence domain
* specifies where it is possible to perform memory synchronization
* (e.g., acquire or release) from the shader kernel.
*
* isGloballyCoherent(): returns true if kernel is sharing memory
* with other work-items on the same device (GPU)
* isGloballyCoherent(): returns true if WIs share same device
* isSystemCoherent(): returns true if WIs or threads in different
* devices share memory
*
* isSystemCoherent(): returns true if kernel is sharing memory
* with other work-items on a different device (GPU) or the host (CPU)
*/
bool isGloballyCoherent() const { return _flags[GloballyCoherent]; }
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
// Floating-point instructions
bool isF16() const { return _flags[F16]; }
bool isF32() const { return _flags[F32]; }
bool isF64() const { return _flags[F64]; }
// FMA, MAC, MAD instructions
bool isFMA() const { return _flags[FMA]; }
bool isMAC() const { return _flags[MAC]; }
bool isMAD() const { return _flags[MAD]; }
virtual int instSize() const = 0;
// only used for memory instructions
@@ -217,37 +235,36 @@ class GPUStaticInst : public GPUStaticInstFlags
// For flat memory accesses
Enums::StorageClassType executed_as;
void setFlag(Flags flag) { _flags[flag] = true; }
void setFlag(Flags flag) {
_flags[flag] = true;
virtual void
execLdAcq(GPUDynInstPtr gpuDynInst)
{
fatal("calling execLdAcq() on a non-load instruction.\n");
}
virtual void
execSt(GPUDynInstPtr gpuDynInst)
{
fatal("calling execLdAcq() on a non-load instruction.\n");
}
virtual void
execAtomic(GPUDynInstPtr gpuDynInst)
{
fatal("calling execAtomic() on a non-atomic instruction.\n");
}
virtual void
execAtomicAcq(GPUDynInstPtr gpuDynInst)
{
fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
if (isGroupSeg()) {
executed_as = Enums::SC_GROUP;
} else if (isGlobalSeg()) {
executed_as = Enums::SC_GLOBAL;
} else if (isPrivateSeg()) {
executed_as = Enums::SC_PRIVATE;
} else if (isSpillSeg()) {
executed_as = Enums::SC_SPILL;
} else if (isReadOnlySeg()) {
executed_as = Enums::SC_READONLY;
} else if (isKernArgSeg()) {
executed_as = Enums::SC_KERNARG;
} else if (isArgSeg()) {
executed_as = Enums::SC_ARG;
}
}
const std::string& opcode() const { return _opcode; }
protected:
const std::string opcode;
const std::string _opcode;
std::string disassembly;
int _instNum;
int _instAddr;
int srcVecOperands;
int dstVecOperands;
int srcVecDWORDs;
int dstVecDWORDs;
/**
* Identifier of the immediate post-dominator instruction.
*/
@@ -262,9 +279,9 @@ class KernelLaunchStaticInst : public GPUStaticInst
KernelLaunchStaticInst() : GPUStaticInst("kernel_launch")
{
setFlag(Nop);
setFlag(KernelLaunch);
setFlag(MemSync);
setFlag(Scalar);
setFlag(Acquire);
setFlag(SystemScope);
setFlag(GlobalSegment);
}
@@ -277,11 +294,14 @@ class KernelLaunchStaticInst : public GPUStaticInst
void
generateDisassembly() override
{
disassembly = opcode;
disassembly = _opcode;
}
int getNumOperands() override { return 0; }
bool isCondRegister(int operandIndex) override { return false; }
bool isFlatScratchRegister(int opIdx) override { return false; }
// return true if the Execute mask is explicitly used as a source
// register operand
bool isExecMaskRegister(int opIdx) override { return false; }
bool isScalarRegister(int operandIndex) override { return false; }
bool isVectorRegister(int operandIndex) override { return false; }
bool isSrcOperand(int operandIndex) override { return false; }
@@ -296,7 +316,6 @@ class KernelLaunchStaticInst : public GPUStaticInst
int numDstRegOperands() override { return 0; }
int numSrcRegOperands() override { return 0; }
bool isValid() const override { return true; }
int instSize() const override { return 0; }
};

View File

@@ -74,7 +74,6 @@ namespace X86ISA
allocationPolicy = p->allocationPolicy;
hasMemSidePort = false;
accessDistance = p->accessDistance;
clock = p->clk_domain->clockPeriod();
tlb.assign(size, TlbEntry());
@@ -624,8 +623,8 @@ namespace X86ISA
{
bool delayedResponse;
return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
latency);
return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse,
false, latency);
}
void
@@ -803,13 +802,13 @@ namespace X86ISA
}
/*
* We now know the TLB lookup outcome (if it's a hit or a miss), as well
* as the TLB access latency.
* We now know the TLB lookup outcome (if it's a hit or a miss), as
* well as the TLB access latency.
*
* We create and schedule a new TLBEvent which will help us take the
* appropriate actions (e.g., update TLB on a hit, send request to lower
* level TLB on a miss, or start a page walk if this was the last-level
* TLB)
* appropriate actions (e.g., update TLB on a hit, send request to
* lower level TLB on a miss, or start a page walk if this was the
* last-level TLB)
*/
TLBEvent *tlb_event =
new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
@@ -823,15 +822,15 @@ namespace X86ISA
assert(tlb_event);
DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
curTick() + this->ticks(hitLatency));
curTick() + cyclesToTicks(Cycles(hitLatency)));
schedule(tlb_event, curTick() + this->ticks(hitLatency));
schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency)));
}
GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
PacketPtr _pkt)
: Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
outcome(tlb_outcome), pkt(_pkt)
GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr,
tlbOutcome tlb_outcome, PacketPtr _pkt)
: Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
outcome(tlb_outcome), pkt(_pkt)
{
}
@@ -848,7 +847,8 @@ namespace X86ISA
bool storeCheck = flags & (StoreCheck << FlagShift);
// Do paging protection checks.
bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
bool inUser
= (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
@@ -874,10 +874,9 @@ namespace X86ISA
* The latter calls handelHit with TLB miss as tlbOutcome.
*/
void
GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
PacketPtr pkt)
GpuTLB::handleTranslationReturn(Addr virt_page_addr,
tlbOutcome tlb_outcome, PacketPtr pkt)
{
assert(pkt);
Addr vaddr = pkt->req->getVaddr();
@@ -890,15 +889,18 @@ namespace X86ISA
TlbEntry *local_entry, *new_entry;
if (tlb_outcome == TLB_HIT) {
DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
vaddr);
local_entry = sender_state->tlbEntry;
} else {
DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
vaddr);
// We are returning either from a page walk or from a hit at a lower
// TLB level. The senderState should be "carrying" a pointer to the
// correct TLBEntry.
/**
* We are returning either from a page walk or from a hit at a
* lower TLB level. The senderState should be "carrying" a pointer
* to the correct TLBEntry.
*/
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
@@ -1024,7 +1026,8 @@ namespace X86ISA
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
tlb_event->updateOutcome(PAGE_WALK);
schedule(tlb_event, curTick() + ticks(missLatency2));
schedule(tlb_event,
curTick() + cyclesToTicks(Cycles(missLatency2)));
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
@@ -1095,7 +1098,7 @@ namespace X86ISA
return virtPageAddr;
}
/*
/**
* recvTiming receives a coalesced timing request from a TLBCoalescer
* and it calls issueTLBLookup()
* It only rejects the packet if we have exceeded the max
@@ -1145,9 +1148,11 @@ namespace X86ISA
DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
"%#x\n", vaddr);
// We are returning either from a page walk or from a hit at a lower
// TLB level. The senderState should be "carrying" a pointer to the
// correct TLBEntry.
/**
* We are returning either from a page walk or from a hit at a
* lower TLB level. The senderState should be "carrying" a pointer
* to the correct TLBEntry.
*/
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
@@ -1267,8 +1272,8 @@ namespace X86ISA
} else {
// If this was a prefetch, then do the normal thing if it
// was a successful translation. Otherwise, send an empty
// TLB entry back so that it can be figured out as empty and
// handled accordingly.
// TLB entry back so that it can be figured out as empty
// and handled accordingly.
if (pte) {
DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
pte->paddr);
@@ -1343,7 +1348,7 @@ namespace X86ISA
assert(virt_page_addr == tlb_event->getTLBEventVaddr());
tlb_event->updateOutcome(MISS_RETURN);
tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
tlb->schedule(tlb_event, curTick()+tlb->clockPeriod());
return true;
}
@@ -1393,8 +1398,8 @@ namespace X86ISA
tmp_access_info.sumDistance = 0;
tmp_access_info.meanDistance = 0;
ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
tmp_access_info));
ret = TLBFootprint.insert(
AccessPatternTable::value_type(virt_page_addr, tmp_access_info));
bool first_page_access = ret.second;
@@ -1428,74 +1433,74 @@ namespace X86ISA
page_stat_file = simout.create(name().c_str())->stream();
// print header
*page_stat_file << "page,max_access_distance,mean_access_distance, "
<< "stddev_distance" << std::endl;
*page_stat_file
<< "page,max_access_distance,mean_access_distance, "
<< "stddev_distance" << std::endl;
}
// update avg. reuse distance footprint
AccessPatternTable::iterator iter, iter_begin, iter_end;
unsigned int sum_avg_reuse_distance_per_page = 0;
// iterate through all pages seen by this TLB
for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
iter->second.accessesPerPage;
for (auto &iter : TLBFootprint) {
sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance /
iter.second.accessesPerPage;
if (accessDistance) {
unsigned int tmp = iter->second.localTLBAccesses[0];
unsigned int tmp = iter.second.localTLBAccesses[0];
unsigned int prev = tmp;
for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
if (i) {
tmp = prev + 1;
}
prev = iter->second.localTLBAccesses[i];
prev = iter.second.localTLBAccesses[i];
// update the localTLBAccesses value
// with the actual differece
iter->second.localTLBAccesses[i] -= tmp;
iter.second.localTLBAccesses[i] -= tmp;
// compute the sum of AccessDistance per page
// used later for mean
iter->second.sumDistance +=
iter->second.localTLBAccesses[i];
iter.second.sumDistance +=
iter.second.localTLBAccesses[i];
}
iter->second.meanDistance =
iter->second.sumDistance / iter->second.accessesPerPage;
iter.second.meanDistance =
iter.second.sumDistance / iter.second.accessesPerPage;
// compute std_dev and max (we need a second round because we
// need to know the mean value
unsigned int max_distance = 0;
unsigned int stddev_distance = 0;
for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
unsigned int tmp_access_distance =
iter->second.localTLBAccesses[i];
iter.second.localTLBAccesses[i];
if (tmp_access_distance > max_distance) {
max_distance = tmp_access_distance;
}
unsigned int diff =
tmp_access_distance - iter->second.meanDistance;
tmp_access_distance - iter.second.meanDistance;
stddev_distance += pow(diff, 2);
}
stddev_distance =
sqrt(stddev_distance/iter->second.accessesPerPage);
sqrt(stddev_distance/iter.second.accessesPerPage);
if (page_stat_file) {
*page_stat_file << std::hex << iter->first << ",";
*page_stat_file << std::hex << iter.first << ",";
*page_stat_file << std::dec << max_distance << ",";
*page_stat_file << std::dec << iter->second.meanDistance
*page_stat_file << std::dec << iter.second.meanDistance
<< ",";
*page_stat_file << std::dec << stddev_distance;
*page_stat_file << std::endl;
}
// erase the localTLBAccesses array
iter->second.localTLBAccesses.clear();
iter.second.localTLBAccesses.clear();
}
}

View File

@@ -69,26 +69,7 @@ namespace X86ISA
uint32_t configAddress;
// TLB clock: will inherit clock from shader's clock period in terms
// of nuber of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the python
// config files.
int clock;
public:
// clock related functions ; maps to-and-from Simulation ticks and
// object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick
ticks(int numCycles) const
{
return (Tick)clock * numCycles;
}
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
typedef X86GPUTLBParams Params;
GpuTLB(const Params *p);
~GpuTLB();

View File

@@ -0,0 +1,467 @@
/*
* Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
/**
* @file
* HSAQueueEntry is the simulator's internal representation of an
* AQL queue entry (task). It encapsulates all of the relevant info
* about a task, which is gathered from various runtime data
* structures including: the AQL MQD, the AQL packet, and the code
* object.
*/
#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#include <array>
#include <bitset>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "gpu-compute/kernel_code.hh"
class HSAQueueEntry
{
  public:
    /**
     * Build a task from the raw AQL dispatch packet and the kernel's AMD
     * kernel code object.
     *
     * @param kernel_name   human-readable kernel name (for stats/debug)
     * @param queue_id      ID of the AQL queue this packet came from
     * @param dispatch_id   unique ID for this dispatch
     * @param disp_pkt      raw AQL packet; must point at an
     *                      _hsa_dispatch_packet_t, from which the grid and
     *                      workgroup sizes, LDS/private segment sizes,
     *                      kernarg address, and completion signal are read
     * @param akc           kernel code descriptor; supplies register counts
     *                      and the initial-register-state enable bits
     * @param host_pkt_addr host-side address of the dispatch packet
     * @param code_addr     base address of the kernel machine code
     */
    HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
                  int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
                  Addr host_pkt_addr, Addr code_addr)
        : kernName(kernel_name),
          _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
          _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
          numVgprs(akc->workitem_vgpr_count),
          numSgprs(akc->wavefront_sgpr_count),
          _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
          _hostDispPktAddr(host_pkt_addr),
          _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
                            ->completion_signal),
          codeAddress(code_addr),
          kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
          // -1 marks "invalidate not started yet"; see _outstandingInvs below
          _outstandingInvs(-1), _outstandingWbs(0),
          _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                   group_segment_size),
          _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                          private_segment_size),
          _contextId(0), _wgId{{ 0, 0, 0 }},
          _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
          _globalWgId(0), dispatchComplete(false)
    {
        initialVgprState.reset();
        initialSgprState.reset();

        // per-dimension WG count is the grid size over the WG size,
        // rounded up; the product gives the total WGs in the dispatch
        for (int i = 0; i < MAX_DIM; ++i) {
            _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
            _numWgTotal *= _numWg[i];
        }

        // extract the initial SGPR/VGPR enable bits from the kernel code
        parseKernelCode(akc);
    }

    const std::string&
    kernelName() const
    {
        return kernName;
    }

    /** workgroup size in the given dimension (0=x, 1=y, 2=z) */
    int
    wgSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgSize[dim];
    }

    /** grid size in the given dimension (0=x, 1=y, 2=z) */
    int
    gridSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _gridSize[dim];
    }

    /** number of VGPRs required per work-item */
    int
    numVectorRegs() const
    {
        return numVgprs;
    }

    /** number of SGPRs required per wavefront */
    int
    numScalarRegs() const
    {
        return numSgprs;
    }

    uint32_t
    queueId() const
    {
        return _queueId;
    }

    int
    dispatchId() const
    {
        return _dispatchId;
    }

    /** raw pointer to the AQL dispatch packet */
    void*
    dispPktPtr()
    {
        return dispPkt;
    }

    /** host-side address of the AQL dispatch packet */
    Addr
    hostDispPktAddr() const
    {
        return _hostDispPktAddr;
    }

    /** address of this dispatch's completion signal */
    Addr
    completionSignal() const
    {
        return _completionSignal;
    }

    /** base address of the kernel's machine code */
    Addr
    codeAddr() const
    {
        return codeAddress;
    }

    /** base address of the kernel argument segment */
    Addr
    kernargAddr() const
    {
        return kernargAddress;
    }

    /** LDS (group segment) size in bytes requested by the dispatch */
    int
    ldsSize() const
    {
        return _ldsSize;
    }

    /** private (scratch) segment size in bytes per work-item */
    int privMemPerItem() const { return _privMemPerItem; }

    int
    contextId() const
    {
        return _contextId;
    }

    /** true once every workgroup in the grid has been dispatched */
    bool
    dispComplete() const
    {
        return dispatchComplete;
    }

    /** ID (in WG units) of the next workgroup to dispatch, per dimension */
    int
    wgId(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgId[dim];
    }

    void
    wgId(int dim, int val)
    {
        assert(dim < MAX_DIM);
        _wgId[dim] = val;
    }

    /** flat (linearized) ID of the next workgroup to dispatch */
    int
    globalWgId() const
    {
        return _globalWgId;
    }

    void
    globalWgId(int val)
    {
        _globalWgId = val;
    }

    /** number of workgroups in the given dimension */
    int
    numWg(int dim) const
    {
        assert(dim < MAX_DIM);
        return _numWg[dim];
    }

    /** called when a workgroup of this kernel finishes execution */
    void
    notifyWgCompleted()
    {
        ++_numWgCompleted;
    }

    int
    numWgCompleted() const
    {
        return _numWgCompleted;
    }

    int
    numWgTotal() const
    {
        return _numWgTotal;
    }

    /**
     * Advance the workgroup ID to the next WG to be dispatched, in
     * x-major order (x fastest, then y, then z). Once the last WG in
     * the grid has been handed out, dispatchComplete is set.
     *
     * Note: _wgId[dim] is compared in work-item units
     * (wgId * wgSize vs. gridSize) when deciding to wrap a dimension.
     */
    void
    markWgDispatch()
    {
        ++_wgId[0];
        ++_globalWgId;

        if (wgId(0) * wgSize(0) >= gridSize(0)) {
            _wgId[0] = 0;
            ++_wgId[1];

            if (wgId(1) * wgSize(1) >= gridSize(1)) {
                _wgId[1] = 0;
                ++_wgId[2];

                if (wgId(2) * wgSize(2) >= gridSize(2)) {
                    dispatchComplete = true;
                }
            }
        }
    }

    int
    numWgAtBarrier() const
    {
        return numWgArrivedAtBarrier;
    }

    /** whether VGPR init field `bit` (see VectorRegInitFields) is enabled */
    bool vgprBitEnabled(int bit) const
    {
        return initialVgprState.test(bit);
    }

    /** whether SGPR init field `bit` (see ScalarRegInitFields) is enabled */
    bool sgprBitEnabled(int bit) const
    {
        return initialSgprState.test(bit);
    }

    /**
     * Host-side addr of the amd_queue_t on which
     * this task was queued.
     */
    Addr hostAMDQueueAddr;

    /**
     * Keep a copy of the AMD HSA queue because we
     * need info from some of its fields to initialize
     * register state.
     */
    _amd_queue_t amdQueue;

    // the maximum number of dimensions for a grid or workgroup
    const static int MAX_DIM = 3;

    /* getter */
    int
    outstandingInvs() {
        return _outstandingInvs;
    }

    /**
     * Whether invalidate has started or finished -1 is the
     * initial value indicating inv has not started for the
     * kernel.
     */
    bool
    isInvStarted()
    {
        return (_outstandingInvs != -1);
    }

    /**
     * update the number of pending invalidate requests
     *
     * val: negative to decrement, positive to increment
     */
    void
    updateOutstandingInvs(int val)
    {
        _outstandingInvs += val;
        assert(_outstandingInvs >= 0);
    }

    /**
     * Forcefully change the state to be inv done.
     */
    void
    markInvDone()
    {
        _outstandingInvs = 0;
    }

    /**
     * Is invalidate done?
     */
    bool
    isInvDone() const
    {
        assert(_outstandingInvs >= 0);
        return (_outstandingInvs == 0);
    }

    int
    outstandingWbs() const
    {
        return _outstandingWbs;
    }

    /**
     * Update the number of pending writeback requests.
     *
     * val: negative to decrement, positive to increment
     */
    void
    updateOutstandingWbs(int val)
    {
        _outstandingWbs += val;
        assert(_outstandingWbs >= 0);
    }

  private:
    /**
     * Record which pieces of initial register state the kernel code
     * object asks to be preloaded. The bit positions used here are
     * defined by ScalarRegInitFields/VectorRegInitFields and determine
     * the order in which state is placed into contiguous registers.
     */
    void
    parseKernelCode(AMDKernelCode *akc)
    {
        /** set the enable bits for the initial SGPR state */
        initialSgprState.set(PrivateSegBuf,
                             akc->enable_sgpr_private_segment_buffer);
        initialSgprState.set(DispatchPtr,
                             akc->enable_sgpr_dispatch_ptr);
        initialSgprState.set(QueuePtr,
                             akc->enable_sgpr_queue_ptr);
        initialSgprState.set(KernargSegPtr,
                             akc->enable_sgpr_kernarg_segment_ptr);
        initialSgprState.set(DispatchId,
                             akc->enable_sgpr_dispatch_id);
        initialSgprState.set(FlatScratchInit,
                             akc->enable_sgpr_flat_scratch_init);
        initialSgprState.set(PrivateSegSize,
                             akc->enable_sgpr_private_segment_size);
        initialSgprState.set(GridWorkgroupCountX,
                             akc->enable_sgpr_grid_workgroup_count_x);
        initialSgprState.set(GridWorkgroupCountY,
                             akc->enable_sgpr_grid_workgroup_count_y);
        initialSgprState.set(GridWorkgroupCountZ,
                             akc->enable_sgpr_grid_workgroup_count_z);
        initialSgprState.set(WorkgroupIdX,
                             akc->enable_sgpr_workgroup_id_x);
        initialSgprState.set(WorkgroupIdY,
                             akc->enable_sgpr_workgroup_id_y);
        initialSgprState.set(WorkgroupIdZ,
                             akc->enable_sgpr_workgroup_id_z);
        initialSgprState.set(WorkgroupInfo,
                             akc->enable_sgpr_workgroup_info);
        initialSgprState.set(PrivSegWaveByteOffset,
                             akc->enable_sgpr_private_segment_wave_byte_offset);

        /**
         * set the enable bits for the initial VGPR state. the
         * workitem Id in the X dimension is always initialized.
         */
        initialVgprState.set(WorkitemIdX, true);
        initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y);
        initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z);
    }

    // name of the kernel associated with the AQL entry
    std::string kernName;
    // workgroup Size (3 dimensions)
    std::array<int, MAX_DIM> _wgSize;
    // grid Size (3 dimensions)
    std::array<int, MAX_DIM> _gridSize;
    // total number of VGPRs per work-item
    int numVgprs;
    // total number of SGPRs per wavefront
    int numSgprs;
    // id of AQL queue in which this entry is placed
    uint32_t _queueId;
    int _dispatchId;
    // raw AQL packet pointer
    void *dispPkt;
    // host-side addr of the dispatch packet
    Addr _hostDispPktAddr;
    // address of the completion signal for this dispatch
    // (copied from the AQL packet's completion_signal field)
    Addr _completionSignal;
    // base address of the raw machine code
    Addr codeAddress;
    // base address of the kernel args
    Addr kernargAddress;
    /**
     * Number of outstanding invs for the kernel.
     * values:
     *  -1: initial value, invalidate has not started for the kernel
     *  0: 1)-1->0, about to start (a transient state, added in the same cycle)
     *     2)+1->0, all inv requests are finished, i.e., invalidate done
     *  ?: positive value, indicating the number of pending inv requests
     */
    int _outstandingInvs;
    /**
     * Number of outstanding wbs for the kernel
     * values:
     *  0: 1)initial value, flush has not started for the kernel
     *     2)+1->0: all wb requests are finished, i.e., flush done
     *  ?: positive value, indicating the number of pending wb requests
     */
    int _outstandingWbs;
    // LDS (group segment) bytes requested by the dispatch packet
    int _ldsSize;
    // private (scratch) segment bytes per work-item
    int _privMemPerItem;
    int _contextId;
    // ID of the next workgroup to dispatch, per dimension
    std::array<int, MAX_DIM> _wgId;
    // number of workgroups per dimension (grid size / WG size, rounded up)
    std::array<int, MAX_DIM> _numWg;
    int _numWgTotal;
    int numWgArrivedAtBarrier;
    // The number of completed work groups
    int _numWgCompleted;
    int _globalWgId;
    // set once markWgDispatch() has handed out the last WG in the grid
    bool dispatchComplete;

    // which initial-register-state fields the kernel code enables;
    // bit positions defined by VectorRegInitFields/ScalarRegInitFields
    std::bitset<NumVectorInitFields> initialVgprState;
    std::bitset<NumScalarInitFields> initialSgprState;
};
#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__

View File

@@ -0,0 +1,193 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Anthony Gutierrez
*/
#ifndef __GPU_COMPUTE_KERNEL_CODE_HH__
#define __GPU_COMPUTE_KERNEL_CODE_HH__
#include <bitset>
#include <cstdint>
/**
* these enums represent the indices into the
* initialRegState bitfields in HsaKernelInfo.
* each bit specifies whether or not the
* particular piece of state that the bit
* corresponds to should be initialized into
* the VGPRs/SGPRs. the order in which the
* fields are placed matters, as all enabled
* pieces of state will be initialized into
* contiguous registers in the same order
* as their position in the bitfield - which
* is specified in the HSA ABI.
*/
/**
 * Bit positions of the scalar-register initialization fields. The
 * numeric order is significant: enabled fields are preloaded into
 * contiguous SGPRs in exactly this order, as specified by the HSA ABI.
 * Do not renumber.
 */
enum ScalarRegInitFields : int
{
    PrivateSegBuf = 0,
    DispatchPtr = 1,
    QueuePtr = 2,
    KernargSegPtr = 3,
    DispatchId = 4,
    FlatScratchInit = 5,
    PrivateSegSize = 6,
    GridWorkgroupCountX = 7,
    GridWorkgroupCountY = 8,
    GridWorkgroupCountZ = 9,
    WorkgroupIdX = 10,
    WorkgroupIdY = 11,
    WorkgroupIdZ = 12,
    WorkgroupInfo = 13,
    PrivSegWaveByteOffset = 14,
    NumScalarInitFields = 15 // count sentinel; sizes the SGPR bitset
};
/**
 * Bit positions of the vector-register initialization fields (work-item
 * IDs per dimension). Ordering is ABI-defined; do not renumber. The X
 * work-item ID is always enabled (see HSAQueueEntry::parseKernelCode).
 */
enum VectorRegInitFields : int
{
    WorkitemIdX = 0,
    WorkitemIdY = 1,
    WorkitemIdZ = 2,
    NumVectorInitFields = 3 // count sentinel; sizes the VGPR bitset
};
/**
 * In-memory mirror of the AMD kernel code object (amd_kernel_code_t)
 * that precedes a kernel's machine code in the code object. Field
 * order, widths, and bitfield packing define the binary layout this
 * struct is read from — do NOT reorder, resize, or insert members.
 * NOTE(review): assumes the host compiler lays out these uint32_t
 * bitfields exactly as the code-object format does — confirm on any
 * new toolchain/ABI.
 */
struct AMDKernelCode
{
    uint32_t amd_kernel_code_version_major;
    uint32_t amd_kernel_code_version_minor;
    uint16_t amd_machine_kind;
    uint16_t amd_machine_version_major;
    uint16_t amd_machine_version_minor;
    uint16_t amd_machine_version_stepping;
    // byte offset (can be negative) from this struct to the kernel entry
    int64_t kernel_code_entry_byte_offset;
    int64_t kernel_code_prefetch_byte_offset;
    uint64_t kernel_code_prefetch_byte_size;
    uint64_t max_scratch_backing_memory_byte_size;

    /**
     * The fields below are used to set program settings for
     * compute shaders. Here they are primarily used to setup
     * initial register state. See the following for full details
     * about kernel launch, state initialization, and the AMD kernel
     * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
     * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
     * #initial-kernel-register-state
     */

    // the 32b below here represent the fields of
    // the COMPUTE_PGM_RSRC1 register
    uint32_t granulated_workitem_vgpr_count : 6;
    uint32_t granulated_wavefront_sgpr_count : 4;
    uint32_t priority : 2;
    uint32_t float_mode_round_32 : 2;
    uint32_t float_mode_round_16_64 : 2;
    uint32_t float_mode_denorm_32 : 2;
    uint32_t float_mode_denorm_16_64 : 2;
    uint32_t priv : 1;
    uint32_t enable_dx10_clamp : 1;
    uint32_t debug_mode : 1;
    uint32_t enable_ieee_mode : 1;
    uint32_t bulky : 1;
    uint32_t cdbg_user : 1;
    uint32_t compute_pgm_rsrc1_reserved : 6;
    // end COMPUTE_PGM_RSRC1 register

    // the 32b below here represent the fields of
    // the COMPUTE_PGM_RSRC2 register
    uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
    uint32_t user_sgpr_count : 5;
    uint32_t enable_trap_handler : 1;
    uint32_t enable_sgpr_workgroup_id_x : 1;
    uint32_t enable_sgpr_workgroup_id_y : 1;
    uint32_t enable_sgpr_workgroup_id_z : 1;
    uint32_t enable_sgpr_workgroup_info : 1;
    uint32_t enable_vgpr_workitem_id_y : 1;
    uint32_t enable_vgpr_workitem_id_z : 1;
    uint32_t enable_exception_address_watch : 1;
    uint32_t enable_exception_memory_violation : 1;
    uint32_t granulated_lds_size : 9;
    uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
    uint32_t enable_exception_fp_denormal_source : 1;
    uint32_t enable_exception_ieee_754_fp_division_by_zero : 1;
    uint32_t enable_exception_ieee_754_fp_overflow : 1;
    uint32_t enable_exception_ieee_754_fp_underflow : 1;
    uint32_t enable_exception_ieee_754_fp_inexact : 1;
    uint32_t enable_exception_int_divide_by_zero : 1;
    uint32_t compute_pgm_rsrc2_reserved : 1;
    // end COMPUTE_PGM_RSRC2

    // the 32b below here represent the fields of
    // KERNEL_CODE_PROPERTIES; the enable_sgpr_* bits are consumed by
    // HSAQueueEntry::parseKernelCode to build the initial SGPR state
    uint32_t enable_sgpr_private_segment_buffer : 1;
    uint32_t enable_sgpr_dispatch_ptr : 1;
    uint32_t enable_sgpr_queue_ptr : 1;
    uint32_t enable_sgpr_kernarg_segment_ptr : 1;
    uint32_t enable_sgpr_dispatch_id : 1;
    uint32_t enable_sgpr_flat_scratch_init : 1;
    uint32_t enable_sgpr_private_segment_size : 1;
    uint32_t enable_sgpr_grid_workgroup_count_x : 1;
    uint32_t enable_sgpr_grid_workgroup_count_y : 1;
    uint32_t enable_sgpr_grid_workgroup_count_z : 1;
    uint32_t kernel_code_properties_reserved1 : 6;
    uint32_t enable_ordered_append_gds : 1;
    uint32_t private_element_size : 2;
    uint32_t is_ptr64 : 1;
    uint32_t is_dynamic_callstack : 1;
    uint32_t is_debug_enabled : 1;
    uint32_t is_xnack_enabled : 1;
    uint32_t kernel_code_properties_reserved2 : 9;
    // end KERNEL_CODE_PROPERTIES

    uint32_t workitem_private_segment_byte_size;
    uint32_t workgroup_group_segment_byte_size;
    uint32_t gds_segment_byte_size;
    uint64_t kernarg_segment_byte_size;
    uint32_t workgroup_fbarrier_count;
    // register counts read by HSAQueueEntry (numSgprs/numVgprs)
    uint16_t wavefront_sgpr_count;
    uint16_t workitem_vgpr_count;
    uint16_t reserved_vgpr_first;
    uint16_t reserved_vgpr_count;
    uint16_t reserved_sgpr_first;
    uint16_t reserved_sgpr_count;
    uint16_t debug_wavefront_private_segment_offset_sgpr;
    uint16_t debug_private_segment_buffer_sgpr;
    uint8_t kernarg_segment_alignment;
    uint8_t group_segment_alignment;
    uint8_t private_segment_alignment;
    uint8_t wavefront_size;
    int32_t call_convention;
    uint8_t reserved[12];
    uint64_t runtime_loader_kernel_symbol;
    uint64_t control_directives[16];
};
#endif // __GPU_COMPUTE_KERNEL_CODE_HH__

View File

@@ -210,8 +210,8 @@ LdsState::processPacket(PacketPtr packet)
parent->loadBusLength();
// delay for accessing the LDS
Tick processingTime =
parent->shader->ticks(bankConflicts * bankConflictPenalty) +
parent->shader->ticks(busLength);
parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
parent->cyclesToTicks(Cycles(busLength));
// choose (delay + last packet in queue) or (now + delay) as the time to
// return this
Tick doneAt = earliestReturnTime() + processingTime;

View File

@@ -41,7 +41,6 @@
#include <utility>
#include <vector>
#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
@@ -50,8 +49,8 @@
class ComputeUnit;
/**
* this represents a slice of the overall LDS, intended to be associated with an
* individual workgroup
* this represents a slice of the overall LDS, intended to be associated with
* an individual workgroup
*/
class LdsChunk
{
@@ -71,7 +70,8 @@ class LdsChunk
read(const uint32_t index)
{
fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
"chunk");
T *p0 = (T *) (&(chunk.at(index)));
return *p0;
}
@@ -84,7 +84,8 @@ class LdsChunk
write(const uint32_t index, const T value)
{
fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
"chunk");
T *p0 = (T *) (&(chunk.at(index)));
*p0 = value;
}
@@ -203,14 +204,16 @@ class LdsState: public ClockedObject
protected:
// the lds reference counter
// The key is the workgroup ID and dispatch ID
// The value is the number of wavefronts that reference this LDS, as
// wavefronts are launched, the counter goes up for that workgroup and when
// they return it decreases, once it reaches 0 then this chunk of the LDS is
// returned to the available pool. However,it is deallocated on the 1->0
// transition, not whenever the counter is 0 as it always starts with 0 when
// the workgroup asks for space
/**
* the lds reference counter
* The key is the workgroup ID and dispatch ID
* The value is the number of wavefronts that reference this LDS, as
* wavefronts are launched, the counter goes up for that workgroup and when
* they return it decreases, once it reaches 0 then this chunk of the LDS
* is returned to the available pool. However,it is deallocated on the 1->0
* transition, not whenever the counter is 0 as it always starts with 0
* when the workgroup asks for space
*/
std::unordered_map<uint32_t,
std::unordered_map<uint32_t, int32_t>> refCounter;
@@ -356,22 +359,41 @@ class LdsState: public ClockedObject
const uint32_t size)
{
if (chunkMap.find(dispatchId) != chunkMap.end()) {
fatal_if(
panic_if(
chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
"duplicate workgroup ID asking for space in the LDS "
"did[%d] wgid[%d]", dispatchId, wgId);
}
fatal_if(bytesAllocated + size > maximumSize,
"request would ask for more space than is available");
if (bytesAllocated + size > maximumSize) {
return nullptr;
} else {
bytesAllocated += size;
bytesAllocated += size;
auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
panic_if(!value.second, "was unable to allocate a new chunkMap");
chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
// make an entry for this workgroup
refCounter[dispatchId][wgId] = 0;
// make an entry for this workgroup
refCounter[dispatchId][wgId] = 0;
return &chunkMap[dispatchId][wgId];
return &chunkMap[dispatchId][wgId];
}
}
/*
* return pointer to lds chunk for wgid
*/
LdsChunk *
getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
{
fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
"fetch for unknown dispatch ID did[%d]", dispatchId);
fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
"fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
wgId, dispatchId);
return &chunkMap[dispatchId][wgId];
}
bool

View File

@@ -33,6 +33,7 @@
#include "gpu-compute/local_memory_pipeline.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
@@ -62,24 +63,31 @@ LocalMemPipeline::exec()
bool accessVrf = true;
Wavefront *w = nullptr;
if ((m) && (m->isLoad() || m->isAtomicRet())) {
if ((m) && m->latency.rdy() && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
accessVrf =
w->computeUnit->vrf[w->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m,
VrfAccessType::WRITE);
accessVrf = w->computeUnit->vrf[w->simdId]->
canScheduleWriteOperandsFromLoad(w, m);
}
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
|| computeUnit->wfWait.at(m->pipeId).rdy())) {
computeUnit->locMemToVrfBus.rdy()
&& (computeUnit->shader->coissue_return
|| computeUnit->vectorSharedMemUnit.rdy())) {
lmReturnedRequests.pop();
w = m->wavefront();
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
if (m->isLoad() || m->isAtomicRet()) {
w->computeUnit->vrf[w->simdId]->
scheduleWriteOperandsFromLoad(w, m);
}
// Decrement outstanding request count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@@ -96,7 +104,7 @@ LocalMemPipeline::exec()
// Mark write bus busy for appropriate amount of time
computeUnit->locMemToVrfBus.set(m->time);
if (computeUnit->shader->coissue_return == 0)
w->computeUnit->wfWait.at(m->pipeId).set(m->time);
w->computeUnit->vectorSharedMemUnit.set(m->time);
}
// If pipeline has executed a local memory instruction
@@ -114,6 +122,13 @@ LocalMemPipeline::exec()
}
}
void
LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
gpuDynInst->setAccessTime(curTick());
lmIssuedRequests.push(gpuDynInst);
}
void
LocalMemPipeline::regStats()
{

View File

@@ -58,10 +58,11 @@ class LocalMemPipeline
LocalMemPipeline(const ComputeUnitParams *params);
void init(ComputeUnit *cu);
void exec();
std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
void issueRequest(GPUDynInstPtr gpuDynInst);
bool
isLMRespFIFOWrRdy() const
{

View File

@@ -39,34 +39,62 @@
#include <memory>
#include "base/logging.hh"
#include "sim/clocked_object.hh"
class GPUDynInst;
typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
typedef std::bitset<std::numeric_limits<unsigned long long>::digits>
VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
enum InstMemoryHop : int {
Initiate = 0,
CoalsrSend = 1,
CoalsrRecv = 2,
GMEnqueue = 3,
Complete = 4,
InstMemoryHopMax = 5
};
enum BlockMemoryHop : int {
BlockSend = 0,
BlockRecv = 1
};
class WaitClass
{
public:
WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
void init(uint64_t *_tcnt, uint32_t _numStages=0)
WaitClass() : nxtAvail(0), lookAheadAvail(0), clockedObject(nullptr) { }
WaitClass(ClockedObject *_clockedObject, uint64_t _numStages=0)
: nxtAvail(0), lookAheadAvail(0), clockedObject(_clockedObject),
numStages(_numStages) { }
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
{
tcnt = _tcnt;
clockedObject = _clockedObject;
numStages = _numStages;
}
void set(uint32_t i)
void set(uint64_t i)
{
fatal_if(nxtAvail > *tcnt,
fatal_if(nxtAvail > clockedObject->clockEdge(),
"Can't allocate resource because it is busy!!!");
nxtAvail = *tcnt + i;
nxtAvail = clockedObject->clockEdge() + i;
}
void preset(uint32_t delay)
void preset(uint64_t delay)
{
lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
lookAheadAvail = std::max(lookAheadAvail, delay +
(clockedObject->clockEdge()) - numStages);
}
bool rdy(Cycles cycles = Cycles(0)) const
{
return clockedObject->clockEdge(cycles) >= nxtAvail;
}
bool prerdy() const
{
return clockedObject->clockEdge() >= lookAheadAvail;
}
bool rdy() const { return *tcnt >= nxtAvail; }
bool prerdy() const { return *tcnt >= lookAheadAvail; }
private:
// timestamp indicating when resource will be available
@@ -75,11 +103,11 @@ class WaitClass
// pending uses of the resource (when there is a cycle gap between
// rdy() and set()
uint64_t lookAheadAvail;
// current timestamp
uint64_t *tcnt;
// clockedObject for current timestamp
ClockedObject *clockedObject;
// number of stages between checking if a resource is ready and
// setting the resource's utilization
uint32_t numStages;
uint64_t numStages;
};
class Float16
@@ -93,7 +121,7 @@ class Float16
Float16(float x)
{
uint32_t ai = *(uint32_t *)&x;
uint32_t ai = *(reinterpret_cast<uint32_t *>(&x));
uint32_t s = (ai >> 31) & 0x1;
uint32_t exp = (ai >> 23) & 0xff;
@@ -139,7 +167,7 @@ class Float16
val1 |= (exp << 23);
val1 |= (mant << 13);
return *(float*)&val1;
return *(reinterpret_cast<float *>(&val1));
}
};

View File

@@ -33,8 +33,8 @@
#include "gpu-compute/pool_manager.hh"
PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
: _minAllocation(minAlloc), _poolSize(poolSize)
PoolManager::PoolManager(const PoolManagerParams *p)
: SimObject(p), _minAllocation(p->min_alloc), _poolSize(p->pool_size)
{
assert(poolSize > 0);
assert(_poolSize > 0);
}

View File

@@ -38,11 +38,15 @@
#include <cstdint>
#include <string>
#include "params/PoolManager.hh"
#include "sim/sim_object.hh"
// Pool Manager Logic
class PoolManager
class PoolManager : public SimObject
{
public:
PoolManager(uint32_t minAlloc, uint32_t poolSize);
PoolManager(const PoolManagerParams *p);
virtual ~PoolManager() { _poolSize = 0; }
uint32_t minAllocation() { return _minAllocation; }
virtual std::string printRegion() = 0;
virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;

View File

@@ -0,0 +1,223 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#include "gpu-compute/register_file.hh"
#include <sstream>
#include <string>
#include "base/intmath.hh"
#include "base/logging.hh"
#include "debug/GPURF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterFile.hh"
/**
 * Construct a register file for one SIMD unit. The configured register
 * count must be even and the SIMD id non-negative; both are checked
 * fatally at construction time. All registers start out free.
 */
RegisterFile::RegisterFile(const RegisterFileParams *p)
    : SimObject(p), simdId(p->simd_id), _numRegs(p->num_regs)
{
    fatal_if(simdId < 0, "Illegal SIMD id for VRF");
    fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");

    // one busy flag per physical register, all initially clear
    busy.assign(_numRegs, 0);
}
// Nothing to release: all state is owned by value members.
RegisterFile::~RegisterFile()
{
}
// Attach this register file to the compute unit that owns it; done
// after construction because the CU is built separately from its RFs.
void
RegisterFile::setParent(ComputeUnit *_computeUnit)
{
    computeUnit = _computeUnit;
}
/**
 * Render the scoreboard as a debug string: "Busy: " followed by one
 * 0/1 digit per physical register, terminated by a newline.
 */
std::string
RegisterFile::dump() const
{
    std::stringstream out;
    out << "Busy: ";
    for (const auto &reg_state : busy) {
        out << (int)reg_state;
    }
    out << "\n";
    return out.str();
}
// Scoreboard functions

// Default scoreboard check: the base register file models no operand
// hazards, so operands are always reported ready.
bool
RegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
    return true;
}
// Whether physical register idx is currently marked busy in the
// scoreboard. at() bounds-checks the index.
bool
RegisterFile::regBusy(int idx) const
{
    return busy.at(idx);
}
// Set or clear the busy bit for physical register regIdx.
void
RegisterFile::markReg(int regIdx, bool value)
{
    DPRINTF(GPURF, "SIMD[%d] markReg(): physReg[%d] = %d\n",
            simdId, regIdx, (int)value);
    busy.at(regIdx) = value;
}
// Schedule an event `delay` ticks from now that clears the busy bit
// for physReg regIdx.
// NOTE(review): the event is heap-allocated with new and never stored;
// presumably it is constructed with an auto-delete flag so it frees
// itself after process() — confirm in the event's constructor.
void
RegisterFile::enqRegFreeEvent(uint32_t regIdx, uint64_t delay)
{
    DPRINTF(GPURF, "SIMD[%d] enqRegFreeEvent physReg[%d] at %llu\n",
            simdId, regIdx, curTick() + delay);
    schedule(new MarkRegFreeScbEvent(this, regIdx),
             curTick() + delay);
}
// Schedule an event `delay` ticks from now that sets the busy bit for
// physReg regIdx. Same allocation pattern as enqRegFreeEvent (see the
// auto-delete note there).
void
RegisterFile::enqRegBusyEvent(uint32_t regIdx, uint64_t delay)
{
    DPRINTF(GPURF, "SIMD[%d] enqRegBusyEvent physReg[%d] at %llu\n",
            simdId, regIdx, curTick() + delay);
    schedule(new MarkRegBusyScbEvent(this, regIdx),
             curTick() + delay);
}
// Schedule functions
//
// Base-class defaults for the SCH-stage hooks: always ready, and the
// schedule*/read-complete operations are no-ops. Derived register files
// override these to model operand read/write timing.
bool
RegisterFile::canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
void
RegisterFile::scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
{
}
bool
RegisterFile::canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
void
RegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
}
// Called by returning loads; default accepts the incoming writes.
bool
RegisterFile::canScheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
void
RegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
{
}
// Default: treat all operand reads as instantly complete.
bool
RegisterFile::operandReadComplete(Wavefront *w, GPUDynInstPtr ii)
{
    return true;
}
// Exec functions

// Invoked every cycle by the compute unit; the base class models no
// detailed RF timing, so this is a no-op.
void
RegisterFile::exec()
{
}
// Notification that a wave is executing an instruction; base class has
// no writeback events to schedule.
void
RegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
}
// Params factory hook: instantiate the base RegisterFile SimObject.
RegisterFile*
RegisterFileParams::create()
{
    return new RegisterFile(this);
}
// Events

// Mark a register as free in the scoreboard/busy vector
void
RegisterFile::MarkRegFreeScbEvent::process()
{
    rf->markReg(regIdx, false);
}
// Mark a register as busy in the scoreboard/busy vector
void
RegisterFile::MarkRegBusyScbEvent::process()
{
    rf->markReg(regIdx, true);
}
// Debug hook invoked when an instruction is dispatched; base class does
// nothing.
void
RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
{
}
// Register the per-RF statistics with the stats framework. Reads/writes
// are counted in DWORDs; the SRAM stats count bank activations, which
// can differ from DWORD counts when the RF is banked.
void
RegisterFile::regStats()
{
    registerReads
        .name(name() + ".register_reads")
        .desc("Total number of DWORDs read from register file")
        ;
    registerWrites
        .name(name() + ".register_writes")
        .desc("Total number of DWORDS written to register file")
        ;
    sramReads
        .name(name() + ".sram_reads")
        .desc("Total number of register file bank SRAM activations for reads")
        ;
    sramWrites
        .name(name() + ".sram_writes")
        .desc("Total number of register file bank SRAM activations for writes")
        ;
}

View File

@@ -0,0 +1,171 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#ifndef __REGISTER_FILE_HH__
#define __REGISTER_FILE_HH__
#include <limits>
#include <vector>
#include "base/statistics.hh"
#include "base/types.hh"
#include "gpu-compute/misc.hh"
#include "sim/sim_object.hh"
class ComputeUnit;
class Shader;
class PoolManager;
class Wavefront;
struct RegisterFileParams;
// Abstract Register File
// This register file class can be inherited from to create both
// scalar and vector register files.
class RegisterFile : public SimObject
{
  public:
    RegisterFile(const RegisterFileParams *p);
    virtual ~RegisterFile();
    // late-bind the owning compute unit (set after construction)
    virtual void setParent(ComputeUnit *_computeUnit);
    // number of physical registers in this file
    int numRegs() const { return _numRegs; }
    virtual void regStats() override;
    // State functions
    // Scoreboard functions
    virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
    virtual bool regBusy(int idx) const;
    virtual void markReg(int regIdx, bool value);
    // Abstract Register Event: base for events that flip a single
    // register's scoreboard bit. AutoDelete, so process() is the last
    // thing that runs on an instance.
    class RegisterEvent : public Event
    {
      protected:
        RegisterFile *rf;
        int regIdx;
      public:
        RegisterEvent(RegisterFile *_rf, int _regIdx)
            : rf(_rf), regIdx(_regIdx) { setFlags(AutoDelete); }
    };
    // Register Event to mark a register as free in the scoreboard/busy vector
    class MarkRegFreeScbEvent : public RegisterEvent
    {
      public:
        MarkRegFreeScbEvent(RegisterFile *_rf, int _regIdx)
            : RegisterEvent(_rf, _regIdx) { }
        void process();
    };
    // Register Event to mark a register as busy in the scoreboard/busy vector
    class MarkRegBusyScbEvent : public RegisterEvent
    {
      public:
        MarkRegBusyScbEvent(RegisterFile *_rf, int _regIdx)
            : RegisterEvent(_rf, _regIdx) { }
        void process();
    };
    // Schedule an event to mark a register as free/busy in
    // the scoreboard/busy vector. Delay is already in Ticks
    virtual void enqRegFreeEvent(uint32_t regIdx, uint64_t delay);
    virtual void enqRegBusyEvent(uint32_t regIdx, uint64_t delay);
    // Schedule functions
    // The following functions are called by the SCH stage when attempting
    // to move a wave from the readyList to the schList.
    // canSchedule* checks if the RF is ready to provide operands for
    // the instruction, while schedule* requests the RF to begin reading
    // and writing of operands. Calling schedule* may only occur
    // immediately after canSchedule* was called and returned True
    virtual bool canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual bool canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual void scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
    virtual void scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
    // The following function is called to check if all operands
    // have been read for the given instruction
    virtual bool operandReadComplete(Wavefront *w, GPUDynInstPtr ii);
    // The following two functions are only called by returning loads to
    // check if the register file can support the incoming writes
    virtual bool canScheduleWriteOperandsFromLoad(Wavefront *w,
                                                  GPUDynInstPtr ii);
    // Queue the register writes. Assumes canScheduleWriteOperandsFromLoad
    // was called immediately prior and returned True
    virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
                                               GPUDynInstPtr ii);
    // ExecRF is invoked every cycle by the compute unit and may be
    // used to model detailed timing of the register file.
    virtual void exec();
    // Called to inform RF that an instruction is executing
    // to schedule events for writeback, etc., as needed
    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
    // Debug functions
    virtual std::string dump() const;
    virtual void dispatchInstruction(GPUDynInstPtr ii);
  protected:
    // owning compute unit; valid only after setParent() has been called
    ComputeUnit* computeUnit;
    // id of the SIMD unit this register file belongs to
    int simdId;
    // flag indicating if a register is busy
    std::vector<bool> busy;
    // number of registers in this register file
    int _numRegs;
    // Stats
    // Total number of register reads, incremented once per DWORD per thread
    Stats::Scalar registerReads;
    // Total number of register writes, incremented once per DWORD per thread
    Stats::Scalar registerWrites;
    // Number of register file SRAM activations for reads.
    // The register file may be implemented with multiple SRAMs. This stat
    // tracks how many times the SRAMs are accessed for reads.
    Stats::Scalar sramReads;
    // Number of register file SRAM activations for writes
    Stats::Scalar sramWrites;
};
#endif // __REGISTER_FILE_HH__

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Mark Wyse
*/
#include "gpu-compute/register_manager.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPURename.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/static_register_manager_policy.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterManager.hh"
// Construct the register manager with its per-SIMD SRF/VRF pool managers
// and instantiate the requested mapping/allocation policy. Only the
// "static" policy is implemented; any other setting is a fatal config
// error.
RegisterManager::RegisterManager(const RegisterManagerParams *p)
    : SimObject(p), srfPoolMgrs(p->srf_pool_managers),
      vrfPoolMgrs(p->vrf_pool_managers)
{
    if (p->policy == "static") {
        policy = new StaticRegisterManagerPolicy();
    } else {
        fatal("Unimplemented Register Manager Policy");
    }
}
// Destroy the pool managers owned by this register manager.
// NOTE(review): `policy` is allocated in the constructor but not deleted
// here — presumably freed elsewhere or leaked at simulator teardown;
// confirm before relying on it.
RegisterManager::~RegisterManager()
{
    for (auto mgr : srfPoolMgrs) {
        delete mgr;
    }
    for (auto mgr : vrfPoolMgrs) {
        delete mgr;
    }
}
// Per-cycle tick: delegate to the active policy.
void
RegisterManager::exec()
{
    policy->exec();
}
// Late-bind the owning compute unit and propagate it to the policy.
// Also sanity-checks that each register file's size is a multiple of
// its pool manager's minimum allocation granularity, so allocation can
// never leave an unusable partial chunk.
void
RegisterManager::setParent(ComputeUnit *cu)
{
    computeUnit = cu;
    policy->setParent(computeUnit);
    for (int i = 0; i < srfPoolMgrs.size(); i++) {
        // Fixed error text: this check is against the SRF, not the VRF.
        fatal_if(computeUnit->srf[i]->numRegs() %
                 srfPoolMgrs[i]->minAllocation(),
                 "Min SGPR allocation is not multiple of SRF size\n");
    }
    for (int i = 0; i < vrfPoolMgrs.size(); i++) {
        // Fixed typo in error text: "VGPG" -> "VGPR".
        fatal_if(computeUnit->vrf[i]->numRegs() %
                 vrfPoolMgrs[i]->minAllocation(),
                 "Min VGPR allocation is not multiple of VRF size\n");
    }
}
// compute mapping for vector register: thin delegate to the policy,
// which owns the virtual-to-physical translation scheme.
int
RegisterManager::mapVgpr(Wavefront* w, int vgprIndex)
{
    return policy->mapVgpr(w, vgprIndex);
}
// compute mapping for scalar register
int
RegisterManager::mapSgpr(Wavefront* w, int sgprIndex)
{
    return policy->mapSgpr(w, sgprIndex);
}
// check if we can allocate registers: nWfs waves each demanding
// demandPerWf registers on the given SIMD unit.
bool
RegisterManager::canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
{
    return policy->canAllocateVgprs(simdId, nWfs, demandPerWf);
}
bool
RegisterManager::canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
{
    return policy->canAllocateSgprs(simdId, nWfs, demandPerWf);
}
// allocate registers: reserve vectorDemand VGPRs and scalarDemand SGPRs
// for wave w (delegated to the policy).
void
RegisterManager::allocateRegisters(Wavefront *w, int vectorDemand,
                                   int scalarDemand)
{
    policy->allocateRegisters(w, vectorDemand, scalarDemand);
}
// release every register still held by the wave (delegated to the policy)
void
RegisterManager::freeRegisters(Wavefront* w)
{
    policy->freeRegisters(w);
}
// register the policy's statistics with the stats framework
void
RegisterManager::regStats()
{
    policy->regStats();
}
// Params factory hook for the RegisterManager SimObject.
RegisterManager*
RegisterManagerParams::create()
{
    return new RegisterManager(this);
}

View File

@@ -0,0 +1,94 @@
/*
* Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#ifndef __REGISTER_MANAGER_HH__
#define __REGISTER_MANAGER_HH__
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "gpu-compute/pool_manager.hh"
#include "gpu-compute/register_manager_policy.hh"
#include "sim/sim_object.hh"
#include "sim/stats.hh"
class ComputeUnit;
class Wavefront;
struct RegisterManagerParams;
/*
* Rename stage.
*/
class RegisterManager : public SimObject
{
  public:
    RegisterManager(const RegisterManagerParams* params);
    ~RegisterManager();
    // late-bind the owning compute unit and validate RF/pool sizing
    void setParent(ComputeUnit *cu);
    // per-cycle tick, delegated to the policy
    void exec();
    // Stats related variables and methods
    void regStats();
    // lookup virtual to physical register translation
    int mapVgpr(Wavefront* w, int vgprIndex);
    int mapSgpr(Wavefront* w, int sgprIndex);
    // check if we can allocate registers
    bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf);
    bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf);
    // allocate registers
    void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand);
    // free all registers used by the WF
    void freeRegisters(Wavefront *w);
    // per-SIMD pool managers for the scalar and vector register files;
    // owned (deleted) by this object
    std::vector<PoolManager*> srfPoolMgrs;
    std::vector<PoolManager*> vrfPoolMgrs;
  private:
    // active mapping/allocation policy (selected by the "policy" param)
    RegisterManagerPolicy *policy;
    ComputeUnit *computeUnit;
    // NOTE(review): _name appears unused in the visible code — confirm
    std::string _name;
};
#endif // __REGISTER_MANAGER_HH__

View File

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#ifndef __REGISTER_MANAGER_POLICY_HH__
#define __REGISTER_MANAGER_POLICY_HH__
#include <cstdint>
class ComputeUnit;
class HSAQueueEntry;
class Wavefront;
/**
* Register Manager Policy abstract class
*
* A Register Manager Policy implements all of the functionality
* of the Register Manager, including register mapping, allocation,
* and freeing. Different policies may be implemented that support
* different architectures or different methods of mapping and
* allocation.
*/
class RegisterManagerPolicy
{
  public:
    // bind the compute unit this policy manages registers for
    virtual void setParent(ComputeUnit *_cu) { cu = _cu; }
    // Execute: called by RenameStage::execute()
    virtual void exec() = 0;
    // provide virtual to physical register mapping
    virtual int mapVgpr(Wavefront* w, int vgprIndex) = 0;
    virtual int mapSgpr(Wavefront* w, int sgprIndex) = 0;
    // check if requested number of vector registers can be allocated
    virtual bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) = 0;
    // check if requested number of scalar registers can be allocated
    // machine ISA only
    virtual bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) = 0;
    // allocate vector registers and reserve from register pool
    virtual void allocateRegisters(Wavefront *w, int vectorDemand,
                                   int scalarDemand) = 0;
    // free all remaining registers held by specified WF
    virtual void freeRegisters(Wavefront *w) = 0;
    // stats
    virtual void regStats() = 0;
  protected:
    // compute unit this policy serves; set via setParent()
    ComputeUnit *cu;
};
#endif // __REGISTER_MANAGER_POLICY_HH__

View File

@@ -36,6 +36,7 @@
#include <vector>
#include "base/logging.hh"
#include "gpu-compute/scheduling_policy.hh"
#include "gpu-compute/wavefront.hh"

View File

@@ -0,0 +1,153 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos
*/
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
// Construct the scalar memory pipeline. queueSize bounds both the
// issued-request FIFO and the number of inflight loads/stores. The
// compute unit pointer is bound later via init().
ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), queueSize(p->scalar_mem_queue_size),
    inflightStores(0), inflightLoads(0)
{
}
// Late initialization: bind the owning compute unit and derive this
// pipeline's name from it (used for debug/stats output).
void
ScalarMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    _name = computeUnit->name() + ".ScalarMemPipeline";
}
// Per-cycle tick of the scalar memory pipeline. First retires (at most)
// one returned load/store whose data has arrived, then issues (at most)
// one new request toward the DTLB/scalar cache, subject to the inflight
// limits.
void
ScalarMemPipeline::exec()
{
    // find oldest scalar request whose data has arrived; loads are
    // preferred over stores. m is nullptr when both return FIFOs are
    // empty.
    GPUDynInstPtr m = !returnedLoads.empty() ? returnedLoads.front() :
        !returnedStores.empty() ? returnedStores.front() : nullptr;
    Wavefront *w = nullptr;
    bool accessSrf = true;
    // check the SRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();
        accessSrf =
            w->computeUnit->srf[w->simdId]->
                canScheduleWriteOperandsFromLoad(w, m);
    }
    // The queue-empty test guards the m-> dereferences below: if either
    // FIFO is non-empty, m is its front and therefore non-null.
    if ((!returnedStores.empty() || !returnedLoads.empty()) &&
        m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
        accessSrf &&
        (computeUnit->shader->coissue_return ||
         computeUnit->scalarMemUnit.rdy())) {
        w = m->wavefront();
        // queue the SRF writeback for loads and returning atomics
        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->srf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }
        m->completeAcc(m);
        // atomics are tracked in the load FIFO/counter
        if (m->isLoad() || m->isAtomic()) {
            returnedLoads.pop();
            assert(inflightLoads > 0);
            --inflightLoads;
        } else {
            returnedStores.pop();
            assert(inflightStores > 0);
            --inflightStores;
        }
        // Decrement outstanding register count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic()) {
            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
                                             m->time, -1);
        }
        if (m->isLoad() || m->isAtomic()) {
            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
                                             m->time, -1);
        }
        // Mark write bus busy for appropriate amount of time
        computeUnit->scalarMemToSrfBus.set(m->time);
        if (!computeUnit->shader->coissue_return)
            w->computeUnit->scalarMemUnit.set(m->time);
    }
    // If pipeline has executed a global memory instruction
    // execute global memory packets and issue global
    // memory packets to DTLB
    if (!issuedRequests.empty()) {
        GPUDynInstPtr mp = issuedRequests.front();
        // respect the inflight cap; leave the request queued and retry
        // next cycle if the corresponding counter is saturated
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= queueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else {
            if (inflightStores >= queueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }
        mp->initiateAcc(mp);
        issuedRequests.pop();
        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
    }
}
// No statistics are defined for this pipeline yet.
void
ScalarMemPipeline::regStats()
{
}

View File

@@ -0,0 +1,114 @@
/*
* Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos
*/
#ifndef __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
#define __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
#include <queue>
#include <string>
#include "gpu-compute/misc.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
/*
* @file scalar_memory_pipeline.hh
*
* The scalar memory pipeline issues global memory packets
* from the scalar ALU to the DTLB and L1 Scalar Data Cache.
* The exec() method of the memory packet issues
* the packet to the DTLB if there is space available in the return fifo.
* This exec() method also retires previously issued loads and stores that have
* returned from the memory sub-system.
*/
class ComputeUnit;
class ScalarMemPipeline
{
  public:
    ScalarMemPipeline(const ComputeUnitParams *params);
    // bind the owning compute unit; must be called before exec()
    void init(ComputeUnit *cu);
    // per-cycle tick: retire returned requests, issue new ones
    void exec();
    // accessors for the request/response FIFOs (used by the CU and
    // cache-response path)
    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
    std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return returnedStores; }
    std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return returnedLoads; }
    // true if the load-response FIFO can accept another entry
    bool
    isGMLdRespFIFOWrRdy() const
    {
        return returnedLoads.size() < queueSize;
    }
    // true if the store-response FIFO can accept another entry
    bool
    isGMStRespFIFOWrRdy() const
    {
        return returnedStores.size() < queueSize;
    }
    // true if the request FIFO can accept pendReqs more entries
    bool
    isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
    {
        return (issuedRequests.size() + pendReqs) < queueSize;
    }
    const std::string &name() const { return _name; }
    void regStats();
  private:
    ComputeUnit *computeUnit;
    // derived from the CU's name in init()
    std::string _name;
    // capacity shared by all three FIFOs and the inflight counters
    int queueSize;
    // Counters to track and limit the inflight scalar loads and stores
    // generated by this memory pipeline.
    int inflightStores;
    int inflightLoads;
    // Scalar Memory Request FIFO: all global memory scalar requests
    // are issued to this FIFO from the scalar memory pipelines
    std::queue<GPUDynInstPtr> issuedRequests;
    // Scalar Store Response FIFO: all responses of global memory
    // scalar stores are sent to this FIFO from L1 Scalar Data Cache
    std::queue<GPUDynInstPtr> returnedStores;
    // Scalar Load Response FIFO: all responses of global memory
    // scalar loads are sent to this FIFO from L1 Scalar Data Cache
    std::queue<GPUDynInstPtr> returnedLoads;
};
#endif // __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__

View File

@@ -0,0 +1,164 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#include "gpu-compute/scalar_register_file.hh"
#include "base/logging.hh"
#include "debug/GPUSRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ScalarRegisterFile.hh"
// Construct the scalar register file; backing storage is one 32-bit
// DWORD per physical register, zero-initialized.
ScalarRegisterFile::ScalarRegisterFile(const ScalarRegisterFileParams *p)
    : RegisterFile(p)
{
    regFile.resize(numRegs(), 0);
}
// Check the scoreboard for every scalar source operand of ii. Operands
// wider than one DWORD occupy getOperandSize(i)/4 consecutive SGPRs,
// each of which must be free. Returns false (and bumps the wave's
// blocked-dependency counter) on the first busy register.
bool
ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
            int sgprIdx = ii->getRegisterIndex(i, ii);
            // number of consecutive DWORD registers the operand spans
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int j = 0; j < nRegs; ++j) {
                int pSgpr =
                    computeUnit->registerManager->mapSgpr(w, sgprIdx + j);
                if (regBusy(pSgpr)) {
                    // NOTE(review): the outer condition requires a src
                    // operand, so the dst branch fires only for operands
                    // flagged as both src and dst (read-modify-write) —
                    // confirm against GPUDynInst operand semantics.
                    if (ii->isDstOperand(i)) {
                        w->numTimesBlockedDueWAXDependencies++;
                    } else if (ii->isSrcOperand(i)) {
                        DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                                w->wfDynId, ii->disassemble(), pSgpr);
                        w->numTimesBlockedDueRAWDependencies++;
                    }
                    return false;
                }
            } // nRegs
        } // isScalar
    } // operand
    return true;
}
// Mark every physical SGPR written by ii as busy in the scoreboard.
// The busy bits are cleared later, either by waveExecuteInst() (ALU
// ops) or scheduleWriteOperandsFromLoad() (memory returns).
void
ScalarRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
    // iterate over all register destination operands
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
            int sgprIdx = ii->getRegisterIndex(i, ii);
            // number of consecutive DWORD registers the operand spans
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int j = 0; j < nRegs; ++j) {
                int physReg =
                    computeUnit->registerManager->mapSgpr(w, sgprIdx + j);
                // mark the destination scalar register as busy
                markReg(physReg, true);
            }
        }
    }
}
// Account for register traffic of an executing instruction and, for
// non-memory ops, schedule the events that free the destination SGPRs
// after the scalar pipeline latency. Memory ops (loads, atomics, mem
// syncs) are excluded here: their destinations are freed when the
// memory response returns (scheduleWriteOperandsFromLoad).
void
ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
    // count DWORDs read for every scalar source operand
    for (int i = 0; i < ii->getNumOperands(); i++) {
        if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
            int DWORDs = ii->getOperandSize(i) <= 4 ? 1
                : ii->getOperandSize(i) / 4;
            registerReads += DWORDs;
        }
    }
    if (!ii->isLoad() && !(ii->isAtomic() || ii->isMemSync())) {
        // destinations become available one scalar-pipe latency from now
        Cycles delay(computeUnit->scalarPipeLength());
        Tick tickDelay = computeUnit->cyclesToTicks(delay);
        for (int i = 0; i < ii->getNumOperands(); i++) {
            if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
                int sgprIdx = ii->getRegisterIndex(i, ii);
                int nRegs = ii->getOperandSize(i) <= 4 ? 1
                    : ii->getOperandSize(i) / 4;
                for (int j = 0; j < nRegs; j++) {
                    int physReg = computeUnit->registerManager->
                        mapSgpr(w, sgprIdx + j);
                    enqRegFreeEvent(physReg, tickDelay);
                }
                registerWrites += nRegs;
            }
        }
    }
}
// Writeback path for returning loads/atomics-with-return: schedule the
// destination SGPRs to be freed one clock period from now and count
// the written DWORDs.
void
ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
                                                  GPUDynInstPtr ii)
{
    // only memory ops that produce a register result come through here
    assert(ii->isLoad() || ii->isAtomicRet());
    for (int i = 0; i < ii->getNumOperands(); ++i) {
        if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
            int sgprIdx = ii->getRegisterIndex(i, ii);
            // number of consecutive DWORD registers the operand spans
            int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int j = 0; j < nRegs; ++j) {
                int physReg = computeUnit->registerManager->
                    mapSgpr(w, sgprIdx + j);
                enqRegFreeEvent(physReg, computeUnit->clockPeriod());
            }
            registerWrites += nRegs;
        }
    }
}
// Params factory hook: instantiate the scalar register file SimObject.
ScalarRegisterFile*
ScalarRegisterFileParams::create()
{
    return new ScalarRegisterFile(this);
}

View File

@@ -0,0 +1,104 @@
/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: John Kalamatianos,
* Mark Wyse
*/
#ifndef __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
#define __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
#include "arch/gpu_isa.hh"
#include "base/statistics.hh"
#include "base/trace.hh"
#include "base/types.hh"
#include "debug/GPUSRF.hh"
#include "gpu-compute/register_file.hh"
#include "gpu-compute/wavefront.hh"
struct ScalarRegisterFileParams;
// Scalar Register File
//
// Holds scalar (SGPR) state as a flat array of 32-bit registers and
// implements the operand scheduling interface inherited from
// RegisterFile. Presumably one instance exists per SIMD unit (indexed
// via ComputeUnit::srf elsewhere in this change) — confirm against
// ComputeUnit.
class ScalarRegisterFile : public RegisterFile
{
  public:
    using ScalarRegU32 = TheGpuISA::ScalarRegU32;

    ScalarRegisterFile(const ScalarRegisterFileParams *p);
    ~ScalarRegisterFile() { }

    // True when all of ii's scalar source operands for wave w are
    // available in the register file.
    virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
    // Reserve/record writes for ii's destination operands.
    virtual void scheduleWriteOperands(Wavefront *w,
                                       GPUDynInstPtr ii) override;
    // Schedule destination writes for data returning from a load.
    virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
                                               GPUDynInstPtr ii) override;
    // Bookkeeping performed when wave w executes instruction ii.
    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;

    void
    setParent(ComputeUnit *_computeUnit) override
    {
        RegisterFile::setParent(_computeUnit);
    }

    // Read a register that is writeable (e.g., a DST operand)
    ScalarRegU32&
    readWriteable(int regIdx)
    {
        return regFile[regIdx];
    }

    // Read a register that is not writeable (e.g., src operand)
    ScalarRegU32
    read(int regIdx) const
    {
        return regFile[regIdx];
    }

    // Write a register
    void
    write(int regIdx, ScalarRegU32 value)
    {
        regFile[regIdx] = value;
    }

    // Dump the contents of scalar register regIdx for wave wf under the
    // GPUSRF debug flag.
    void
    printReg(Wavefront *wf, int regIdx) const
    {
        DPRINTF(GPUSRF, "WF[%d][%d]: Id%d s[%d] = %#x\n", wf->simdId,
                wf->wfSlotId, wf->wfDynId, regIdx, regFile[regIdx]);
    }

  private:
    // Backing storage: one 32-bit entry per physical scalar register.
    std::vector<ScalarRegU32> regFile;
};
#endif // __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__

View File

@@ -33,24 +33,36 @@
#include "gpu-compute/schedule_stage.hh"
#include <unordered_set>
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
// Construct the schedule (SCH) stage for compute unit cu. The rendered
// diff left both the pre- and post-change constructors interleaved here
// (two signatures, two loop headers); this is the post-change version.
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
    : vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false)
{
    // One scheduler instance per execution resource on the CU.
    for (int j = 0; j < cu->numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    // One ordered queue of in-flight SCH waves per execution resource.
    schList.resize(cu->numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}
// Tear down the stage; drop all per-resource scheduler and schedule
// state. (The stale waveStatusList.clear() from the pre-change code is
// removed — that member no longer exists after this commit.)
ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}
void
@@ -59,56 +71,597 @@ ScheduleStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".ScheduleStage";
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
fatal_if(scheduler.size() != computeUnit->readyList.size(),
"Scheduler should have same number of entries as CU's readyList");
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
scheduler[j].bindList(&computeUnit->readyList[j]);
}
for (int j = 0; j < numSIMDs; ++j) {
waveStatusList.push_back(&computeUnit->waveStatusList[j]);
}
dispatchList = &computeUnit->dispatchList;
assert(computeUnit->numVectorGlobalMemUnits == 1);
assert(computeUnit->numVectorSharedMemUnits == 1);
}
void
ScheduleStage::arbitrate()
ScheduleStage::exec()
{
// iterate over all Memory pipelines
for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
if (dispatchList->at(j).first) {
Wavefront *waveToMemPipe = dispatchList->at(j).first;
// iterate over all execution pipelines
for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
if ((i != j) && (dispatchList->at(i).first)) {
Wavefront *waveToExePipe = dispatchList->at(i).first;
// if the two selected wavefronts are mapped to the same
// SIMD unit then they share the VRF
if (waveToMemPipe->simdId == waveToExePipe->simdId) {
int simdId = waveToMemPipe->simdId;
// Read VRF port arbitration:
// If there are read VRF port conflicts between the
// a memory and another instruction we drop the other
// instruction. We don't need to check for write VRF
// port conflicts because the memory instruction either
// does not need to write to the VRF (store) or will
// write to the VRF when the data comes back (load) in
// which case the arbiter of the memory pipes will
// resolve any conflicts
if (computeUnit->vrf[simdId]->
isReadConflict(waveToMemPipe->wfSlotId,
waveToExePipe->wfSlotId)) {
// FIXME: The "second" member variable is never
// used in the model. I am setting it to READY
// simply to follow the protocol of setting it
// when the WF has an instruction ready to issue
waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
.second = READY;
// Update readyList
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
// delete all ready wavefronts whose instruction buffers are now
// empty because the last instruction was executed
computeUnit->updateReadyList(j);
/**
* Remove any wave that already has an instruction present in SCH
* waiting for RF reads to complete. This prevents out of order
* execution within a wave.
*/
for (auto wIt = computeUnit->readyList.at(j).begin();
wIt != computeUnit->readyList.at(j).end();) {
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
*wIt = nullptr;
wIt = computeUnit->readyList.at(j).erase(wIt);
} else {
wIt++;
}
}
}
dispatchList->at(i).first = nullptr;
dispatchList->at(i).second = EMPTY;
break;
}
// Attempt to add another wave for each EXE type to schList queues
// VMEM resources are iterated first, effectively giving priority
// to VMEM over VALU for scheduling read of operands to the RFs.
// Scalar Memory are iterated after VMEM
// Iterate VMEM and SMEM
int firstMemUnit = computeUnit->firstMemUnit();
int lastMemUnit = computeUnit->lastMemUnit();
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
int readyListSize = computeUnit->readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListEmpty[j]++;
continue;
}
rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *w = scheduler[j].chooseWave();
if (!addToSchList(j, w)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
w->schCycles++;
addToSchListStalls[j]++;
}
}
// Iterate everything else
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
// skip the VMEM resources
if (j >= firstMemUnit && j <= lastMemUnit) {
continue;
}
int readyListSize = computeUnit->readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListEmpty[j]++;
continue;
}
rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
Wavefront *w = scheduler[j].chooseWave();
if (!addToSchList(j, w)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
w->schCycles++;
addToSchListStalls[j]++;
}
}
// At this point, the schList queue per EXE type may contain
// multiple waves, in order of age (oldest to youngest).
// Wave may be in RFBUSY, indicating they are waiting for registers
// to be read, or in RFREADY, indicating they are candidates for
// the dispatchList and execution
// Iterate schList queues and check if any of the waves have finished
// reading their operands, moving those waves to RFREADY status
checkRfOperandReadComplete();
// Fill the dispatch list with the oldest wave of each EXE type that
// is ready to execute
// Wave is picked if status in schList is RFREADY and it passes resource
// ready checks similar to those currently in SCB
fillDispatchList();
// Resource arbitration on waves in dispatchList
// Losing waves are re-inserted to the schList at a location determined
// by wave age
// Arbitrate access to the VRF->LDS bus
arbitrateVrfToLdsBus();
// Schedule write operations to the register files
scheduleRfDestOperands();
// Lastly, reserve resources for waves that are ready to execute.
reserveResources();
}
// Install wave w (possibly nullptr) and status s into the dispatchList
// entry of execution resource unitId.
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        Wavefront *w)
{
    auto &dispatchEntry = dispatchList->at(unitId);
    dispatchEntry.first = w;
    dispatchEntry.second = s;
}
// Attempt to reserve destination-operand write resources in the RFs for
// wave w's oldest instruction on execution resource exeType. On success
// the writes are scheduled and true is returned; on failure the denial
// is recorded in the RF-access stall statistics and false is returned.
bool
ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
{
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);

    // Scalar instructions never touch the VRF, so the VRF check is
    // vacuously true for them.
    bool vrfWrOk = ii->isScalar() ||
        computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
    bool srfWrOk =
        computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);

    if (vrfWrOk && srfWrOk) {
        if (!ii->isScalar()) {
            computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
        }
        computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
        return true;
    }

    // Denied: count the aggregate denial plus one bucket per RF that
    // refused the request.
    rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
    if (!srfWrOk) {
        rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
    }
    if (!vrfWrOk) {
        rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
    }

    // Increment stall counts for WF
    w->schStalls++;
    w->schRfAccessStalls++;
    return false;
}
// For every execution resource with a candidate wave on the
// dispatchList, try to reserve destination-operand write slots in the
// RFs. A wave that cannot schedule its writes is bounced back onto the
// schList (age-ordered) and its dispatchList slot is cleared.
void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
        if (!dispatchList->at(j).first) {
            continue;
        }
        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        Wavefront *w = dispatchList->at(j).first;
        if (!schedRfWrites(j, w)) {
            reinsertToSchList(j, w);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (w->instructionBuffer.front()->isFlat()) {
                assert(dispatchList->at(w->localMem).second == SKIP);
                doDispatchListTransition(w->localMem, EMPTY);
            }
        }
    }
}
// Attempt to admit wave w into the schList queue for execution resource
// exeType. Admission requires that the VRF (for vector instructions)
// and the SRF can both support reads of the instruction's source
// operands; on success the reads are scheduled immediately and the wave
// enters the queue in RFBUSY state. Returns false (and records stall
// statistics) if either RF refuses.
bool
ScheduleStage::addToSchList(int exeType, Wavefront *w)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrf = true;
    // Scalar instructions do not read the VRF, so only consult it for
    // vector instructions.
    if (!ii->isScalar()) {
        accessVrf =
            computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
    }
    bool accessSrf =
        computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());

        computeUnit->insertInPipeMap(w);
        wavesInSch.emplace(w->wfDynId);
        schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
        // A wave whose oldest instruction is a waitcnt sleeps until the
        // counts are satisfied.
        if (w->isOldestInstWaitcnt()) {
            w->setStatus(Wavefront::S_WAITCNT);
        }
        if (!ii->isScalar()) {
            computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
        }
        computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
    }
    return false;
}
// Re-insert wave w into the schList queue for exeType in RFREADY state,
// preserving age order (oldest wave, i.e. lowest wfDynId, at the front):
// w goes immediately before the first wave younger than it.
void
ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
{
    auto &queue = schList.at(exeType);
    auto pos = queue.begin();
    while (pos != queue.end() && pos->first->wfDynId < w->wfDynId) {
        ++pos;
    }
    queue.insert(pos, std::make_pair(w, RFREADY));
}
// Snapshot, for the next cycle, the readiness of the memory execution
// resources and the RF-to-memory buses. Results are cached in the
// stage's boolean flags and consumed by dispatchReady().
void
ScheduleStage::checkMemResources()
{
    // SRF->Global-Memory bus and the scalar memory pipe
    scalarMemBusRdy = computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1));
    scalarMemIssueRdy = computeUnit->scalarMemUnit.rdy(Cycles(1));

    // VRF->Global-Memory bus and the vector global memory pipe
    glbMemBusRdy = computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1));
    glbMemIssueRdy = computeUnit->vectorGlobalMemUnit.rdy(Cycles(1));

    // VRF->LDS bus and the vector shared (local) memory pipe
    locMemBusRdy = computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1));
    locMemIssueRdy = computeUnit->vectorSharedMemUnit.rdy(Cycles(1));
}
// Determine whether the execution resources required by wave w's oldest
// instruction will be available next cycle. Every unavailable resource
// increments its matching dispNrdyStalls bucket; true is returned only
// when all required resources are ready (counted under SCH_RDY).
bool
ScheduleStage::dispatchReady(Wavefront *w)
{
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }
    GPUDynInstPtr ii = w->instructionBuffer.front();

    if (ii->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isEndOfKernel()) {
        // EndPgm instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!ii->isScalar() && ii->isGlobalMem()) {
        // Vector Global Memory instruction
        // Requires issue slot, VRF->GM bus, coalescer entry, and room
        // in the outstanding-requests tracking; all misses are counted.
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (ii->isScalar() && ii->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->scalarMemoryPipe.
                isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
                                 w->scalarWrGmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isFlat()) {
        // Vector Flat memory instruction
        // FLAT needs BOTH the global and local memory resources to be
        // available, since the address may resolve to either space.
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit->localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        // panic() does not return; the return statement below it only
        // silences compiler warnings.
        panic("%s: unknown instr checked for readiness", ii->disassemble());
        return false;
    }
    dispNrdyStalls[SCH_RDY]++;
    return true;
}
// For each execution resource, promote the oldest RFREADY wave on its
// schList that also passes dispatchReady() to the dispatchList in
// EXREADY state (at most one per resource per cycle). Global memory
// operations acquire a coalescer token on promotion. Waves left behind
// accrue stall statistics.
void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit->numExeUnits(); j++) {
        assert(dispatchList->at(j).second == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first->
                        instructionBuffer.front();
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            schListToDispListStalls[j]++;
        } else {
            schListToDispList[j]++;
        }
    }
}
// Resolve FLAT-instruction contention for the VRF->LDS bus: when a GM
// pipe holds an EXREADY FLAT instruction, any wave occupying its
// associated LM pipe is evicted back to the schList, and the LM pipe is
// marked SKIP so EX knows it is consumed by the FLAT next cycle.
void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
    // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
    // and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit->firstMemUnit() + i;
        // get the wave in the dispatchList
        Wavefront *w = dispatchList->at(gm_exe_unit).first;
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
            w->instructionBuffer.front()->isFlat()) {
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (dispatchList->at(w->localMem).second == EXREADY) {
                reinsertToSchList(w->localMem,
                                  dispatchList->at(w->localMem).first);
                // Increment stall stats for LDS-VRF arbitration
                ldsBusArbStalls++;
                dispatchList->at(w->localMem).first->schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
            // that a Flat instruction is executing next cycle
            doDispatchListTransition(w->localMem, SKIP, w);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", w->localMem);
        }
    }
}
void
ScheduleStage::checkRfOperandReadComplete()
{
// Iterate the schList queues and check if operand reads
// have completed in the RFs. If so, mark the wave as ready for
// selection for dispatchList
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
for (auto &p : schList.at(j)) {
Wavefront *w = p.first;
assert(w);
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
w->schCycles++;
GPUDynInstPtr ii = w->instructionBuffer.front();
bool vrfRdy = true;
if (!ii->isScalar()) {
vrfRdy =
computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
}
bool srfRdy =
computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
bool operandsReady = vrfRdy && srfRdy;
if (operandsReady) {
DPRINTF(GPUSched,
"schList[%d]: WV[%d] operands ready for: %d: %s\n",
j, w->wfDynId, ii->seqNum(), ii->disassemble());
DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
j, w->wfDynId);
p.second = RFREADY;
} else {
DPRINTF(GPUSched,
"schList[%d]: WV[%d] operands not ready for: %d: %s\n",
j, w->wfDynId, ii->seqNum(), ii->disassemble());
// operands not ready yet, increment SCH stage stats
// aggregate to all wavefronts on the CU
p.second = RFBUSY;
// Increment stall stats
w->schStalls++;
w->schOpdNrdyStalls++;
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
}
if (!srfRdy) {
opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
}
}
}
@@ -116,33 +669,177 @@ ScheduleStage::arbitrate()
}
void
ScheduleStage::exec()
ScheduleStage::reserveResources()
{
for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
uint32_t readyListSize = computeUnit->readyList[j].size();
std::vector<bool> exeUnitReservations;
exeUnitReservations.resize(computeUnit->numExeUnits(), false);
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
continue;
}
for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
Wavefront *dispatchedWave = dispatchList->at(j).first;
if (dispatchedWave) {
DISPATCH_STATUS s = dispatchList->at(j).second;
if (s == EMPTY) {
continue;
} else if (s == EXREADY) {
// Wave is ready for execution
std::vector<int> execUnitIds =
dispatchedWave->reserveResources();
GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
dispatchList->at(j).first = waveToBeDispatched;
waveToBeDispatched->updateResources();
dispatchList->at(j).second = FILLED;
if (!ii->isScalar()) {
computeUnit->vrf[dispatchedWave->simdId]->
dispatchInstruction(ii);
}
computeUnit->srf[dispatchedWave->simdId]->
dispatchInstruction(ii);
waveStatusList[waveToBeDispatched->simdId]->at(
waveToBeDispatched->wfSlotId).second = BLOCKED;
std::stringstream ss;
for (auto id : execUnitIds) {
ss << id << " ";
}
DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
" Reserving ExeRes[ %s]\n",
j, dispatchedWave->simdId, dispatchedWave->wfDynId,
ii->seqNum(), ii->disassemble(), ss.str());
// mark the resources as reserved for this cycle
for (auto execUnitId : execUnitIds) {
panic_if(exeUnitReservations.at(execUnitId),
"Execution unit %d is reserved!!!\n"
"SIMD[%d] WV[%d]: %d: %s",
execUnitId, dispatchedWave->simdId,
dispatchedWave->wfDynId,
ii->seqNum(), ii->disassemble());
exeUnitReservations.at(execUnitId) = true;
}
assert(computeUnit->readyList[j].size() == readyListSize - 1);
// If wavefront::reserveResources reserved multiple resources,
// then we're executing a flat memory instruction. This means
// that we've reserved a global and local memory unit. Thus,
// we need to mark the latter execution unit as not available.
if (execUnitIds.size() > 1) {
int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
assert(dispatchList->at(lm_exec_unit).second == SKIP);
}
} else if (s == SKIP) {
// Shared Memory pipe reserved for FLAT instruction.
// Verify the GM pipe for this wave is ready to execute
// and the wave in the GM pipe is the same as the wave
// in the LM pipe
int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
dispatchedWave->wfDynId);
assert(dispatchList->at(gm_exec_unit).second == EXREADY);
}
}
}
// arbitrate over all shared resources among instructions being issued
// simultaneously
arbitrate();
}
// Called by ExecStage once wave w's instruction has executed, so the
// wave may re-enter SCH with its next instruction (see the wavesInSch
// filtering in exec()).
void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}
// Register all SCH-stage statistics (per-resource vectors, stall-reason
// buckets with human-readable subnames, and the LDS arbitration
// counter) with the stats framework.
void
ScheduleStage::regStats()
{
    rdyListNotEmpty
        .init(computeUnit->numExeUnits())
        .name(name() + ".rdy_list_not_empty")
        .desc("number of cycles one or more wave on ready list per "
              "execution resource")
        ;

    rdyListEmpty
        .init(computeUnit->numExeUnits())
        .name(name() + ".rdy_list_empty")
        .desc("number of cycles no wave on ready list per "
              "execution resource")
        ;

    addToSchListStalls
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_add_stalls")
        .desc("number of cycles a wave is not added to schList per "
              "execution resource when ready list is not empty")
        ;

    schListToDispList
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_to_disp_list")
        .desc("number of cycles a wave is added to dispatchList per "
              "execution resource")
        ;

    schListToDispListStalls
        .init(computeUnit->numExeUnits())
        .name(name() + ".sch_list_to_disp_list_stalls")
        .desc("number of cycles no wave is added to dispatchList per "
              "execution resource")
        ;

    // Operand Readiness Stall Cycles
    opdNrdyStalls
        .init(SCH_RF_OPD_NRDY_CONDITIONS)
        .name(name() + ".opd_nrdy_stalls")
        .desc("number of stalls in SCH due to operands not ready")
        ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    // dispatchReady Stall Cycles
    dispNrdyStalls
        .init(SCH_NRDY_CONDITIONS)
        .name(name() + ".disp_nrdy_stalls")
        .desc("number of stalls in SCH due to resource not ready")
        ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    // RF Access Stall Cycles
    rfAccessStalls
        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
        .name(name() + ".rf_access_stalls")
        .desc("number of stalls due to RF access denied")
        ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));

    // Stall cycles due to wave losing LDS bus arbitration
    ldsBusArbStalls
        .name(name() + ".lds_bus_arb_stalls")
        .desc("number of stalls due to VRF->LDS bus conflicts")
        ;
}

View File

@@ -34,6 +34,9 @@
#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__
#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -54,40 +57,169 @@ struct ComputeUnitParams;
// Schedule (SCH) stage of the GPU compute-unit pipeline: selects waves
// from the per-resource readyLists, schedules their register-file
// operand reads/writes, arbitrates shared buses, and fills the
// dispatchList consumed by the EX stage.
//
// The rendered diff interleaved the pre-change class members with the
// new ones; this is the post-change declaration with the removed
// members (old constructor, arbitrate(), numSIMDs, numMemUnits,
// waveStatusList) stripped out.
class ScheduleStage
{
  public:
    ScheduleStage(const ComputeUnitParams *params, ComputeUnit *cu);
    ~ScheduleStage();
    void init(ComputeUnit *cu);
    void exec();

    // Stats related variables and methods
    std::string name() { return _name; }

    // Resource-not-ready reasons counted by dispatchReady()
    enum SchNonRdyType {
        SCH_SCALAR_ALU_NRDY,
        SCH_VECTOR_ALU_NRDY,
        SCH_VECTOR_MEM_ISSUE_NRDY,
        SCH_VECTOR_MEM_BUS_BUSY_NRDY,
        SCH_VECTOR_MEM_COALESCER_NRDY,
        SCH_VECTOR_MEM_REQS_NRDY,
        SCH_CEDE_SIMD_NRDY,
        SCH_SCALAR_MEM_ISSUE_NRDY,
        SCH_SCALAR_MEM_BUS_BUSY_NRDY,
        SCH_SCALAR_MEM_FIFO_NRDY,
        SCH_LOCAL_MEM_ISSUE_NRDY,
        SCH_LOCAL_MEM_BUS_BUSY_NRDY,
        SCH_LOCAL_MEM_FIFO_NRDY,
        SCH_FLAT_MEM_ISSUE_NRDY,
        SCH_FLAT_MEM_BUS_BUSY_NRDY,
        SCH_FLAT_MEM_COALESCER_NRDY,
        SCH_FLAT_MEM_REQS_NRDY,
        SCH_FLAT_MEM_FIFO_NRDY,
        SCH_RDY,
        SCH_NRDY_CONDITIONS
    };
    // Operand-read-not-complete reasons counted by
    // checkRfOperandReadComplete()
    enum schopdnonrdytype_e {
        SCH_VRF_OPD_NRDY,
        SCH_SRF_OPD_NRDY,
        SCH_RF_OPD_NRDY,
        SCH_RF_OPD_NRDY_CONDITIONS
    };
    // RF access-denied reasons counted by addToSchList()/schedRfWrites()
    enum schrfaccessnonrdytype_e {
        SCH_VRF_RD_ACCESS_NRDY,
        SCH_VRF_WR_ACCESS_NRDY,
        SCH_SRF_RD_ACCESS_NRDY,
        SCH_SRF_WR_ACCESS_NRDY,
        SCH_RF_ACCESS_NRDY,
        SCH_RF_ACCESS_NRDY_CONDITIONS
    };

    void regStats();

    // Called by ExecStage to inform SCH of instruction execution
    void deleteFromSch(Wavefront *w);

    // Schedule List status
    enum SCH_STATUS
    {
        RFBUSY = 0, // RF busy reading operands
        RFREADY, // ready for exec
    };

  private:
    ComputeUnit *computeUnit;
    // Each execution resource will have its own
    // scheduler and a dispatch list
    std::vector<Scheduler> scheduler;
    // List of waves which will be dispatched to
    // each execution resource.
    // Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;

    // Stats

    // Number of cycles with empty (or not empty) readyList, per execution
    // resource, when the CU is active (not sleeping)
    Stats::Vector rdyListEmpty;
    Stats::Vector rdyListNotEmpty;
    // Number of cycles, per execution resource, when at least one wave
    // was on the readyList and picked by scheduler, but was unable to be
    // added to the schList, when the CU is active (not sleeping)
    Stats::Vector addToSchListStalls;
    // Number of cycles, per execution resource, when a wave is selected
    // as candidate for dispatchList from schList
    // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
    Stats::Vector schListToDispList;
    // Per execution resource stat, incremented once per cycle if no wave
    // was selected as candidate for dispatch and moved to dispatchList
    Stats::Vector schListToDispListStalls;
    // Number of times a wave is selected by the scheduler but cannot
    // be added to the schList due to register files not being able to
    // support reads or writes of operands. RF_ACCESS_NRDY condition is always
    // incremented if at least one read/write not supported, other
    // conditions are incremented independently from each other.
    Stats::Vector rfAccessStalls;
    // Number of times a wave is executing FLAT instruction and
    // forces another wave occupying its required local memory resource
    // to be deselected for execution, and placed back on schList
    Stats::Scalar ldsBusArbStalls;
    // Count of times VRF and/or SRF blocks waves on schList from
    // performing RFBUSY->RFREADY transition
    Stats::Vector opdNrdyStalls;
    // Count of times resource required for dispatch is not ready and
    // blocks wave in RFREADY state on schList from potentially moving
    // to dispatchList
    Stats::Vector dispNrdyStalls;

    std::string _name;

    // called by exec() to add a wave to schList if the RFs can support it
    bool addToSchList(int exeType, Wavefront *w);
    // re-insert a wave to schList if wave lost arbitration
    // wave is inserted such that age order (oldest to youngest) is preserved
    void reinsertToSchList(int exeType, Wavefront *w);
    // check waves in schList to see if RF reads complete
    void checkRfOperandReadComplete();

    // Cached execution-resource readiness, refreshed each cycle by
    // dispatchReady()/checkMemResources()
    bool vectorAluRdy;
    bool scalarAluRdy;
    bool scalarMemBusRdy;
    bool scalarMemIssueRdy;
    bool glbMemBusRdy;
    bool glbMemIssueRdy;
    bool locMemBusRdy;
    bool locMemIssueRdy;

    // check status of memory pipes and RF to Mem buses
    void checkMemResources();
    // resource ready check called by fillDispatchList
    bool dispatchReady(Wavefront *w);
    // pick waves from schList and populate dispatchList with one wave
    // per EXE resource type
    void fillDispatchList();
    // arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList
    void arbitrateVrfToLdsBus();
    // schedule destination operand writes to register files for waves in
    // dispatchList
    void scheduleRfDestOperands();
    // invoked by scheduleRfDestOperands to schedule RF writes for a wave
    bool schedRfWrites(int exeType, Wavefront *w);
    // reserve resources for waves surviving arbitration in dispatchList
    void reserveResources();

    void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                  Wavefront *w = nullptr);

    // Set tracking wfDynId for each wave present in schedule stage
    // Used to allow only one instruction per wave in schedule
    std::unordered_set<uint64_t> wavesInSch;

    // List of waves (one list per exe resource) that are in schedule
    // stage. Waves are added to this list after selected by scheduler
    // from readyList. Waves are removed from this list and placed on
    // dispatchList when status reaches SCHREADY.
    // Waves are kept ordered by age for each resource, always favoring
    // forward progress for the oldest wave.
    // The maximum number of waves per resource can be determined by either
    // the VRF/SRF availability or limits imposed by paremeters (to be added)
    // of the SCH stage or CU.
    std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
};
#endif // __SCHEDULE_STAGE_HH__

View File

@@ -33,29 +33,23 @@
#include "gpu-compute/scoreboard_check_stage.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ComputeUnit.hh"
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
: numSIMDs(p->num_SIMDs),
numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
numShrMemPipes(p->num_shared_mem_pipes),
vectorAluInstAvail(nullptr),
lastGlbMemSimd(-1),
lastShrMemSimd(-1), glbMemInstAvail(nullptr),
shrMemInstAvail(nullptr)
{
}
ScoreboardCheckStage::~ScoreboardCheckStage()
{
readyList.clear();
waveStatusList.clear();
shrMemInstAvail = nullptr;
glbMemInstAvail = nullptr;
}
void
@@ -64,102 +58,212 @@ ScoreboardCheckStage::init(ComputeUnit *cu)
computeUnit = cu;
_name = computeUnit->name() + ".ScoreboardCheckStage";
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
readyList.push_back(&computeUnit->readyList[unitId]);
}
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
}
vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
glbMemInstAvail= &computeUnit->glbMemInstAvail;
shrMemInstAvail= &computeUnit->shrMemInstAvail;
}
void
ScoreboardCheckStage::initStatistics()
ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
{
lastGlbMemSimd = -1;
lastShrMemSimd = -1;
*glbMemInstAvail = 0;
*shrMemInstAvail = 0;
for (int unitId = 0; unitId < numSIMDs; ++unitId)
vectorAluInstAvail->at(unitId) = false;
panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
"Instruction ready status %d is illegal!!!", rdyStatus);
stallCycles[rdyStatus]++;
}
void
ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
// It also returns the reason (in rdyStatus) if the instruction is not
// ready. Finally it sets the execution resource type (in exesResType)
// of the instruction, only if it ready.
bool
ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
int *exeResType, int wfSlot)
{
if (curWave->instructionBuffer.empty())
return;
/**
* The waitCnt checks have to be done BEFORE checking for Instruction
* buffer empty condition. Otherwise, it will result into a deadlock if
* the last instruction in the Instruction buffer is a waitCnt: after
* executing the waitCnt, the Instruction buffer would be empty and the
* ready check logic will exit BEFORE checking for wait counters being
* satisfied.
*/
// track which vector SIMD unit has at least one WV with a vector
// ALU as the oldest instruction in its Instruction buffer
vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
curWave->isOldestInstALU();
// track how many vector SIMD units have at least one WV with a
// vector Global memory instruction as the oldest instruction
// in its Instruction buffer
if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
*glbMemInstAvail <= 1) {
(*glbMemInstAvail)++;
lastGlbMemSimd = unitId;
// waitCnt instruction has been dispatched or executed: next
// instruction should be blocked until waitCnts are satisfied.
if (w->getStatus() == Wavefront::S_WAITCNT) {
if (!w->waitCntsSatisfied()) {
*rdyStatus = NRDY_WAIT_CNT;
return false;
}
}
// track how many vector SIMD units have at least one WV with a
// vector shared memory (LDS) instruction as the oldest instruction
// in its Instruction buffer
// TODO: parametrize the limit of the LDS units
if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
lastShrMemSimd != unitId) {
(*shrMemInstAvail)++;
lastShrMemSimd = unitId;
// Is the wave waiting at a barrier. Check this condition BEFORE checking
// for instruction buffer occupancy to avoid a deadlock when the barrier is
// the last instruction in the instruction buffer.
if (w->stalledAtBarrier) {
if (!computeUnit->AllAtBarrier(w->barrierId,w->barrierCnt,
computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
// Are all threads at barrier?
*rdyStatus = NRDY_BARRIER_WAIT;
return false;
}
w->oldBarrierCnt = w->barrierCnt;
w->stalledAtBarrier = false;
}
// Check WF status: it has to be running
if (w->getStatus() == Wavefront::S_STOPPED ||
w->getStatus() == Wavefront::S_RETURNING ||
w->getStatus() == Wavefront::S_STALLED) {
*rdyStatus = NRDY_WF_STOP;
return false;
}
// is the Instruction buffer empty
if ( w->instructionBuffer.empty()) {
*rdyStatus = NRDY_IB_EMPTY;
return false;
}
// Check next instruction from instruction buffer
GPUDynInstPtr ii = w->nextInstr();
// Only instruction in the instruction buffer has been dispatched.
// No need to check it again for readiness
if (!ii) {
*rdyStatus = NRDY_IB_EMPTY;
return false;
}
// The following code is very error prone and the entire process for
// checking readiness will be fixed eventually. In the meantime, let's
// make sure that we do not silently let an instruction type slip
// through this logic and always return not ready.
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());
// Non-scalar (i.e., vector) instructions may use VGPRs
if (!ii->isScalar()) {
if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
*rdyStatus = NRDY_VGPR_NRDY;
return false;
}
}
// Scalar and non-scalar instructions may use SGPR
if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
*rdyStatus = NRDY_SGPR_NRDY;
return false;
}
// The hardware implicitly executes S_WAITCNT 0 before executing
// the S_ENDPGM instruction. Implementing this implicit S_WAITCNT.
// isEndOfKernel() is used to identify the S_ENDPGM instruction
// On identifying it, we do the following:
// 1. Wait for all older instruction to execute
// 2. Once all the older instruction are executed, we add a wait
// count for the executed instruction(s) to complete.
if (ii->isEndOfKernel()) {
// Waiting for older instruction to execute
if (w->instructionBuffer.front()->seqNum() != ii->seqNum()) {
*rdyStatus = NRDY_WAIT_CNT;
return false;
}
// Older instructions have executed, adding implicit wait count
w->setStatus(Wavefront::S_WAITCNT);
w->setWaitCnts(0, 0, 0);
if (!w->waitCntsSatisfied()) {
*rdyStatus = NRDY_WAIT_CNT;
return false;
}
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
w->simdId, w->wfSlotId, ii->disassemble());
*exeResType = mapWaveToExeUnit(w);
*rdyStatus = INST_RDY;
return true;
}
int
ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
{
GPUDynInstPtr ii = w->nextInstr();
assert(ii);
if (ii->isFlat()) {
/**
* NOTE: Flat memory ops requires both GM and LM resources.
* The simulator models consumption of both GM and LM
* resources in the schedule stage. At instruction execution time,
* after the aperture check is performed, only the GM or LM pipe
* is actually reserved by the timing model. The GM unit is returned
* here since Flat ops occupy the GM slot in the ready and dispatch
* lists. They also consume the LM slot in the dispatch list.
*/
return w->globalMem;
} else if (ii->isLocalMem()) {
return w->localMem;
} else if (ii->isGlobalMem()) {
if (!ii->isScalar()) {
return w->globalMem;
} else {
return w->scalarMem;
}
} else if (ii->isBranch() ||
ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() ||
ii->isReturn() ||
ii->isEndOfKernel() ||
ii->isNop() ||
ii->isBarrier()) {
if (!ii->isScalar()) {
return w->simdId;
} else {
return w->scalarAluGlobalIdx;
}
}
panic("%s: unmapped to an execution resource", ii->disassemble());
return computeUnit->numExeUnits();
}
void
ScoreboardCheckStage::exec()
{
initStatistics();
// reset the ready list for all execution units; it will be
// constructed every cycle since resource availability may change
for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
// Reset wavefront pointers to nullptr so clear() on the vector
// does not accidentally destruct the wavefront object
for (int i = 0; i < readyList[unitId]->size(); i++) {
readyList[unitId]->at(i) = nullptr;
}
readyList[unitId]->clear();
}
// iterate over the Wavefronts of all SIMD units
for (int unitId = 0; unitId < numSIMDs; ++unitId) {
for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
// iterate over all WF slots across all vector ALUs
for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
// reset the ready status of each wavefront
waveStatusList[unitId]->at(wvId).second = BLOCKED;
Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
collectStatistics(curWave, unitId);
if (curWave->ready(Wavefront::I_ALU)) {
readyList[unitId]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_GLOBAL)) {
if (computeUnit->cedeSIMD(unitId, wvId)) {
continue;
}
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_SHARED)) {
readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_FLAT)) {
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
} else if (curWave->ready(Wavefront::I_PRIVATE)) {
readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
waveStatusList[unitId]->at(wvId).second = READY;
Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
nonrdytype_e rdyStatus = NRDY_ILLEGAL;
int exeResType = -1;
// check WF readiness: If the WF's oldest
// instruction is ready to issue then add the WF to the ready list
if (ready(curWave, &rdyStatus, &exeResType, wfSlot)) {
assert(curWave->simdId == simdId);
DPRINTF(GPUSched,
"Adding to readyList[%d]: SIMD[%d] WV[%d]: %d: %s\n",
exeResType,
curWave->simdId, curWave->wfDynId,
curWave->nextInstr()->seqNum(),
curWave->nextInstr()->disassemble());
readyList.at(exeResType)->push_back(curWave);
}
collectStatistics(rdyStatus);
}
}
}
@@ -167,4 +271,16 @@ ScoreboardCheckStage::exec()
void
ScoreboardCheckStage::regStats()
{
stallCycles
.init(NRDY_CONDITIONS)
.name(name() + ".stall_cycles")
.desc("number of cycles wave stalled in SCB")
;
stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
stallCycles.subname(NRDY_BARRIER_WAIT, csprintf("BarrierWait"));
stallCycles.subname(NRDY_VGPR_NRDY, csprintf("VgprBusy"));
stallCycles.subname(NRDY_SGPR_NRDY, csprintf("SgprBusy"));
stallCycles.subname(INST_RDY, csprintf("InstrReady"));
}

View File

@@ -36,20 +36,17 @@
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "sim/stats.hh"
class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;
enum WAVE_STATUS
{
BLOCKED = 0,
READY
};
/*
* Scoreboard check stage.
* All wavefronts are analyzed to see if they are ready
@@ -61,6 +58,18 @@ enum WAVE_STATUS
class ScoreboardCheckStage
{
public:
enum nonrdytype_e {
NRDY_ILLEGAL,
NRDY_WF_STOP,
NRDY_IB_EMPTY,
NRDY_WAIT_CNT,
NRDY_BARRIER_WAIT,
NRDY_VGPR_NRDY,
NRDY_SGPR_NRDY,
INST_RDY,
NRDY_CONDITIONS
};
ScoreboardCheckStage(const ComputeUnitParams* params);
~ScoreboardCheckStage();
void init(ComputeUnit *cu);
@@ -71,31 +80,18 @@ class ScoreboardCheckStage
void regStats();
private:
void collectStatistics(Wavefront *curWave, int unitId);
void initStatistics();
void collectStatistics(nonrdytype_e rdyStatus);
int mapWaveToExeUnit(Wavefront *w);
bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
int *exeResType, int wfSlot);
ComputeUnit *computeUnit;
uint32_t numSIMDs;
uint32_t numMemUnits;
uint32_t numShrMemPipes;
// flag per vector SIMD unit that is set when there is at least one
// WF that has a vector ALU instruction as the oldest in its
// Instruction Buffer
std::vector<bool> *vectorAluInstAvail;
int lastGlbMemSimd;
int lastShrMemSimd;
int *glbMemInstAvail;
int *shrMemInstAvail;
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list
std::vector<std::vector<Wavefront*>*> readyList;
// Stores the status of waves. A READY implies the
// wave is ready to be scheduled this cycle and
// is already present in the readyList
std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
waveStatusList;
// Stats
Stats::Vector stallCycles;
std::string _name;
};

View File

@@ -39,37 +39,63 @@
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"
Shader::Shader(const Params *p)
: ClockedObject(p), clock(p->clk_domain->clockPeriod()),
cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
tickEvent([this]{ processTick(); }, "Shader tick",
false, Event::CPU_Tick_Pri),
timingSim(p->timing), hsail_mode(SIMT),
impl_kern_boundary_sync(p->impl_kern_boundary_sync),
separate_acquire_release(p->separate_acquire_release), coissue_return(1),
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
box_tick_cnt(0), start_tick_cnt(0)
Shader::Shader(const Params *p) : ClockedObject(p),
_activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
gpuTc(nullptr), cpuPointer(p->cpu_pointer),
tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
false, Event::CPU_Tick_Pri),
timingSim(p->timing), hsail_mode(SIMT),
impl_kern_boundary_sync(p->impl_kern_boundary_sync),
coissue_return(1),
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
globalMemSize(p->globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc),
_dispatcher(*p->dispatcher),
max_valu_insts(p->max_valu_insts), total_valu_insts(0)
{
gpuCmdProc.setShader(this);
_dispatcher.setShader(this);
_gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
_gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
_ldsApe.base = ((Addr)1 << 61) + 0x0;
_ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
_scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
_scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
shHiddenPrivateBaseVmid = 0;
cuList.resize(n_cu);
panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
for (int i = 0; i < n_cu; ++i) {
cuList[i] = p->CUs[i];
assert(i == cuList[i]->cu_id);
cuList[i]->shader = this;
cuList[i]->idleCUTimeout = p->idlecu_timeout;
}
}
GPUDispatcher&
Shader::dispatcher()
{
return _dispatcher;
}
Addr
Shader::mmap(int length)
{
@@ -83,11 +109,11 @@ Shader::mmap(int length)
auto mem_state = proc->memState;
if (proc->mmapGrowsDown()) {
DPRINTF(HSAIL, "GROWS DOWN");
DPRINTF(GPUShader, "GROWS DOWN");
start = mem_state->getMmapEnd() - length;
mem_state->setMmapEnd(start);
} else {
DPRINTF(HSAIL, "GROWS UP");
DPRINTF(GPUShader, "GROWS UP");
start = mem_state->getMmapEnd();
mem_state->setMmapEnd(start + length);
@@ -96,7 +122,7 @@ Shader::mmap(int length)
mem_state->getMmapEnd());
}
DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
proc->allocateMem(start, length);
@@ -146,15 +172,15 @@ ShaderParams::create()
}
void
Shader::exec()
Shader::execScheduledAdds()
{
tick_cnt = curTick();
box_tick_cnt = curTick() - start_tick_cnt;
assert(!sa_when.empty());
// apply any scheduled adds
for (int i = 0; i < sa_n; ++i) {
if (sa_when[i] <= tick_cnt) {
if (sa_when[i] <= curTick()) {
*sa_val[i] += sa_x[i];
panic_if(*sa_val[i] < 0, "Negative counter value\n");
sa_val.erase(sa_val.begin() + i);
sa_x.erase(sa_x.begin() + i);
sa_when.erase(sa_when.begin() + i);
@@ -162,14 +188,62 @@ Shader::exec()
--i;
}
}
if (!sa_when.empty()) {
Tick shader_wakeup = *std::max_element(sa_when.begin(),
sa_when.end());
DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
schedule(tickEvent, shader_wakeup);
} else {
DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
}
}
// clock all of the cu's
for (int i = 0; i < n_cu; ++i)
cuList[i]->exec();
/*
* dispatcher/shader arranges invalidate requests to the CUs
*/
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
// if invalidate has already started/finished, then do nothing
if (task->isInvStarted()) return;
// invalidate has never started; it can only perform once at kernel launch
assert(task->outstandingInvs() == -1);
int kernId = task->dispatchId();
// counter value is 0 now, indicating the inv is about to start
_dispatcher.updateInvCounter(kernId, +1);
// iterate all cus managed by the shader, to perform invalidate.
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
// create a request to hold INV info; the request's fields will
// be updated in cu before use
auto req = std::make_shared<Request>(0, 0, 0,
cuList[i_cu]->masterId(),
0, -1);
_dispatcher.updateInvCounter(kernId, +1);
// all necessary INV flags are all set now, call cu to execute
cuList[i_cu]->doInvalidate(req, task->dispatchId());
}
}
/**
* dispatcher/shader arranges flush requests to the CUs
*/
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
int kernId = gpuDynInst->kern_id;
// flush has never been started, performed only once at kernel end
assert(_dispatcher.getOutstandingWbs(kernId) == 0);
// iterate all cus, managed by the shader, to perform flush.
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
_dispatcher.updateWbCounter(kernId, +1);
cuList[i_cu]->doFlush(gpuDynInst);
}
}
bool
Shader::dispatch_workgroups(NDRange *ndr)
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
bool scheduledSomething = false;
int cuCount = 0;
@@ -182,32 +256,24 @@ Shader::dispatch_workgroups(NDRange *ndr)
// dispatch workgroup iff the following two conditions are met:
// (a) wg_rem is true - there are unassigned workgroups in the grid
// (b) there are enough free slots in cu cuList[i] for this wg
if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) {
scheduledSomething = true;
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
curCu, task->globalWgId());
DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
curTick(), task->globalWgId(), curCu);
// ticks() member function translates cycles to simulation ticks.
if (!tickEvent.scheduled()) {
schedule(tickEvent, curTick() + this->ticks(1));
if (!cuList[curCu]->tickEvent.scheduled()) {
if (!_activeCus)
_lastInactiveTick = curTick();
_activeCus++;
}
cuList[curCu]->StartWorkgroup(ndr);
ndr->wgId[0]++;
ndr->globalWgId++;
if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
ndr->wgId[0] = 0;
ndr->wgId[1]++;
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
"Invalid activeCu size\n");
cuList[curCu]->dispWorkgroup(task);
if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
ndr->wgId[1] = 0;
ndr->wgId[2]++;
if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
ndr->wg_disp_rem = false;
break;
}
}
}
task->markWgDispatch();
}
++cuCount;
@@ -218,9 +284,83 @@ Shader::dispatch_workgroups(NDRange *ndr)
}
void
Shader::handshake(GpuDispatcher *_dispatcher)
Shader::regStats()
{
dispatcher = _dispatcher;
ClockedObject::regStats();
shaderActiveTicks
.name(name() + ".shader_active_ticks")
.desc("Total ticks that any CU attached to this shader is active")
;
allLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".allLatencyDist")
.desc("delay distribution for all")
.flags(Stats::pdf | Stats::oneline);
loadLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".loadLatencyDist")
.desc("delay distribution for loads")
.flags(Stats::pdf | Stats::oneline);
storeLatencyDist
.init(0, 1600000, 10000)
.name(name() + ".storeLatencyDist")
.desc("delay distribution for stores")
.flags(Stats::pdf | Stats::oneline);
vectorInstSrcOperand
.init(4)
.name(name() + ".vec_inst_src_operand")
.desc("vector instruction source operand distribution");
vectorInstDstOperand
.init(4)
.name(name() + ".vec_inst_dst_operand")
.desc("vector instruction destination operand distribution");
initToCoalesceLatency
.init(0, 1600000, 10000)
.name(name() + ".initToCoalesceLatency")
.desc("Ticks from vmem inst initiateAcc to coalescer issue")
.flags(Stats::pdf | Stats::oneline);
rubyNetworkLatency
.init(0, 1600000, 10000)
.name(name() + ".rubyNetworkLatency")
.desc("Ticks from coalescer issue to coalescer hit callback")
.flags(Stats::pdf | Stats::oneline);
gmEnqueueLatency
.init(0, 1600000, 10000)
.name(name() + ".gmEnqueueLatency")
.desc("Ticks from coalescer hit callback to GM pipe enqueue")
.flags(Stats::pdf | Stats::oneline);
gmToCompleteLatency
.init(0, 1600000, 10000)
.name(name() + ".gmToCompleteLatency")
.desc("Ticks queued in GM pipes ordered response buffer")
.flags(Stats::pdf | Stats::oneline);
coalsrLineAddresses
.init(0, 20, 1)
.name(name() + ".coalsrLineAddresses")
.desc("Number of cache lines for coalesced request")
.flags(Stats::pdf | Stats::oneline);
int wfSize = cuList[0]->wfSize();
cacheBlockRoundTrip = new Stats::Distribution[wfSize];
for (int idx = 0; idx < wfSize; ++idx) {
std::stringstream namestr;
ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
cacheBlockRoundTrip[idx]
.init(0, 1600000, 10000)
.name(namestr.str())
.desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
.flags(Stats::pdf | Stats::oneline);
}
}
void
@@ -251,7 +391,6 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
RequestPtr req1, req2;
req->splitOnVaddr(split_addr, req1, req2);
PacketPtr pkt1 = new Packet(req2, cmd);
PacketPtr pkt2 = new Packet(req1, cmd);
@@ -297,34 +436,22 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
}
}
bool
Shader::busy()
{
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
if (!cuList[i_cu]->isDone()) {
return true;
}
}
return false;
}
void
Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
Shader::ScheduleAdd(int *val,Tick when,int x)
{
sa_val.push_back(val);
sa_when.push_back(tick_cnt + when);
when += curTick();
sa_when.push_back(when);
sa_x.push_back(x);
++sa_n;
}
void
Shader::processTick()
{
if (busy()) {
exec();
schedule(tickEvent, curTick() + ticks(1));
if (!tickEvent.scheduled() || (when < tickEvent.when())) {
DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
"%lu\n", when);
reschedule(tickEvent, when, true);
} else {
assert(tickEvent.scheduled());
DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
"%lu\n", when);
}
}
@@ -356,7 +483,8 @@ void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
bool suppress_func_errors)
{
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
suppress_func_errors);
}
void
@@ -385,15 +513,11 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
pkt->senderState =
new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
if (cu_id == n_cu) {
dispatcher->tlbPort->sendFunctional(pkt);
} else {
// even when the perLaneTLB flag is turned on
// it's ok tp send all accesses through lane 0
// since the lane # is not known here,
// This isn't important since these are functional accesses.
cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
}
// even when the perLaneTLB flag is turned on
// it's ok tp send all accesses through lane 0
// since the lane # is not known here,
// This isn't important since these are functional accesses.
cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
/* safe_cast the senderState */
TheISA::GpuTLB::TranslationState *sender_state =
@@ -402,3 +526,82 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
delete sender_state->tlbEntry;
delete pkt->senderState;
}
/*
* allow the shader to sample stats from constituent devices
*/
void
Shader::sampleStore(const Tick accessTime)
{
storeLatencyDist.sample(accessTime);
allLatencyDist.sample(accessTime);
}
/*
* allow the shader to sample stats from constituent devices
*/
void
Shader::sampleLoad(const Tick accessTime)
{
loadLatencyDist.sample(accessTime);
allLatencyDist.sample(accessTime);
}
void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
// Only sample instructions that go all the way to main memory
if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
return;
}
Tick t1 = roundTripTime[0];
Tick t2 = roundTripTime[1];
Tick t3 = roundTripTime[2];
Tick t4 = roundTripTime[3];
Tick t5 = roundTripTime[4];
initToCoalesceLatency.sample(t2-t1);
rubyNetworkLatency.sample(t3-t2);
gmEnqueueLatency.sample(t4-t3);
gmToCompleteLatency.sample(t5-t4);
}
void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
{
coalsrLineAddresses.sample(lineMap.size());
std::vector<Tick> netTimes;
// For each cache block address generated by a vmem inst, calculate
// the round-trip time for that cache block.
for (auto& it : lineMap) {
const std::vector<Tick>& timeVec = it.second;
if (timeVec.size() == 2) {
netTimes.push_back(timeVec[1] - timeVec[0]);
}
}
// Sort the cache block round trip times so that the first
// distrubtion is always measuring the fastests and the last
// distrubtion is always measuring the slowest cache block.
std::sort(netTimes.begin(), netTimes.end());
// Sample the round trip time for each N cache blocks into the
// Nth distribution.
int idx = 0;
for (auto& time : netTimes) {
cacheBlockRoundTrip[idx].sample(time);
++idx;
}
}
void
Shader::notifyCuSleep() {
// If all CUs attached to his shader are asleep, update shaderActiveTicks
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
"Invalid activeCu size\n");
_activeCus--;
if (!_activeCus)
shaderActiveTicks += curTick() - _lastInactiveTick;
}

View File

@@ -14,9 +14,9 @@
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -30,7 +30,7 @@
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Steve Reinhardt
* Authors: Steve Reinhardt
*/
#ifndef __SHADER_HH__
@@ -47,11 +47,11 @@
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "enums/MemType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/qstruct.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
@@ -61,7 +61,8 @@
#include "sim/sim_object.hh"
class BaseTLB;
class GpuDispatcher;
class GPUCommandProcessor;
class GPUDispatcher;
namespace TheISA
{
@@ -70,36 +71,144 @@ namespace TheISA
static const int LDS_SIZE = 65536;
// aperture (APE) registers define the base/limit
// pair for the ATC mapped memory space. currently
// the only APEs we consider are for GPUVM/LDS/scratch.
// the APEs are registered with unique values based
// on a per-device basis
struct ApertureRegister
{
Addr base;
Addr limit;
};
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
protected:
// Shader's clock period in terms of number of ticks of curTime,
// aka global simulation clock
Tick clock;
private:
ApertureRegister _gpuVmApe;
ApertureRegister _ldsApe;
ApertureRegister _scratchApe;
Addr shHiddenPrivateBaseVmid;
// Number of active Cus attached to this shader
int _activeCus;
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
// some stats for measuring latency
Stats::Distribution allLatencyDist;
Stats::Distribution loadLatencyDist;
Stats::Distribution storeLatencyDist;
// average ticks from vmem inst initiateAcc to coalescer issue,
// average ticks from coalescer issue to coalescer hit callback,
// average ticks from coalescer hit callback to GM pipe enqueue,
// and average ticks spent in GM pipe's ordered resp buffer.
Stats::Distribution initToCoalesceLatency;
Stats::Distribution rubyNetworkLatency;
Stats::Distribution gmEnqueueLatency;
Stats::Distribution gmToCompleteLatency;
// average number of cache blocks requested by vmem inst, and
// average ticks for cache blocks to main memory for the Nth
// cache block generated by a vmem inst.
Stats::Distribution coalsrLineAddresses;
Stats::Distribution *cacheBlockRoundTrip;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
// clock related functions ; maps to-and-from
// Simulation ticks and shader clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
Tick getClock() const { return clock; }
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
GPUDispatcher &dispatcher();
void sampleLoad(const Tick accessTime);
void sampleStore(const Tick accessTime);
void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
void sampleLineRoundTrip(const std::map<Addr,
std::vector<Tick>> &roundTripTime);
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
void processTick();
const ApertureRegister&
gpuVmApe() const
{
return _gpuVmApe;
}
const ApertureRegister&
ldsApe() const
{
return _ldsApe;
}
const ApertureRegister&
scratchApe() const
{
return _scratchApe;
}
bool
isGpuVmApe(Addr addr) const
{
bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
return is_gpu_vm;
}
bool
isLdsApe(Addr addr) const
{
bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
return is_lds;
}
bool
isScratchApe(Addr addr) const
{
bool is_scratch
= addr >= _scratchApe.base && addr <= _scratchApe.limit;
return is_scratch;
}
Addr
getScratchBase()
{
return _scratchApe.base;
}
Addr
getHiddenPrivateBase()
{
return shHiddenPrivateBaseVmid;
}
void
initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
{
Addr sh_hidden_base_new = queueBase - offset;
// We are initializing sh_hidden_private_base_vmid from the
// amd queue descriptor from the first queue.
// The sh_hidden_private_base_vmid is supposed to be same for
// all the queues from the same process
if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
// Do not panic if shHiddenPrivateBaseVmid == 0,
// that is if it is uninitialized. Panic only
// if the value is initialized and we get
// a different base later.
panic_if(shHiddenPrivateBaseVmid != 0,
"Currently we support only single process\n");
}
shHiddenPrivateBaseVmid = sh_hidden_base_new;
}
EventFunctionWrapper tickEvent;
// is this simulation going to be timing mode in the memory?
@@ -108,30 +217,18 @@ class Shader : public ClockedObject
// If set, issue acq packet @ kernel launch
int impl_kern_boundary_sync;
// If set, generate a separate packet for acquire/release on
// ld_acquire/st_release/atomic operations
int separate_acquire_release;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace
int trace_vgpr_all;
// Number of cu units in the shader
int n_cu;
// Number of wavefront slots per cu
// Number of wavefront slots per SIMD per CU
int n_wf;
// The size of global memory
int globalMemSize;
/*
* Bytes/work-item for call instruction
* The number of arguments for an hsail function will
* vary. We simply determine the maximum # of arguments
* required by any hsail function up front before the
* simulation (during parsing of the Brig) and record
* that number here.
*/
int funcargs_size;
// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;
@@ -139,7 +236,7 @@ class Shader : public ClockedObject
uint32_t sa_n;
// Pointer to value to be incremented
std::vector<uint32_t*> sa_val;
std::vector<int*> sa_val;
// When to do the increment
std::vector<uint64_t> sa_when;
// Amount to increment by
@@ -148,24 +245,29 @@ class Shader : public ClockedObject
// List of Compute Units (CU's)
std::vector<ComputeUnit*> cuList;
uint64_t tick_cnt;
uint64_t box_tick_cnt;
uint64_t start_tick_cnt;
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
GpuDispatcher *dispatcher;
/**
* Statistics
*/
Stats::Scalar shaderActiveTicks;
Stats::Vector vectorInstSrcOperand;
Stats::Vector vectorInstDstOperand;
void regStats();
int max_valu_insts;
int total_valu_insts;
Shader(const Params *p);
~Shader();
virtual void init();
// Run shader
void exec();
// Check to see if shader is busy
bool busy();
// Run shader scheduled adds
void execScheduledAdds();
// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(uint32_t *val, Tick when, int x);
void ScheduleAdd(int *val, Tick when, int x);
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
@@ -190,12 +292,15 @@ class Shader : public ClockedObject
cuList[cu_id] = compute_unit;
}
void handshake(GpuDispatcher *dispatcher);
bool dispatch_workgroups(NDRange *ndr);
void prepareInvalidate(HSAQueueEntry *task);
void prepareFlush(GPUDynInstPtr gpuDynInst);
bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void hostWakeUp(BaseCPU *cpu);
void notifyCuSleep();
};
#endif // __SHADER_HH__

View File

@@ -35,6 +35,12 @@
#include "base/logging.hh"
// Factory method invoked by the python configuration system to
// instantiate a SimplePoolManager from its generated Params object.
SimplePoolManager *
SimplePoolManagerParams::create()
{
    return new SimplePoolManager(this);
}
// return the min number of elements that the manager can reserve given
// a request for "size" elements
uint32_t
@@ -64,8 +70,6 @@ SimplePoolManager::printRegion()
// Return true if the pool can satisfy a request of numRegions regions of
// "size" elements each. The simple manager supports only one reservation
// at a time, so allocation is possible only when no group currently holds
// a region. The assert guards against requests that could never fit in
// the pool at all.
bool
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
{
    assert(numRegions * minAllocatedElements(size) <= poolSize());

    return _reservedGroups == 0;
}

View File

@@ -38,14 +38,15 @@
#include <cstdint>
#include "gpu-compute/pool_manager.hh"
#include "params/SimplePoolManager.hh"
// Simple Pool Manager: allows one region per pool. No region merging is
// supported.
class SimplePoolManager : public PoolManager
{
public:
SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
: PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
SimplePoolManager(const PoolManagerParams *p)
: PoolManager(p), _regionSize(0), _nxtFreeIdx(0),
_reservedGroups(0)
{
}
@@ -62,7 +63,7 @@ class SimplePoolManager : public PoolManager
// be reserved)
uint32_t _regionSize;
// next index to allocate a region
uint8_t _nxtFreeIdx;
int _nxtFreeIdx;
// number of groups that reserve a region
uint32_t _reservedGroups;
};

View File

@@ -0,0 +1,188 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#include "gpu-compute/static_register_manager_policy.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPURename.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/pool_manager.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
// The static policy keeps no state of its own: all bookkeeping lives in
// the wavefront and the per-SIMD pool managers, so the ctor is empty.
StaticRegisterManagerPolicy::StaticRegisterManagerPolicy()
{
}
// Per-cycle hook from the register manager. The static policy performs
// all of its work at allocate/free time, so there is nothing to do here.
void
StaticRegisterManagerPolicy::exec()
{
}
/**
 * Map an architected VGPR index of wavefront w to a physical VGPR index.
 *
 * The wavefront's VGPRs occupy a contiguous region of the physical VRF
 * starting at w->startVgprIndex; the region may wrap around the end of
 * the register file, hence the final modulo.
 *
 * @param w          wavefront owning the register
 * @param vgprIndex  architected VGPR index, must be in
 *                   [0, w->reservedVectorRegs)
 * @return physical VGPR index into the SIMD unit's VRF
 */
int
StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex)
{
    // fix: terminate the message with '\n' like every other panic/fatal
    // message in this file (mapSgpr, allocateRegisters, freeRegisters)
    panic_if((vgprIndex >= w->reservedVectorRegs)
        || (w->reservedVectorRegs < 0),
        "VGPR index %d is out of range: VGPR range=[0,%d]\n",
        vgprIndex, w->reservedVectorRegs);

    // add the offset from where the VGPRs of the wavefront have been assigned
    int physicalVgprIndex = w->startVgprIndex + vgprIndex;

    // the pre-wrap physical index must fall inside the wavefront's region
    panic_if(!((w->startVgprIndex <= physicalVgprIndex) &&
        (w->startVgprIndex + w->reservedVectorRegs - 1)
        >= physicalVgprIndex),
        "Invalid VGPR index %d\n", physicalVgprIndex);

    // calculate physical VGPR index, wrapping around the end of the VRF
    return physicalVgprIndex % w->computeUnit->vrf[w->simdId]->numRegs();
}
/**
 * Map an architected SGPR index of wavefront w to a physical SGPR index.
 *
 * The wavefront's SGPRs occupy a contiguous region of the physical SRF
 * starting at w->startSgprIndex; the region may wrap around the end of
 * the register file, hence the final modulo.
 *
 * @param w          wavefront owning the register
 * @param sgprIndex  architected SGPR index, must be in
 *                   [0, w->reservedScalarRegs)
 * @return physical SGPR index into the SIMD unit's SRF
 */
int
StaticRegisterManagerPolicy::mapSgpr(Wavefront* w, int sgprIndex)
{
    // reject indices outside the reserved region (direct form of the
    // negated range check)
    panic_if(sgprIndex >= w->reservedScalarRegs
        || w->reservedScalarRegs <= 0,
        "SGPR index %d is out of range: SGPR range=[0,%d]\n",
        sgprIndex, w->reservedScalarRegs);

    // offset into the physical SRF where this wavefront's SGPRs start
    const int physSgprIdx = w->startSgprIndex + sgprIndex;

    // the pre-wrap physical index must fall inside the wavefront's region
    panic_if(physSgprIdx < w->startSgprIndex
        || physSgprIdx > w->startSgprIndex + w->reservedScalarRegs - 1,
        "Invalid SGPR index %d\n", physSgprIdx);

    // wrap around the end of the SRF if the region does
    return physSgprIdx % w->computeUnit->srf[w->simdId]->numRegs();
}
// Return true if the VRF pool manager of the given SIMD unit can reserve
// a region for nWfs wavefronts, each demanding demandPerWf VGPRs.
// Delegates entirely to the per-SIMD pool manager.
bool
StaticRegisterManagerPolicy::canAllocateVgprs(int simdId, int nWfs,
                                              int demandPerWf)
{
    return cu->registerManager->vrfPoolMgrs[simdId]->
        canAllocate(nWfs, demandPerWf);
}
// Return true if the SRF pool manager of the given SIMD unit can reserve
// a region for nWfs wavefronts, each demanding demandPerWf SGPRs.
// Delegates entirely to the per-SIMD pool manager.
bool
StaticRegisterManagerPolicy::canAllocateSgprs(int simdId, int nWfs,
                                              int demandPerWf)
{
    return cu->registerManager->srfPoolMgrs[simdId]->
        canAllocate(nWfs, demandPerWf);
}
/**
 * Reserve vector (and, if requested, scalar) register regions for
 * wavefront w. The pool manager chooses the region's start index and
 * reports the size it actually reserved through allocatedSize; both are
 * recorded on the wavefront. CU-wide reservation counters are updated
 * and checked against the physical register file capacity.
 *
 * @param w             wavefront receiving the registers
 * @param vectorDemand  number of VGPRs requested
 * @param scalarDemand  number of SGPRs requested (0 means no SGPRs)
 */
void
StaticRegisterManagerPolicy::allocateRegisters(Wavefront *w, int vectorDemand,
                                               int scalarDemand)
{
    uint32_t allocatedSize = 0;
    w->startVgprIndex = cu->registerManager->vrfPoolMgrs[w->simdId]->
        allocateRegion(vectorDemand, &allocatedSize);
    // record what was actually reserved, which may exceed the demand
    // depending on the pool manager's minimum allocation granularity
    w->reservedVectorRegs = allocatedSize;
    cu->vectorRegsReserved[w->simdId] += w->reservedVectorRegs;
    panic_if(cu->vectorRegsReserved[w->simdId] > cu->numVecRegsPerSimd,
             "VRF[%d] has been overallocated %d > %d\n",
             w->simdId, cu->vectorRegsReserved[w->simdId],
             cu->numVecRegsPerSimd);

    // SGPRs are only reserved when the kernel actually uses them
    if (scalarDemand) {
        w->startSgprIndex = cu->registerManager->srfPoolMgrs[w->simdId]->
            allocateRegion(scalarDemand, &allocatedSize);
        w->reservedScalarRegs = allocatedSize;
        cu->scalarRegsReserved[w->simdId] += w->reservedScalarRegs;
        panic_if(cu->scalarRegsReserved[w->simdId] > cu->numScalarRegsPerSimd,
                 "SRF[%d] has been overallocated %d > %d\n",
                 w->simdId, cu->scalarRegsReserved[w->simdId],
                 cu->numScalarRegsPerSimd);
    }
}
/**
 * Release the vector and scalar register regions held by a completed
 * wavefront: decrement the CU-wide reservation counters, return the
 * regions to the pool managers, and mark every freed physical register
 * as not busy. Finally reset the wavefront's reservation bookkeeping.
 *
 * @param w  the completed wavefront whose registers are released
 */
void
StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
{
    // free the vector registers of the completed wavefront
    w->computeUnit->vectorRegsReserved[w->simdId] -= w->reservedVectorRegs;
    // free the scalar registers of the completed wavefront
    w->computeUnit->scalarRegsReserved[w->simdId] -= w->reservedScalarRegs;

    // sanity check: the CU-wide counters must never go negative
    panic_if(w->computeUnit->vectorRegsReserved[w->simdId] < 0,
             "Freeing VRF[%d] registers left %d registers reserved\n",
             w->simdId,
             w->computeUnit->vectorRegsReserved[w->simdId]);
    panic_if(w->computeUnit->scalarRegsReserved[w->simdId] < 0,
             "Freeing SRF[%d] registers left %d registers reserved\n",
             w->simdId,
             w->computeUnit->scalarRegsReserved[w->simdId]);

    // the region may wrap around the end of the VRF, hence the modulo
    // when computing its end index
    int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
        w->computeUnit->vrf[w->simdId]->numRegs();

    w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
        freeRegion(w->startVgprIndex, endIndex);

    // mark/pre-mark all registers as not busy
    for (int i = 0; i < w->reservedVectorRegs; i++) {
        uint32_t physVgprIdx = mapVgpr(w, i);
        w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
    }

    // reset VGPR bookkeeping only after the mapVgpr loop above, which
    // still needs reservedVectorRegs/startVgprIndex to be valid
    w->reservedVectorRegs = 0;
    w->startVgprIndex = 0;

    // repeat for the scalar register file
    endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
        w->computeUnit->srf[w->simdId]->numRegs();

    w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
        freeRegion(w->startSgprIndex, endIndex);

    // mark/pre-mark all registers as not busy
    for (int i = 0; i < w->reservedScalarRegs; i++) {
        uint32_t physSgprIdx = mapSgpr(w, i);
        w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);
    }

    w->reservedScalarRegs = 0;
    w->startSgprIndex = 0;
}
// The static policy defines no statistics of its own; register file and
// pool-manager stats are registered by their owning objects.
void
StaticRegisterManagerPolicy::regStats()
{
}

View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2016 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Mark Wyse
*/
#ifndef __STATIC_REGISTER_MANAGER_POLICY_HH__
#define __STATIC_REGISTER_MANAGER_POLICY_HH__

#include "gpu-compute/register_manager_policy.hh"

class HSAQueueEntry;

/**
 * A register manager policy that statically assigns each wavefront a
 * contiguous region of the physical vector/scalar register files for
 * the wavefront's entire lifetime. Architected register indices are
 * translated to physical ones by adding the region's start offset
 * (mapVgpr/mapSgpr); regions are reserved at dispatch
 * (allocateRegisters) and returned to the per-SIMD pool managers when
 * the wavefront completes (freeRegisters).
 */
class StaticRegisterManagerPolicy : public RegisterManagerPolicy
{
  public:
    StaticRegisterManagerPolicy();

    // per-cycle hook; unused by the static policy
    void exec() override;

    // architected-to-physical register index translation
    int mapVgpr(Wavefront* w, int vgprIndex) override;
    int mapSgpr(Wavefront* w, int sgprIndex) override;

    // capacity queries, forwarded to the per-SIMD pool managers
    bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override;
    bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override;

    // reserve/release a wavefront's register regions
    void allocateRegisters(Wavefront *w, int vectorDemand,
                           int scalarDemand) override;

    void freeRegisters(Wavefront *w) override;

    void regStats() override;
};

#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__

View File

@@ -41,7 +41,6 @@
TLBCoalescer::TLBCoalescer(const Params *p)
: ClockedObject(p),
clock(p->clk_domain->clockPeriod()),
TLBProbesPerCycle(p->probesPerCycle),
coalescingWindow(p->coalescingWindow),
disableCoalescing(p->disableCoalescing),
@@ -317,7 +316,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
//coalesced requests to the TLB
if (!coalescer->probeTLBEvent.scheduled()) {
coalescer->schedule(coalescer->probeTLBEvent,
curTick() + coalescer->ticks(1));
curTick() + coalescer->clockPeriod());
}
return true;
@@ -380,7 +379,7 @@ TLBCoalescer::MemSidePort::recvReqRetry()
// we've received a retry. Schedule a probeTLBEvent
if (!coalescer->probeTLBEvent.scheduled())
coalescer->schedule(coalescer->probeTLBEvent,
curTick() + coalescer->ticks(1));
curTick() + coalescer->clockPeriod());
}
void
@@ -448,7 +447,7 @@ TLBCoalescer::processProbeTLBEvent()
// send the coalesced request for virt_page_addr
if (!memSidePort[0]->sendTimingReq(first_packet)) {
DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
virt_page_addr);
// No need for a retries queue since we are already buffering

View File

@@ -65,13 +65,6 @@ class ThreadContext;
*/
class TLBCoalescer : public ClockedObject
{
protected:
// TLB clock: will inherit clock from shader's clock period in terms
// of nuber of ticks of curTime (aka global simulation clock)
// The assignment of TLB clock from shader clock is done in the
// python config files.
int clock;
public:
typedef TLBCoalescerParams Params;
TLBCoalescer(const Params *p);
@@ -105,7 +98,8 @@ class TLBCoalescer : public ClockedObject
* option is to change it to curTick(), so we coalesce based
* on the receive time.
*/
typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
CoalescingFIFO;
CoalescingFIFO coalescerFIFO;
@@ -143,13 +137,6 @@ class TLBCoalescer : public ClockedObject
void updatePhysAddresses(PacketPtr pkt);
void regStats() override;
// Clock related functions. Maps to-and-from
// Simulation ticks and object clocks.
Tick frequency() const { return SimClock::Frequency / clock; }
Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
Tick curCycle() const { return curTick() / clock; }
Tick tickToCycles(Tick val) const { return val / clock;}
class CpuSidePort : public SlavePort
{
public:
@@ -171,7 +158,8 @@ class TLBCoalescer : public ClockedObject
virtual void
recvRespRetry()
{
fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
fatal("recvRespRetry() is not implemented in the TLB "
"coalescer.\n");
}
virtual AddrRangeList getAddrRanges() const;

View File

@@ -36,81 +36,21 @@
#include <string>
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/wavefront.hh"
#include "params/VectorRegisterFile.hh"
VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
: SimObject(p),
manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
vgprState(new VecRegisterState())
: RegisterFile(p)
{
fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
fatal_if(simdId < 0, "Illegal SIMD id for VRF");
regFile.resize(numRegs(), VecRegContainer());
fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
"multiple of VRF size\n");
busy.clear();
busy.resize(numRegsPerSimd, 0);
nxtBusy.clear();
nxtBusy.resize(numRegsPerSimd, 0);
vgprState->init(numRegsPerSimd, p->wfSize);
}
void
VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
{
computeUnit = _computeUnit;
vgprState->setParent(computeUnit);
}
uint8_t
VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
{
uint8_t status = nxtBusy.at(idx);
if (operandSize > 4) {
status = status | (nxtBusy.at((idx + 1) % numRegs()));
}
return status;
}
uint8_t
VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
{
uint8_t status = busy.at(idx);
if (operandSize > 4) {
status = status | (busy.at((idx + 1) % numRegs()));
}
return status;
}
void
VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
{
nxtBusy.at(regIdx) = value;
if (operandSize > 4) {
nxtBusy.at((regIdx + 1) % numRegs()) = value;
}
}
void
VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
{
busy.at(regIdx) = value;
if (operandSize > 4) {
busy.at((regIdx + 1) % numRegs()) = value;
for (auto &reg : regFile) {
reg.zero();
}
}
@@ -118,127 +58,154 @@ bool
VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i)) {
uint32_t vgprIdx = ii->getRegisterIndex(i, ii);
uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
if (ii->isVectorRegister(i) && ii->isSrcOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
w->numTimesBlockedDueRAWDependencies++;
// determine number of registers
int nRegs =
ii->getOperandSize(i) <= 4 ? 1 : ii->getOperandSize(i) / 4;
for (int j = 0; j < nRegs; j++) {
int pVgpr = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
if (regBusy(pVgpr)) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), pVgpr);
w->numTimesBlockedDueRAWDependencies++;
}
return false;
}
return false;
}
if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
if (ii->isDstOperand(i)) {
w->numTimesBlockedDueWAXDependencies++;
} else if (ii->isSrcOperand(i)) {
w->numTimesBlockedDueRAWDependencies++;
}
return false;
}
}
}
return true;
}
void
VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
VectorRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
bool loadInstr = ii->isLoad();
bool atomicInstr = ii->isAtomic() || ii->isMemFence();
bool loadNoArgInstr = loadInstr && !ii->isArgLoad();
// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
ii->getOperandSize(i), 1);
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
ii->getOperandSize(i) / 4;
// mark the destination vector register as busy
markReg(physReg, ii->getOperandSize(i), 1);
// clear the in-flight status of the destination vector register
preMarkReg(physReg, ii->getOperandSize(i), 0);
for (int j = 0; j < nRegs; ++j) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
// FIXME: if we ever model correct timing behavior
// for load argument instructions then we should not
// set the destination register as busy now but when
// the data returns. Loads and Atomics should free
// their destination registers when the data returns,
// not now
if (!atomicInstr && !loadNoArgInstr) {
uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
computeUnit->spBypassLength() :
computeUnit->dpBypassLength();
// schedule an event for marking the register as ready
computeUnit->registerEvent(w->simdId, physReg,
ii->getOperandSize(i),
computeUnit->shader->tick_cnt +
computeUnit->shader->ticks(pipeLen),
0);
// If instruction is atomic instruction and
// the atomics do not return value, then
// do not mark this reg as busy.
if (!(ii->isAtomic() && !ii->isAtomicRet())) {
/**
* if the instruction is a load with EXEC = 0, then
* we do not mark the reg. we do this to avoid a
* deadlock that can occur because a load reserves
* its destination regs before checking its exec mask,
* and in the case it is 0, it will not send/recv any
* packets, and therefore it will never free its dest
* reg(s).
*/
if (!ii->isLoad() || (ii->isLoad()
&& ii->exec_mask.any())) {
markReg(physReg, true);
}
}
}
}
}
}
int
VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
std::vector<uint32_t> &regVec, uint32_t operandSize,
uint64_t timestamp)
void
VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
int delay = 0;
// increment count of number of DWORDs read from VRF
int DWORDs = ii->numSrcVecDWORDs();
registerReads += (DWORDs * w->execMask().count());
panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
regVec.size());
for (int i = 0; i < regVec.size(); ++i) {
// mark the destination VGPR as free when the timestamp expires
computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
computeUnit->shader->tick_cnt + timestamp +
computeUnit->shader->ticks(delay), 0);
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramReads += DWORDs;
}
mask = mask >> 4;
}
return delay;
}
if (!ii->isLoad()
&& !(ii->isAtomic() || ii->isMemSync())) {
int opSize = 4;
for (int i = 0; i < ii->getNumOperands(); i++) {
if (ii->getOperandSize(i) > opSize) {
opSize = ii->getOperandSize(i);
}
}
Cycles delay(opSize <= 4 ? computeUnit->spBypassLength()
: computeUnit->dpBypassLength());
Tick tickDelay = computeUnit->cyclesToTicks(delay);
void
VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
{
// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
ii->getOperandSize(i), 1);
// set the in-flight status of the destination vector register
preMarkReg(physReg, ii->getOperandSize(i), 1);
for (int i = 0; i < ii->getNumOperands(); i++) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1
: ii->getOperandSize(i) / 4;
for (int j = 0; j < nRegs; j++) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
enqRegFreeEvent(physReg, tickDelay);
}
}
}
// increment count of number of DWORDs written to VRF
DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * w->execMask().count());
mask = w->execMask().to_ullong();
srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
}
bool
VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
GPUDynInstPtr ii,
VrfAccessType accessType)
void
VectorRegisterFile::scheduleWriteOperandsFromLoad(
Wavefront *w, GPUDynInstPtr ii)
{
bool ready = true;
assert(ii->isLoad() || ii->isAtomicRet());
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
int vgprIdx = ii->getRegisterIndex(i, ii);
int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
ii->getOperandSize(i) / 4;
return ready;
}
for (int j = 0; j < nRegs; ++j) {
int physReg = computeUnit->registerManager
->mapVgpr(w, vgprIdx + j);
enqRegFreeEvent(physReg, computeUnit->clockPeriod());
}
}
}
// increment count of number of DWORDs written to VRF
int DWORDs = ii->numDstVecDWORDs();
registerWrites += (DWORDs * ii->exec_mask.count());
bool
VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
VrfAccessType accessType)
{
bool ready = true;
return ready;
uint64_t mask = ii->exec_mask.to_ullong();
int srams = ii->exec_mask.size() / 4;
for (int i = 0; i < srams; i++) {
if (mask & 0xF) {
sramWrites += DWORDs;
}
mask = mask >> 4;
}
}
VectorRegisterFile*

View File

@@ -34,111 +34,76 @@
#ifndef __VECTOR_REGISTER_FILE_HH__
#define __VECTOR_REGISTER_FILE_HH__
#include <list>
#include "base/statistics.hh"
#include "base/trace.hh"
#include "base/types.hh"
#include "arch/gpu_isa.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/vector_register_state.hh"
#include "sim/sim_object.hh"
class ComputeUnit;
class Shader;
class SimplePoolManager;
class Wavefront;
#include "gpu-compute/register_file.hh"
#include "gpu-compute/wavefront.hh"
struct VectorRegisterFileParams;
enum class VrfAccessType : uint8_t
{
READ = 0x01,
WRITE = 0x02,
RD_WR = READ | WRITE
};
// Vector Register File
class VectorRegisterFile : public SimObject
class VectorRegisterFile : public RegisterFile
{
public:
using VecRegContainer = TheGpuISA::VecRegContainerU32;
VectorRegisterFile(const VectorRegisterFileParams *p);
~VectorRegisterFile() { }
void setParent(ComputeUnit *_computeUnit);
virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
virtual void scheduleWriteOperands(Wavefront *w,
GPUDynInstPtr ii) override;
virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
GPUDynInstPtr ii) override;
virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;
// Read a register
template<typename T>
T
read(int regIdx, int threadId=0)
void
setParent(ComputeUnit *_computeUnit) override
{
T p0 = vgprState->read<T>(regIdx, threadId);
DPRINTF(GPUVRF, "reading vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)p0);
RegisterFile::setParent(_computeUnit);
}
return p0;
// Read a register that is writeable (e.g., a DST operand)
VecRegContainer&
readWriteable(int regIdx)
{
return regFile[regIdx];
}
// Read a register that is not writeable (e.g., src operand)
const VecRegContainer&
read(int regIdx) const
{
return regFile[regIdx];
}
// Write a register
template<typename T>
void
write(int regIdx, T value, int threadId=0)
write(int regIdx, const VecRegContainer &value)
{
DPRINTF(GPUVRF, "writing vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)value);
vgprState->write<T>(regIdx, value, threadId);
regFile[regIdx] = value;
}
uint8_t regBusy(int idx, uint32_t operandSize) const;
uint8_t regNxtBusy(int idx, uint32_t operandSize) const;
int numRegs() const { return numRegsPerSimd; }
void markReg(int regIdx, uint32_t operandSize, uint8_t value);
void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);
virtual void exec(GPUDynInstPtr ii, Wavefront *w);
virtual int exec(uint64_t dynamic_id, Wavefront *w,
std::vector<uint32_t> &regVec, uint32_t operandSize,
uint64_t timestamp);
bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
virtual void updateEvents() { }
virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);
virtual bool
isReadConflict(int memWfId, int exeWfId) const
void
printReg(Wavefront *wf, int regIdx) const
{
return false;
#ifndef NDEBUG
const auto &vec_reg_cont = regFile[regIdx];
auto vgpr = vec_reg_cont.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
DPRINTF(GPUVRF, "WF[%d][%d]: WV[%d] v[%d][%d] = %#x\n",
wf->simdId, wf->wfSlotId, wf->wfDynId, regIdx, lane,
vgpr[lane]);
}
}
#endif
}
virtual bool
isWriteConflict(int memWfId, int exeWfId) const
{
return false;
}
virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
GPUDynInstPtr ii,
VrfAccessType accessType);
virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
VrfAccessType accessType);
SimplePoolManager *manager;
protected:
ComputeUnit* computeUnit;
int simdId;
// flag indicating if a register is busy
std::vector<uint8_t> busy;
// flag indicating if a register will be busy (by instructions
// in the SIMD pipeline)
std::vector<uint8_t> nxtBusy;
// numer of registers (bank size) per simd unit (bank)
int numRegsPerSimd;
// vector register state
VecRegisterState *vgprState;
private:
std::vector<VecRegContainer> regFile;
};
#endif // __VECTOR_REGISTER_FILE_HH__

File diff suppressed because it is too large Load Diff

View File

@@ -31,161 +31,116 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__
#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <stack>
#include <unordered_map>
#include <vector>
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"
static const int MAX_NUM_INSTS_PER_WF = 12;
/**
* A reconvergence stack entry conveys the necessary state to implement
* control flow divergence.
*/
struct ReconvergenceStackEntry {
/**
* PC of current instruction.
*/
uint32_t pc;
/**
* PC of the immediate post-dominator instruction, i.e., the value of
* @a pc for the first instruction that will be executed by the wavefront
* when a reconvergence point is reached.
*/
uint32_t rpc;
/**
* Execution mask.
*/
VectorMask execMask;
};
/*
* Arguments for the hsail opcode call, are user defined and variable length.
* The hardware/finalizer can support arguments in hardware or use memory to
* pass arguments. For now, let's assume that an unlimited number of arguments
* are supported in hardware (the compiler inlines functions whenver it can
* anyways, so unless someone is interested in the implications of linking/
* library functions, I think this is a reasonable assumption given the typical
* size of an OpenCL kernel).
*
* Note that call args are different than kernel arguments:
* * All work-items in a kernel refer the same set of kernel arguments
* * Each work-item has it's on set of call args. So a call argument at
* address 0x4 is different for work-item 0 and work-item 1.
*
* Ok, the table below shows an example of how we organize the call arguments in
* the CallArgMem class.
*
* int foo(int arg1, double arg2)
* ___________________________________________________
* | 0: return.0 | 4: return.1 | ... | 252: return.63 |
* |---------------------------------------------------|
* | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
* |---------------------------------------------------|
* | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
* ___________________________________________________
*/
/**
 * CallArgMem is a per-wavefront scratch buffer holding the arguments
 * (and return values) of an HSAIL function call. Storage is laid out
 * argument-major: all lanes' copies of the value at offset 0, then all
 * lanes' copies of the next value, and so on (see the layout table in
 * the comment above this class).
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments
    uint8_t *mem;
    // number of lanes (work-items) in the wavefront
    int wfSize;
    // size of function args (per work-item, in bytes)
    int funcArgsSizePerItem;

    /**
     * Byte offset, within @a mem, of @p lane's copy of the call-arg
     * value whose lane-0 offset is @p addr.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // use new[] rather than malloc() so an allocation failure
        // throws instead of silently yielding a null pointer
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // this class uniquely owns its buffer; an implicit copy would
    // lead to a double free in the destructor, so forbid copying
    CallArgMem(const CallArgMem&) = delete;
    CallArgMem& operator=(const CallArgMem&) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /** Address of @p lane's copy of the value at lane-0 offset @p addr. */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /** Store @p val as @p lane's copy of the value at lane-0 offset @p addr. */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
class Wavefront : public SimObject
{
public:
enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
// Base pointer for array of instruction pointers
uint64_t basePtr;
enum status_e {
// wavefront is stalled
S_STOPPED,
// wavefront is returning from a kernel
S_RETURNING,
// wavefront is running normally
S_RUNNING,
// wavefront is stalled
S_STALLED,
/**
* wavefront has unsatisfied wait counts
*
* while in this state the WF will only execute if
* the oldest instruction is the waitcnt. while in
* S_WAITCNT, the wavefront will not be ready until
* all of its waitcnts have been satisfied. the
* scoreboard ready() function will check the status
* of the waitcnts whenever the WF is in S_WAITCNT,
* and once they are satisfied, it will resume normal
* operation.
*/
S_WAITCNT
};
uint32_t oldBarrierCnt;
uint32_t barrierCnt;
uint32_t barrierId;
uint32_t barrierSlots;
status_e status;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
const int wfSlotId;
int kernId;
// SIMD unit where the WV has been scheduled
int simdId;
const int simdId;
// id of the execution unit (or pipeline) where the oldest instruction
// of the WF is scheduled
int execUnitId;
int flatLmUnitId;
int flatGmUnitId;
// pointer to parent CU
ComputeUnit *computeUnit;
int maxIbSize;
std::deque<GPUDynInstPtr> instructionBuffer;
bool pendingFetch;
bool dropFetch;
// last tick during which all WFs in the CU are not idle
Tick lastNonIdleTick;
// Condition Register State (for HSAIL simulations only)
class ConditionRegisterState *condRegState;
// number of single precision VGPRs required by WF
uint32_t maxSpVgprs;
// number of double precision VGPRs required by WF
uint32_t maxDpVgprs;
// map virtual to physical vector register
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Execution unit resource ID's associated with this WF
// These are static mappings set at WF slot construction and
// based off of the simdId and wfSlotId.
// Index to scalarALUs resource vector in CU
int scalarAlu;
// Indices into readyList/dispatchList of resources used by this
// wavefront
int scalarAluGlobalIdx;
int globalMem;
int localMem;
int scalarMem;
// number of VGPRs required by WF
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);
status_e getStatus() { return status; }
void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
bool isOldestInstALU();
bool isOldestInstVectorALU();
bool isOldestInstScalarALU();
bool isOldestInstScalarMem();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
std::vector<uint32_t> workItemId[3];
@@ -199,36 +154,44 @@ class Wavefront : public SimObject
/* the actual WG size can differ than the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
void computeActualWgSz(NDRange *ndr);
void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
// outstanding global+local memory requests
uint32_t outstandingReqs;
// memory requests between scoreboard
// and execute stage not yet executed
uint32_t memReqsInPipe;
// vector and scalar memory requests pending in memory system
int outstandingReqs;
// outstanding global memory write requests
uint32_t outstandingReqsWrGm;
int outstandingReqsWrGm;
// outstanding local memory write requests
uint32_t outstandingReqsWrLm;
int outstandingReqsWrLm;
// outstanding global memory read requests
uint32_t outstandingReqsRdGm;
int outstandingReqsRdGm;
// outstanding local memory read requests
uint32_t outstandingReqsRdLm;
uint32_t rdLmReqsInPipe;
uint32_t rdGmReqsInPipe;
uint32_t wrLmReqsInPipe;
uint32_t wrGmReqsInPipe;
int outstandingReqsRdLm;
// outstanding scalar memory read requests
int scalarOutstandingReqsRdGm;
// outstanding scalar memory write requests
int scalarOutstandingReqsWrGm;
int rdLmReqsInPipe;
int rdGmReqsInPipe;
int wrLmReqsInPipe;
int wrGmReqsInPipe;
int scalarRdGmReqsInPipe;
int scalarWrGmReqsInPipe;
int memTraceBusy;
uint64_t lastTrace;
// number of vector registers reserved by WF
// number of virtual vector registers reserved by WF
int reservedVectorRegs;
// number of virtual scalar registers reserved by WF
int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
// Index into the Scalar Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startSgprIndex;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
@@ -257,64 +220,63 @@ class Wavefront : public SimObject
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
// A pointer to the spill area
Addr spillBase;
// The size of the spill area
uint32_t spillSizePerItem;
// The vector width of the spill area
uint32_t spillWidth;
// A pointer to the private memory area
Addr privBase;
// The size of the private memory area
uint32_t privSizePerItem;
// A pointer ot the read-only memory area
Addr roBase;
// size of the read-only memory area
uint32_t roSize;
// pointer to buffer for storing kernel arguments
uint8_t *kernelArgs;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
// number of times instruction issue for this wavefront is blocked
// due to VRF port availability
Stats::Scalar numTimesBlockedDueVrfPortAvail;
// Wavefront slot stats
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encounterd by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// distribution of executed instructions based on their register
// operands; this is used to highlight the load on the VRF
Stats::Distribution srcRegOpDist;
Stats::Distribution dstRegOpDist;
// Functions to operate on call argument memory
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
initCallArgMem(int func_args_size_per_item, int wf_size)
{
callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
template<typename CType>
CType
readCallArgMem(int lane, int addr)
{
return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
}
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
template<typename CType>
void
writeCallArgMem(int lane, int addr, CType val)
{
callArgMem->setLaneAddr<CType>(lane, addr, val);
}
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
// context for save/restore
uint8_t *context;
typedef WavefrontParams Params;
Wavefront(const Params *p);
@@ -327,50 +289,31 @@ class Wavefront : public SimObject
computeUnit = cu;
}
void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
void updateResources();
int ready(itype_e type);
bool instructionBufferHasBranch();
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
VectorMask getPred() { return execMask() & initMask; }
bool waitingAtBarrier(int lane);
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
const VectorMask& exec_mask);
void popFromReconvergenceStack();
uint32_t pc() const;
uint32_t rpc() const;
VectorMask execMask() const;
Addr pc() const;
void pc(Addr new_pc);
VectorMask& execMask();
bool execMask(int lane) const;
void pc(uint32_t new_pc);
void discardFetch();
/**
* Returns the size of the static hardware context of a particular wavefront
 * This should be updated every time the context is changed
*/
uint32_t getStaticContextSize() const;
bool waitCntsSatisfied();
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();
/**
* Returns the hardware context as a stream of bytes
* This method is designed for HSAIL execution
*/
void getContext(const void *out);
/**
 * Sets the hardware context from a stream of bytes
* This method is designed for HSAIL execution
*/
void setContext(const void *in);
/** Freeing VRF space */
void freeRegisterFile();
TheGpuISA::GPUISA&
gpuISA()
@@ -380,14 +323,32 @@ class Wavefront : public SimObject
private:
TheGpuISA::GPUISA _gpuISA;
void reserveGmResource(GPUDynInstPtr ii);
void reserveLmResource(GPUDynInstPtr ii);
/**
* Stack containing Control Flow Graph nodes (i.e., kernel instructions)
* to be visited by the wavefront, and the associated execution masks. The
* reconvergence stack grows every time the wavefront reaches a divergence
* point (branch instruction), and shrinks every time the wavefront
* reaches a reconvergence point (immediate post-dominator instruction).
* the following are used for waitcnt instructions
* vmWaitCnt: once set, we wait for the oustanding
* number of vector mem instructions to be
* at, or below vmWaitCnt.
*
* expWaitCnt: once set, we wait for the outstanding
* number outstanding VM writes or EXP
* insts to be at, or below expWaitCnt.
*
* lgkmWaitCnt: once set, we wait for the oustanding
* number of LDS, GDS, scalar memory,
* and message instructions to be at, or
* below lgkmCount. we currently do not
* support GDS/message ops.
*/
std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
status_e status;
Addr _pc;
VectorMask _execMask;
};
#endif // __WAVEFRONT_HH__
#endif // __GPU_COMPUTE_WAVEFRONT_HH__

View File

@@ -86,6 +86,14 @@ MemCmd::commandInfo[] =
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
/* WriteCompleteResp - The WriteCompleteResp command is needed
* because in the GPU memory model we use a WriteResp to indicate
* that a write has reached the cache controller so we can free
 * resources at the coalescer. Later, when the write successfully
* completes we send a WriteCompleteResp to the CU so its wait
* counters can be updated. Wait counters in the CU is how memory
* dependences are handled in the GPU ISA. */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },

View File

@@ -83,6 +83,7 @@ class MemCmd
ReadRespWithInvalidate,
WriteReq,
WriteResp,
WriteCompleteResp,
WritebackDirty,
WritebackClean,
WriteClean, // writes dirty data below without evicting

View File

@@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (in_msg.segment == HSASegment:SPILL) {
trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
} else if (WB) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);

View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
// External declaration of the C++ GPUCoalescer object so SLICC-generated
// cache controllers can call back into it. The overloads with three
// Cycles parameters carry timing information; the trailing bool
// presumably flags a region access — confirm against GPUCoalescer.hh.
structure (GPUCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void evictionCallback(Addr);
  void recordCPReadCallBack(MachineID, MachineID);
  void recordCPWriteCallBack(MachineID, MachineID);
}
// External declaration of the C++ VIPERCoalescer, the GPU_VIPER-protocol
// specialization of GPUCoalescer. In addition to the read/write callback
// overloads, it exposes invCallback and wbCallback — presumably invoked
// on cache-invalidate and writeback completion respectively; confirm
// against VIPERCoalescer.hh.
structure (VIPERCoalescer, external = "yes") {
  void readCallback(Addr, DataBlock);
  void readCallback(Addr, MachineType, DataBlock);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles);
  void readCallback(Addr, MachineType, DataBlock,
                    Cycles, Cycles, Cycles, bool);
  void writeCallback(Addr, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles);
  void writeCallback(Addr, MachineType, DataBlock,
                     Cycles, Cycles, Cycles, bool);
  void invCallback(Addr);
  void wbCallback(Addr);
  void evictionCallback(Addr);
}

View File

@@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc";
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";

View File

@@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") {
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";

View File

@@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
//HSA scopes
enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
UNSPECIFIED, desc="Unspecified scope";
NOSCOPE, desc="Explictly unscoped";
WAVEFRONT, desc="Wavefront scope";
WORKGROUP, desc="Workgroup scope";
DEVICE, desc="Device scope";
SYSTEM, desc="System scope";
}
// HSA segment types
enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
GLOBAL, desc="Global segment";
GROUP, desc="Group segment";
PRIVATE, desc="Private segment";
KERNARG, desc="Kernarg segment";
READONLY, desc="Readonly segment";
SPILL, desc="Spill segment";
ARG, desc="Arg segment";
}
// TesterStatus
enumeration(TesterStatus, desc="...") {

View File

@@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") {
bool checkResourceAvailable(CacheResourceType, Addr);
}
structure (GPUCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void evictionCallback(Addr);
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}
structure (VIPERCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void evictionCallback(Addr);
}
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
HSAScope scope, desc="HSA scope";
HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}

View File

@@ -43,7 +43,6 @@
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"

View File

@@ -35,8 +35,6 @@
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"

View File

@@ -61,58 +61,6 @@
using namespace std;
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
return new GPUCoalescer(this);
}
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
HSAScope accessScope = HSAScope_UNSPECIFIED;
if (req->isScoped()) {
if (req->isWavefrontScope()) {
accessScope = HSAScope_WAVEFRONT;
} else if (req->isWorkgroupScope()) {
accessScope = HSAScope_WORKGROUP;
} else if (req->isDeviceScope()) {
accessScope = HSAScope_DEVICE;
} else if (req->isSystemScope()) {
accessScope = HSAScope_SYSTEM;
} else {
fatal("Bad scope type");
}
}
return accessScope;
}
HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
HSASegment accessSegment = HSASegment_GLOBAL;
if (req->isGlobalSegment()) {
accessSegment = HSASegment_GLOBAL;
} else if (req->isGroupSegment()) {
accessSegment = HSASegment_GROUP;
} else if (req->isPrivateSegment()) {
accessSegment = HSASegment_PRIVATE;
} else if (req->isKernargSegment()) {
accessSegment = HSASegment_KERNARG;
} else if (req->isReadonlySegment()) {
accessSegment = HSASegment_READONLY;
} else if (req->isSpillSegment()) {
accessSegment = HSASegment_SPILL;
} else if (req->isArgSegment()) {
accessSegment = HSASegment_ARG;
} else {
fatal("Bad segment type");
}
return accessSegment;
}
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
@@ -152,6 +100,7 @@ UncoalescedTable::updateResources()
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
instMap.erase(iter++);
coalescer->getGMTokenPort().sendTokens(1);
} else {
@@ -160,15 +109,27 @@ UncoalescedTable::updateResources()
}
}
bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
// iterate the instructions held in UncoalescedTable to see whether there
// are more requests to issue; if yes, not yet done; otherwise, done
for (auto& inst : instMap) {
DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
,inst.first, inst.second.size());
if (inst.first == instSeqNum) { return false; }
}
return true;
}
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
ss << "UncoalescedTable contains " << instMap.size()
<< " address entries." << std::endl;
ss << "Listing pending packets from " << instMap.size() << " instructions";
for (auto& inst : instMap) {
ss << "Addr 0x" << std::hex << inst.first << std::dec
<< " with " << inst.second.size() << " packets"
<< std::endl;
ss << "\tAddr: " << printAddress(inst.first) << " with "
<< inst.second.size() << " pending packets" << std::endl;
}
}
@@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p)
assert(m_dataCache_ptr);
m_runningGarnetStandalone = p->garnet_standalone;
assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
@@ -254,18 +214,9 @@ GPUCoalescer::wakeup()
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
std::stringstream ss;
printRequestTable(ss);
ss << "Outstanding requests: " << m_outstanding_count
<< std::endl;
panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x coalescedTable: %d "
"current time: %u issue_time: %d difference: %d\n"
"Request Tables:\n %s", m_version,
req->getFirstPkt()->getAddr(),
coalescedTable.size(), cyclesToTicks(current_time),
cyclesToTicks(req->getIssueTime()),
cyclesToTicks(current_time - req->getIssueTime()),
ss.str());
warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
m_version, ss.str());
panic("Aborting due to deadlock!\n");
}
}
}
@@ -283,21 +234,27 @@ GPUCoalescer::wakeup()
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
uncoalescedTable.printRequestTable(ss);
ss << "Printing out " << coalescedTable.size()
<< " outstanding requests in the coalesced table\n";
ss << "CoalescedTable contains " << coalescedTable.size()
<< " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
ss << "Addr 0x" << std::hex << requestList.first << std::dec
<< ": type-";
for (auto& request : requestList.second) {
ss << RubyRequestType_to_string(request->getRubyType())
<< " pkts-" << request->getPackets().size()
<< " issued-" << request->getIssueTime() << " seqNum-"
<< request->getSeqNum() << "; ";
ss << "\tAddr: " << printAddress(requestList.first) << "\n"
<< "\tInstruction sequence number: "
<< request->getSeqNum() << "\n"
<< "\t\tType: "
<< RubyRequestType_to_string(request->getRubyType()) << "\n"
<< "\t\tNumber of associated packets: "
<< request->getPackets().size() << "\n"
<< "\t\tIssue time: "
<< request->getIssueTime() * clockPeriod() << "\n"
<< "\t\tDifference from current tick: "
<< (curCycle() - request->getIssueTime()) * clockPeriod();
}
ss << std::endl;
}
// print out packets waiting to be issued in uncoalesced table
uncoalescedTable.printRequestTable(ss);
}
void
@@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address,
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);
// remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();
@@ -398,6 +356,36 @@ GPUCoalescer::writeCallback(Addr address,
}
}
/**
 * Called by the cache system when one Ruby request belonging to the
 * store instruction @p instSeqNum has fully completed in memory. Once
 * every issued request of the instruction has acked and no further
 * requests remain to be issued, the instruction's write-completion
 * response is sent back to the requesting CU.
 */
void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    // BUGFIX: the first format argument must be instSeqNum; previously
    // reqsAllIssued was passed here (and thus printed twice) while the
    // sequence number was never printed at all.
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores() - 1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write
        // completion callbacks for its issued Ruby requests, we can now
        // respond to the requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
@@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(request_address);
Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
RubyRequestType type = crequest->getRubyType();
@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
"%s\n",
RubyRequestType_to_string(type));
}
// If using the RubyTester, update the RubyTester sender state's
// subBlock with the recieved data. The tester will later access
// this state.
// Note: RubyPort will access it's sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>
(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
}
@@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
// Acquire and release packets will have been issued by
// makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}
@@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// Check for GPU Barrier Kernel End or Kernel Begin
// Leave these to be handled by the child class
// Kernel End/Barrier = isFlush + isRelease
// Kernel Begin = isFlush + isAcquire
if (pkt->req->isKernel()) {
if (pkt->req->isAcquire()){
// This is a Kernel Begin leave handling to
// virtual xCoalescer::makeRequest
return RequestStatus_Issued;
}else if (pkt->req->isRelease()) {
// This is a Kernel End leave handling to
// virtual xCoalescer::makeRequest
// If we are here then we didn't call
// a virtual version of this function
// so we will also schedule the callback
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
}
if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
!pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
(pkt->req->isRelease() || pkt->req->isAcquire())) {
if (assumingRfOCoherence) {
// If we reached here, this request must be a memFence
// and the protocol implements RfO, the coalescer can
// assume sequentially consistency and schedule the callback
// immediately.
// Currently the code implements fence callbacks
// by reusing the mechanism for kernel completions.
// This should be fixed.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
} else {
// If not RfO, return issued here and let the child coalescer
// take care of it.
return RequestStatus_Issued;
}
}
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
if (!issueEvent.scheduled())
schedule(issueEvent, curTick());
// TODO: issue hardware prefetches here
// we always return RequestStatus_Issued in this coalescer
// b/c the coalescer's resource was checked earlier and the coalescer is
// queueing up aliased requests in its coalesced table
return RequestStatus_Issued;
}
/**
* TODO: Figure out what do with this code. This code may go away
* and/or be merged into the VIPER coalescer once the VIPER
* protocol is re-integrated with GCN3 codes.
*/
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
@@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest)
}
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/
template <class KEY, class VALUE>
std::ostream &
@@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const
}
void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
DPRINTF(RubyStats, "Recorded statistic: %s\n",
SequencerRequestType_to_string(requestType));
}
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
// be counted as outstanding requests.
m_outstanding_count++;
// We track all issued or to-be-issued Ruby requests associated with
// write instructions. An instruction may have multiple Ruby
// requests.
if (pkt->cmd == MemCmd::WriteReq) {
DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
" the pending write instruction list\n", seqNum,
line_addr);
RubyPort::SenderState* ss =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
// we need to save this port because it will be used to call
// back the requesting CU when we receive write
// complete callbacks for all issued Ruby requests of this
// instruction.
RubyPort::MemSlavePort* mem_slave_port = ss->port;
GPUDynInstPtr gpuDynInst = nullptr;
if (!m_usingRubyTester) {
// If this coalescer is connected to a real CU, we need
// to save the corresponding gpu dynamic instruction.
// CU will use that instruction to decrement wait counters
// in the issuing wavefront.
// For Ruby tester, gpuDynInst == nullptr
ComputeUnit::DataPort::SenderState* cu_state =
safe_cast<ComputeUnit::DataPort::SenderState*>
(ss->predecessor);
gpuDynInst = cu_state->_gpuDynInst;
}
PendingWriteInst& inst = pendingWriteInsts[seqNum];
inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
}
return true;
}
@@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address,
}
}
// Classify where a command-processor (CP) load was serviced and bump the
// matching stat counter: own TCP hit, peer-TCP transfer, TCC hit, or miss.
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        // Serviced by this machine's own TCP.
        CP_TCPLdHits++;
        return;
    }

    const MachineType sender_type = machineIDToMachineType(senderMachID);
    if (sender_type == MachineType_TCP) {
        // Supplied by a peer TCP (cache-to-cache transfer).
        CP_TCPLdTransfers++;
    } else if (sender_type == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        // Anything else counts as a miss in the GPU cache hierarchy.
        CP_LdMiss++;
    }
}
// Classify where a command-processor (CP) store was serviced and bump the
// matching stat counter: own TCP hit, peer-TCP transfer, TCC hit, or miss.
void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        // Serviced by this machine's own TCP.
        CP_TCPStHits++;
        return;
    }

    const MachineType sender_type = machineIDToMachineType(senderMachID);
    if (sender_type == MachineType_TCP) {
        // Supplied by a peer TCP (cache-to-cache transfer).
        CP_TCPStTransfers++;
    } else if (sender_type == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        // Anything else counts as a miss in the GPU cache hierarchy.
        CP_StMiss++;
    }
}
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
@@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
Cycles firstResponseTime,
bool success, bool isRegion)
{
RubyRequestType type = crequest->getRubyType();
Cycles issued_time = crequest->getIssueTime();
Cycles completion_time = curCycle();
assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;
// cache stats (valid for RfO protocol only)
if (mach == MachineType_TCP) {
if (type == RubyRequestType_LD) {
GPU_TCPLdHits++;
} else {
GPU_TCPStHits++;
}
} else if (mach == MachineType_L1Cache_wCC) {
if (type == RubyRequestType_LD) {
GPU_TCPLdTransfers++;
} else {
GPU_TCPStTransfers++;
}
} else if (mach == MachineType_TCC) {
if (type == RubyRequestType_LD) {
GPU_TCCLdHits++;
} else {
GPU_TCCStHits++;
}
} else {
if (type == RubyRequestType_LD) {
GPU_LdMiss++;
} else {
GPU_StMiss++;
}
}
// Profile all access latency, even zero latency accesses
m_latencyHist.sample(total_lat);
m_typeLatencyHist[type]->sample(total_lat);
// Profile the miss latency for all non-zero demand misses
if (total_lat != Cycles(0)) {
m_missLatencyHist.sample(total_lat);
m_missTypeLatencyHist[type]->sample(total_lat);
if (mach != MachineType_NUM) {
m_missMachLatencyHist[mach]->sample(total_lat);
m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
if ((issued_time <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completion_time)) {
m_IssueToInitialDelayHist[mach]->sample(
initialRequestTime - issued_time);
m_InitialToForwardDelayHist[mach]->sample(
forwardRequestTime - initialRequestTime);
m_ForwardToFirstResponseDelayHist[mach]->sample(
firstResponseTime - forwardRequestTime);
m_FirstResponseToCompletionDelayHist[mach]->sample(
completion_time - firstResponseTime);
}
}
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
curTick(), m_version, "Coal",
success ? "Done" : "SC_Failed", "", "",
printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
void
@@ -1085,74 +962,4 @@ GPUCoalescer::regStats()
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
// GPU cache stats
GPU_TCPLdHits
.name(name() + ".gpu_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
GPU_TCPLdTransfers
.name(name() + ".gpu_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
GPU_TCCLdHits
.name(name() + ".gpu_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
GPU_LdMiss
.name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU")
;
GPU_TCPStHits
.name(name() + ".gpu_tcp_st_hits")
.desc("stores that hit in the TCP")
;
GPU_TCPStTransfers
.name(name() + ".gpu_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
GPU_TCCStHits
.name(name() + ".gpu_tcc_st_hits")
.desc("stores that hit in the TCC")
;
GPU_StMiss
.name(name() + ".gpu_st_misses")
.desc("stores that miss in the GPU")
;
// CP cache stats
CP_TCPLdHits
.name(name() + ".cp_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
CP_TCPLdTransfers
.name(name() + ".cp_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
CP_TCCLdHits
.name(name() + ".cp_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
CP_LdMiss
.name(name() + ".cp_ld_misses")
.desc("loads that miss in the GPU")
;
CP_TCPStHits
.name(name() + ".cp_tcp_st_hits")
.desc("stores that hit in the TCP")
;
CP_TCPStTransfers
.name(name() + ".cp_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
CP_TCCStHits
.name(name() + ".cp_tcc_st_hits")
.desc("stores that hit in the TCC")
;
CP_StMiss
.name(name() + ".cp_st_misses")
.desc("stores that miss in the GPU")
;
}

View File

@@ -38,11 +38,11 @@
#include <unordered_map>
#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -57,9 +57,6 @@ class CacheMemory;
class RubyGPUCoalescerParams;
HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
@@ -78,6 +75,7 @@ class UncoalescedTable
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
bool areRequestsDone(const uint64_t instSeqNum);
// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
@@ -120,6 +118,86 @@ class CoalescedRequest
std::vector<PacketPtr> pkts;
};
// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    // Register one more outstanding Ruby request for this instruction.
    // Note: originalPort (and, outside the Ruby tester, gpuDynInstPtr)
    // is overwritten on every call, so the values kept are those of the
    // most recent caller; only one port/inst pair is needed to ack the CU.
    void
    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }
        numPendingStores++;
    }

    // return true if no more ack is expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        // directly return the comparison rather than "? true : false"
        return numPendingStores == 0;
    }

    // ack the original requester that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        // NOTE(review): the packet (and its SenderState, when attached) is
        // heap-allocated here; ownership is presumed to transfer to the
        // receiver of sendTimingResp -- confirm the CU/tester frees it.
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requester
        originalPort->sendTimingResp(pkt);
    }

    // number of stores still awaiting a writeCompleteCallback
    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need
    // only 1 of the ports to call back the CU. Therefore, here we keep
    // track the port that sent the first packet of this instruction.
    RubyPort::MemSlavePort* originalPort;
    // similar to the originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};
class GPUCoalescer : public RubyPort
{
public:
@@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort
void collateStats();
void regStats() override;
// each store request needs two callbacks:
// (1) writeCallback is called when the store is received and processed
// by TCP. This writeCallback does not guarantee the store is actually
// completed at its destination cache or memory. writeCallback helps
// release hardware resources (e.g., its entry in coalescedTable)
// allocated for the store so that subsequent requests will not be
// blocked unnecessarily due to hardware resource constraints.
// (2) writeCompleteCallback is called when the store is fully completed
// at its destination cache or memory. writeCompleteCallback
// guarantees that the store is fully completed. This callback
// will decrement hardware counters in CU
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
@@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime);
void writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach);
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
@@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
/* atomics need their own callback because the data
might be const coming from SLICC */
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
// Alternate implementations in VIPER Coalescer
virtual RequestStatus makeRequest(PacketPtr pkt) override;
RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
bool
@@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Alternate implementations in VIPER Coalescer
virtual void issueRequest(CoalescedRequest* crequest);
void kernelCallback(int wavfront_id);
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
@@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper issueEvent;
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
Cycles m_deadlock_threshold;
@@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort
// an address, the are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
// a map btw an instruction sequence number and PendingWriteInst
// this is used to do a final call back for each write when it is
// completely done in the memory system
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
// m5 style stats for TCP hit/miss counts
Stats::Scalar GPU_TCPLdHits;
Stats::Scalar GPU_TCPLdTransfers;
Stats::Scalar GPU_TCCLdHits;
Stats::Scalar GPU_LdMiss;
Stats::Scalar GPU_TCPStHits;
Stats::Scalar GPU_TCPStTransfers;
Stats::Scalar GPU_TCCStHits;
Stats::Scalar GPU_StMiss;
Stats::Scalar CP_TCPLdHits;
Stats::Scalar CP_TCPLdTransfers;
Stats::Scalar CP_TCCLdHits;
Stats::Scalar CP_LdMiss;
Stats::Scalar CP_TCPStHits;
Stats::Scalar CP_TCPStTransfers;
Stats::Scalar CP_TCCStHits;
Stats::Scalar CP_StMiss;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// // m5 style stats for TCP hit/miss counts
// Stats::Scalar GPU_TCPLdHits;
// Stats::Scalar GPU_TCPLdTransfers;
// Stats::Scalar GPU_TCCLdHits;
// Stats::Scalar GPU_LdMiss;
//
// Stats::Scalar GPU_TCPStHits;
// Stats::Scalar GPU_TCPStTransfers;
// Stats::Scalar GPU_TCCStHits;
// Stats::Scalar GPU_StMiss;
//
// Stats::Scalar CP_TCPLdHits;
// Stats::Scalar CP_TCPLdTransfers;
// Stats::Scalar CP_TCCLdHits;
// Stats::Scalar CP_LdMiss;
//
// Stats::Scalar CP_TCPStHits;
// Stats::Scalar CP_TCPStTransfers;
// Stats::Scalar CP_TCCStHits;
// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// Stats::Distribution numHopDelays;
// Stats::Distribution tcpToTccDelay;
// Stats::Distribution tccToSdDelay;
// Stats::Distribution sdToSdDelay;
// Stats::Distribution sdToTccDelay;
// Stats::Distribution tccToTcpDelay;
//
// Stats::Average avgTcpToTcc;
// Stats::Average avgTccToSd;
// Stats::Average avgSdToSd;
// Stats::Average avgSdToTcc;
// Stats::Average avgTccToTcp;
private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data

View File

@@ -36,6 +36,7 @@ from m5.objects.Sequencer import *
class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"
@@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort):
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
"Ownership coherence");
icache = Param.RubyCache("")
dcache = Param.RubyCache("")

View File

@@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer
VIPERCoalescer(const Params *);
~VIPERCoalescer();
void issueMemSyncRequest(PacketPtr pkt);
void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);

View File

@@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer):
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
assume_rfo = False