diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 0ff80d8e14..4e9c75fd6c 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -32,7 +32,7 @@ from __future__ import print_function from __future__ import absolute_import -import optparse, os, re +import optparse, os, re, getpass import math import glob import inspect @@ -49,6 +49,9 @@ from common import Options from common import Simulation from common import GPUTLBOptions, GPUTLBConfig +import hsaTopology +from common import FileSystemConfig + ########################## Script Options ######################## def setOption(parser, opt_str, value = 1): # check to make sure the option actually exists @@ -77,9 +80,7 @@ Options.addSEOptions(parser) parser.add_option("--cpu-only-mode", action="store_true", default=False, help="APU mode. Used to take care of problems in "\ "Ruby.py while running APU protocols") -parser.add_option("-k", "--kernel-files", - help="file(s) containing GPU kernel code (colon separated)") -parser.add_option("-u", "--num-compute-units", type="int", default=1, +parser.add_option("-u", "--num-compute-units", type="int", default=4, help="number of GPU compute units"), parser.add_option("--num-cp", type="int", default=0, help="Number of GPU Command Processors (CP)") @@ -89,21 +90,34 @@ parser.add_option("--benchmark-root", help="Root of benchmark directory tree") # it an option/knob parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs" \ "sharing an SQC (icache, and thus icache TLB)") +parser.add_option('--cu-per-scalar-cache', type='int', default=4, + help='Number of CUs sharing a scalar cache') parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \ "per CU") +parser.add_option('--cu-per-sa', type='int', default=4, + help='Number of CUs per shader array. This must be a ' + 'multiple of options.cu-per-sqc and options.cu-per-scalar') +parser.add_option('--sa-per-complex', type='int', default=1, + help='Number of shader arrays per complex') +parser.add_option('--num-gpu-complexes', type='int', default=1, + help='Number of GPU complexes') parser.add_option("--wf-size", type="int", default=64, help="Wavefront size(in workitems)") parser.add_option("--sp-bypass-path-length", type="int", default=4, \ - help="Number of stages of bypass path in vector ALU for Single Precision ops") + help="Number of stages of bypass path in vector ALU for " + "Single Precision ops") parser.add_option("--dp-bypass-path-length", type="int", default=4, \ - help="Number of stages of bypass path in vector ALU for Double Precision ops") + help="Number of stages of bypass path in vector ALU for " + "Double Precision ops") # issue period per SIMD unit: number of cycles before issuing another vector parser.add_option("--issue-period", type="int", default=4, \ help="Number of cycles per vector instruction issue period") parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \ - help="VGPR to Coalescer (Global Memory) data bus width in bytes") + help="VGPR to Coalescer (Global Memory) data bus width " + "in bytes") parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \ - help="Coalescer to VGPR (Global Memory) data bus width in bytes") + help="Coalescer to VGPR (Global Memory) data bus width in " + "bytes") # Currently we only support 1 local memory pipe parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \ help="Number of Shared Memory pipelines per CU") @@ -113,13 +127,27 @@ parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \ parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \ "WF slots per SIMD") +parser.add_option("--registerManagerPolicy", type="string", default="static", + help="Register manager policy") parser.add_option("--vreg-file-size", type="int", default=2048, help="number of physical vector registers per SIMD") +parser.add_option("--vreg-min-alloc", type="int", default=4, + help="Minimum number of registers that can be allocated " + "from the VRF. The total number of registers will be " + "aligned to this value.") + +parser.add_option("--sreg-file-size", type="int", default=2048, + help="number of physical vector registers per SIMD") +parser.add_option("--sreg-min-alloc", type="int", default=4, + help="Minimum number of registers that can be allocated " + "from the SRF. The total number of registers will be " + "aligned to this value.") + parser.add_option("--bw-scalor", type="int", default=0, help="bandwidth scalor for scalability analysis") parser.add_option("--CPUClock", type="string", default="2GHz", help="CPU clock") -parser.add_option("--GPUClock", type="string", default="1GHz", +parser.add_option("--gpu-clock", type="string", default="1GHz", help="GPU clock") parser.add_option("--cpu-voltage", action="store", type="string", default='1.0V', @@ -129,8 +157,6 @@ parser.add_option("--gpu-voltage", action="store", type="string", help = """CPU voltage domain""") parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST", help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)") -parser.add_option("--xact-cas-mode", action="store_true", - help="enable load_compare mode (transactional CAS)") parser.add_option("--SegFaultDebug",action="store_true", help="checks for GPU seg fault before TLB access") parser.add_option("--FunctionalTLB",action="store_true", @@ -152,9 +178,8 @@ parser.add_option('--fast-forward-pseudo-op', action='store_true', help = 'fast forward using kvm until the m5_switchcpu' ' pseudo-op is encountered, then switch cpus. subsequent' ' m5_switchcpu pseudo-ops will toggle back and forth') -parser.add_option('--outOfOrderDataDelivery', action='store_true', - default=False, help='enable OoO data delivery in the GM' - ' pipeline') +parser.add_option("--num-hw-queues", type="int", default=10, + help="number of hw queues in packet processor") Ruby.define_options(parser) @@ -195,12 +220,17 @@ assert(options.num_compute_units >= 1) n_cu = options.num_compute_units num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc)) options.num_sqc = num_sqc # pass this to Ruby +num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache)) +options.num_scalar_cache = num_scalar_cache + +print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache, + 'Num CU = ', n_cu) ########################## Creating the GPU system ######################## # shader is the GPU shader = Shader(n_wf = options.wfs_per_simd, clk_domain = SrcClockDomain( - clock = options.GPUClock, + clock = options.gpu_clock, voltage_domain = VoltageDomain( voltage = options.gpu_voltage))) @@ -237,9 +267,11 @@ compute_units = [] for i in range(n_cu): compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane, num_SIMDs = options.simds_per_cu, - wfSize = options.wf_size, - spbypass_pipe_length = options.sp_bypass_path_length, - dpbypass_pipe_length = options.dp_bypass_path_length, + wf_size = options.wf_size, + spbypass_pipe_length = \ + options.sp_bypass_path_length, + dpbypass_pipe_length = \ + options.dp_bypass_path_length, issue_period = options.issue_period, coalescer_to_vrf_bus_width = \ options.glbmem_rd_bus_width, @@ -251,7 +283,6 @@ for i in range(n_cu): options.shr_mem_pipes_per_cu, n_wf = options.wfs_per_simd, execPolicy = options.CUExecPolicy, - xactCasMode = options.xact_cas_mode, debugSegFault = options.SegFaultDebug, functionalTLB = options.FunctionalTLB, localMemBarrier = options.LocalMemBarrier, @@ -259,19 +290,38 @@ for i in range(n_cu): localDataStore = \ LdsState(banks = options.numLdsBanks, bankConflictPenalty = \ - options.ldsBankConflictPenalty), - out_of_order_data_delivery = - options.outOfOrderDataDelivery)) + options.ldsBankConflictPenalty))) wavefronts = [] vrfs = [] - for j in range(options.simds_per_cu): - for k in range(shader.n_wf): + vrf_pool_mgrs = [] + srfs = [] + srf_pool_mgrs = [] + for j in xrange(options.simds_per_cu): + for k in xrange(shader.n_wf): wavefronts.append(Wavefront(simdId = j, wf_slot_id = k, - wfSize = options.wf_size)) - vrfs.append(VectorRegisterFile(simd_id=j, - num_regs_per_simd=options.vreg_file_size)) + wf_size = options.wf_size)) + vrf_pool_mgrs.append(SimplePoolManager(pool_size = \ + options.vreg_file_size, + min_alloc = \ + options.vreg_min_alloc)) + + vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size, + num_regs=options.vreg_file_size)) + + srf_pool_mgrs.append(SimplePoolManager(pool_size = \ + options.sreg_file_size, + min_alloc = \ + options.vreg_min_alloc)) + srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size, + num_regs=options.sreg_file_size)) + compute_units[-1].wavefronts = wavefronts compute_units[-1].vector_register_file = vrfs + compute_units[-1].scalar_register_file = srfs + compute_units[-1].register_manager = \ + RegisterManager(policy=options.registerManagerPolicy, + vrf_pool_managers=vrf_pool_mgrs, + srf_pool_managers=srf_pool_mgrs) if options.TLB_prefetch: compute_units[-1].prefetch_depth = options.TLB_prefetch compute_units[-1].prefetch_prev_type = options.pf_type @@ -358,14 +408,29 @@ for i in range(options.num_cpus): else: cpu_list.append(cpu) -########################## Creating the GPU dispatcher ######################## -# Dispatcher dispatches work from host CPU to GPU host_cpu = cpu_list[0] -dispatcher = GpuDispatcher() -########################## Create and assign the workload ######################## -# Check for rel_path in elements of base_list using test, returning -# the first full path that satisfies test +hsapp_gpu_map_vaddr = 0x200000000 +hsapp_gpu_map_size = 0x1000 +hsapp_gpu_map_paddr = int(Addr(options.mem_size)) + +# HSA kernel mode driver +gpu_driver = GPUComputeDriver(filename="kfd") + +# Creating the GPU kernel launching components: that is the HSA +# packet processor (HSAPP), GPU command processor (CP), and the +# dispatcher. +gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr, + numHWQueues=options.num_hw_queues) +dispatcher = GPUDispatcher() +gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, + dispatcher=dispatcher) +gpu_driver.device = gpu_cmd_proc +shader.dispatcher = dispatcher +shader.gpu_cmd_proc = gpu_cmd_proc + +# Create and assign the workload Check for rel_path in elements of +# base_list using test, returning the first full path that satisfies test def find_path(base_list, rel_path, test): for base in base_list: if not base: @@ -380,31 +445,31 @@ def find_file(base_list, rel_path): return find_path(base_list, rel_path, os.path.isfile) executable = find_path(benchmark_path, options.cmd, os.path.exists) -# it's common for a benchmark to be in a directory with the same +# It's common for a benchmark to be in a directory with the same # name as the executable, so we handle that automatically if os.path.isdir(executable): benchmark_path = [executable] executable = find_file(benchmark_path, options.cmd) -if options.kernel_files: - kernel_files = [find_file(benchmark_path, f) - for f in options.kernel_files.split(':')] -else: - # if kernel_files is not set, see if there's a unique .asm file - # in the same directory as the executable - kernel_path = os.path.dirname(executable) - kernel_files = glob.glob(os.path.join(kernel_path, '*.asm')) - if kernel_files: - print("Using GPU kernel code file(s)", ",".join(kernel_files)) - else: - fatal("Can't locate kernel code (.asm) in " + kernel_path) -# OpenCL driver -driver = ClDriver(filename="hsa", codefile=kernel_files) +if options.env: + with open(options.env, 'r') as f: + env = [line.rstrip() for line in f] +else: + env = ['LD_LIBRARY_PATH=%s' % ':'.join([ + "/proj/radl_tools/rocm-1.6/lib", + "/proj/radl_tools/rocm-1.6/hcc/lib64", + "/tool/pandora64/.package/libunwind-1.1/lib", + "/tool/pandora64/.package/gcc-6.4.0/lib64" + ]), + "HSA_ENABLE_INTERRUPT=0"] + +process = Process(executable = executable, cmd = [options.cmd] + + options.options.split(), drivers = [gpu_driver], env = env) + for cpu in cpu_list: cpu.createThreads() - cpu.workload = Process(executable = executable, - cmd = [options.cmd] + options.options.split(), - drivers = [driver]) + cpu.workload = process + for cp in cp_list: cp.workload = host_cpu.workload @@ -419,10 +484,8 @@ if fast_forward: switch_cpu_list = \ [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)] -# Full list of processing cores in the system. Note that -# dispatcher is also added to cpu_list although it is -# not a processing element -cpu_list = cpu_list + [shader] + cp_list + [dispatcher] +# Full list of processing cores in the system. +cpu_list = cpu_list + [shader] + cp_list # creating the overall system # notice the cpu list is explicitly added as a parameter to System @@ -452,9 +515,15 @@ GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) # create Ruby system system.piobus = IOXBar(width=32, response_latency=0, frontend_latency=0, forward_latency=0) -Ruby.create_system(options, None, system) +dma_list = [gpu_hsapp, gpu_cmd_proc] +Ruby.create_system(options, None, system, None, dma_list, None) system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock, voltage_domain = system.voltage_domain) +gpu_cmd_proc.pio = system.piobus.master +gpu_hsapp.pio = system.piobus.master + +for i, dma_device in enumerate(dma_list): + exec('system.dma_cntrl%d.clk_domain = system.ruby.clk_domain' % i) # attach the CPU ports to Ruby for i in range(options.num_cpus): @@ -484,7 +553,8 @@ for i in range(options.num_cpus): # per compute unit and one sequencer per SQC for the math to work out # correctly. gpu_port_idx = len(system.ruby._cpu_ports) \ - - options.num_compute_units - options.num_sqc + - options.num_compute_units - options.num_sqc \ + - options.num_scalar_cache gpu_port_idx = gpu_port_idx - options.num_cp * 2 wavefront_size = options.wf_size @@ -504,6 +574,14 @@ for i in range(n_cu): system.ruby._cpu_ports[gpu_port_idx].slave gpu_port_idx = gpu_port_idx + 1 +for i in xrange(n_cu): + if i > 0 and not i % options.cu_per_scalar_cache: + print("incrementing idx on ", i) + gpu_port_idx += 1 + system.cpu[shader_idx].CUs[i].scalar_port = \ + system.ruby._cpu_ports[gpu_port_idx].slave +gpu_port_idx = gpu_port_idx + 1 + # attach CP ports to Ruby for i in range(options.num_cp): system.cpu[cp_idx].createInterruptController() @@ -516,11 +594,7 @@ for i in range(options.num_cp): system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master cp_idx = cp_idx + 1 -# connect dispatcher to the system.piobus -dispatcher.pio = system.piobus.master -dispatcher.dma = system.piobus.slave - -################# Connect the CPU and GPU via GPU Dispatcher ################### +################# Connect the CPU and GPU via GPU Dispatcher ################## # CPU rings the GPU doorbell to notify a pending task # using this interface. # And GPU uses this interface to notify the CPU of task completion @@ -530,16 +604,28 @@ dispatcher.dma = system.piobus.slave # parameters must be after the explicit setting of the System cpu list if fast_forward: shader.cpu_pointer = future_cpu_list[0] - dispatcher.cpu = future_cpu_list[0] else: shader.cpu_pointer = host_cpu - dispatcher.cpu = host_cpu -dispatcher.shader_pointer = shader -dispatcher.cl_driver = driver ########################## Start simulation ######################## +chroot = os.path.expanduser(options.chroot) +redirect_paths = [RedirectPath(src = "/proc", + dests = ["%s/fs/proc" % m5.options.outdir]), + RedirectPath(src = "/sys", + dests = ["%s/fs/sys" % m5.options.outdir]), + RedirectPath(src = "/tmp", + dests = ["%s/fs/tmp" % m5.options.outdir]), + RedirectPath(src = "/dev/shm", + dests = ["/dev/shm/%s/gem5_%s" % + (getpass.getuser(), os.getpid())])] + +system.redirect_paths = redirect_paths + root = Root(system=system, full_system=False) + +hsaTopology.createHsaTopology(options) + m5.ticks.setGlobalFrequency('1THz') if options.abs_max_tick: maxtick = options.abs_max_tick @@ -587,4 +673,7 @@ elif options.fast_forward_pseudo_op: print("Ticks:", m5.curTick()) print('Exiting because ', exit_event.getCause()) + +FileSystemConfig.cleanup_filesystem(options) + sys.exit(exit_event.getCode()) diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py new file mode 100644 index 0000000000..df24223454 --- /dev/null +++ b/configs/example/hsaTopology.py @@ -0,0 +1,105 @@ +# Copyright (c) 2018 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import m5 + +import operator +from os import mkdir, makedirs, getpid, listdir, fsync +from os.path import join as joinpath +from os.path import isdir +from shutil import rmtree, copyfile + +def file_append(path, contents): + with open(joinpath(*path), 'a') as f: + f.write(str(contents)) + f.flush() + fsync(f.fileno()) + +def remake_dir(path): + if isdir(path): + rmtree(path) + makedirs(path) + +def createHsaTopology(options): + topology_dir = joinpath(m5.options.outdir, \ + 'fs/sys/devices/virtual/kfd/kfd/topology') + remake_dir(topology_dir) + + # Ripped from real Kaveri platform to appease kmt version checks + # Set up generation_id + file_append((topology_dir, 'generation_id'), 1) + + # Set up system properties + sys_prop = 'platform_oem 2314885673410447169\n' + \ + 'platform_id 35322352389441\n' + \ + 'platform_rev 1\n' + file_append((topology_dir, 'system_properties'), sys_prop) + + # Populate the topology tree + # TODO: Just the bare minimum to pass for now + node_dir = joinpath(topology_dir, 'nodes/0') + remake_dir(node_dir) + + # must show valid kaveri gpu id or massive meltdown + file_append((node_dir, 'gpu_id'), 2765) + + # must have marketing name + file_append((node_dir, 'name'), 'Carrizo\n') + + # populate global node properties + # NOTE: SIMD count triggers a valid GPU agent creation + # TODO: Really need to parse these from options + node_prop = 'cpu_cores_count %s\n' % options.num_cpus + \ + 'simd_count 32\n' + \ + 'mem_banks_count 0\n' + \ + 'caches_count 0\n' + \ + 'io_links_count 0\n' + \ + 'cpu_core_id_base 16\n' + \ + 'simd_id_base 2147483648\n' + \ + 'max_waves_per_simd 40\n' + \ + 'lds_size_in_kb 64\n' + \ + 'gds_size_in_kb 0\n' + \ + 'wave_front_size 64\n' + \ + 'array_count 1\n' + \ + 'simd_arrays_per_engine 1\n' + \ + 'cu_per_simd_array 10\n' + \ + 'simd_per_cu 4\n' + \ + 'max_slots_scratch_cu 32\n' + \ + 'vendor_id 4098\n' + \ + 'device_id 39028\n' + \ + 'location_id 8\n' + \ + 'max_engine_clk_fcompute 800\n' + \ + 'local_mem_size 0\n' + \ + 'fw_version 699\n' + \ + 'capability 4738\n' + \ + 'max_engine_clk_ccompute 2100\n' + + file_append((node_dir, 'properties'), node_prop) diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index 58572ae193..92dcf5e52f 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -476,6 +476,15 @@ def create_system(options, full_system, system, dma_devices, bootmem, dir_cntrl.requestToMemory = MessageBuffer() dir_cntrl.responseFromMemory = MessageBuffer() + dir_cntrl.requestFromDMA = MessageBuffer(ordered=True) + dir_cntrl.requestFromDMA.slave = ruby_system.network.master + + dir_cntrl.responseToDMA = MessageBuffer() + dir_cntrl.responseToDMA.master = ruby_system.network.slave + + dir_cntrl.requestToMemory = MessageBuffer() + dir_cntrl.responseFromMemory = MessageBuffer() + exec("ruby_system.dir_cntrl%d = dir_cntrl" % i) dir_cntrl_nodes.append(dir_cntrl) @@ -640,7 +649,29 @@ def create_system(options, full_system, system, dma_devices, bootmem, # SQC also in GPU cluster gpuCluster.add(sqc_cntrl) - for i in range(options.num_cp): + for i in xrange(options.num_scalar_cache): + scalar_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits) + scalar_cntrl.create(options, ruby_system, system) + + exec('ruby_system.scalar_cntrl%d = scalar_cntrl' % i) + + cpu_sequencers.append(scalar_cntrl.sequencer) + + scalar_cntrl.requestFromSQC = MessageBuffer(ordered = True) + scalar_cntrl.requestFromSQC.master = ruby_system.network.slave + + scalar_cntrl.probeToSQC = MessageBuffer(ordered = True) + scalar_cntrl.probeToSQC.slave = ruby_system.network.master + + scalar_cntrl.responseToSQC = MessageBuffer(ordered = True) + scalar_cntrl.responseToSQC.slave = ruby_system.network.master + + scalar_cntrl.mandatoryQueue = \ + MessageBuffer(buffer_size=options.buffers_size) + + gpuCluster.add(scalar_cntrl) + + for i in xrange(options.num_cp): tcp_ID = options.num_compute_units + i sqc_ID = options.num_sqc + i @@ -735,13 +766,27 @@ def create_system(options, full_system, system, dma_devices, bootmem, # TCC cntrls added to the GPU cluster gpuCluster.add(tcc_cntrl) - # Assuming no DMA devices - assert(len(dma_devices) == 0) + for i, dma_device in enumerate(dma_devices): + dma_seq = DMASequencer(version=i, ruby_system=ruby_system) + dma_cntrl = DMA_Controller(version=i, dma_sequencer=dma_seq, + ruby_system=ruby_system) + exec('system.dma_cntrl%d = dma_cntrl' % i) + if dma_device.type == 'MemTest': + exec('system.dma_cntrl%d.dma_sequencer.slave = dma_devices.test' + % i) + else: + exec('system.dma_cntrl%d.dma_sequencer.slave = dma_device.dma' % i) + dma_cntrl.requestToDir = MessageBuffer(buffer_size=0) + dma_cntrl.requestToDir.master = ruby_system.network.slave + dma_cntrl.responseFromDir = MessageBuffer(buffer_size=0) + dma_cntrl.responseFromDir.slave = ruby_system.network.master + dma_cntrl.mandatoryQueue = MessageBuffer(buffer_size = 0) + gpuCluster.add(dma_cntrl) # Add cpu/gpu clusters to main cluster mainCluster.add(cpuCluster) mainCluster.add(gpuCluster) - ruby_system.network.number_of_virtual_networks = 10 + ruby_system.network.number_of_virtual_networks = 11 return (cpu_sequencers, dir_cntrl_nodes, mainCluster) diff --git a/src/mem/request.hh b/src/mem/request.hh index 718d5fa243..7b324dc9f8 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -815,7 +815,6 @@ class Request bool isCondSwap() const { return _flags.isSet(MEM_SWAP_COND); } bool isSecure() const { return _flags.isSet(SECURE); } bool isPTWalk() const { return _flags.isSet(PT_WALK); } - bool isAcquire() const { return _flags.isSet(ACQUIRE); } bool isRelease() const { return _flags.isSet(RELEASE); } bool isKernel() const { return _flags.isSet(KERNEL); } bool isAtomicReturn() const { return _flags.isSet(ATOMIC_RETURN_OP); } @@ -839,6 +838,8 @@ class Request bool isToPOC() const { return _flags.isSet(DST_POC); } Flags getDest() const { return _flags & DST_BITS; } + bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); } + /** * Accessor functions for the memory space configuration flags and used by * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index 1885a68f75..84e86dcaf4 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -239,8 +239,6 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { Entry cache_entry := getCacheEntry(in_msg.LineAddress); TBE tbe := TBEs.lookup(in_msg.LineAddress); - - assert(in_msg.Type == RubyRequestType:IFETCH); trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); } }