Files
gem5/configs/example/gpufs/Disjoint_VIPER.py
Michael Boyer ba2f5615ba gpu-compute: Support cache line sizes >64B in GPUFS (#939)
This change fixes two issues:

1) The --cacheline_size option was setting the system cache line size
but not the Ruby cache line size, and the mismatch was causing assertion
failures.

2) The submitDispatchPkt() function accesses the kernel object in
chunks, with the chunk size equal to the cache line size. For cache line
sizes >64B (e.g. 128B), the kernel object is not guaranteed to be
aligned to a cache line and it was possible for a chunk to be partially
contained in two separate device memories, causing the memory access to
fail.

Change-Id: I8e45146901943e9c2750d32162c0f35c851e09e1

Co-authored-by: Michael Boyer <Michael.Boyer@amd.com>
2024-03-20 11:09:25 -07:00

204 lines
8.1 KiB
Python

# Copyright (c) 2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from example.gpufs.DisjointNetwork import *
from ruby import Ruby
from ruby.GPU_VIPER import *
from m5.defines import buildEnv
from m5.objects import *
from m5.util import fatal
class DummySystem:
    """Minimal stand-in for a System object.

    Carries only the two attributes that the Ruby memory-setup helpers
    read: a (initially empty) list of memory controllers and the memory
    address ranges to configure.
    """

    def __init__(self, mem_ranges):
        # Record the address ranges to model; controllers are filled in
        # later by whoever consumes this dummy system.
        self.mem_ranges = mem_ranges
        self.mem_ctrls = []
class Disjoint_VIPER(RubySystem):
    """Ruby system with disjoint CPU and GPU interconnect networks.

    CPU-side controllers (directories, CorePairs, CPU DMA devices) are
    attached to one network and GPU-side controllers (TCPs, SQCs, scalar
    caches, TCCs, GPU DMA devices, GPU directories) to a separate one,
    modeling a dGPU reachable only through its own fabric.
    """

    def __init__(self):
        # This topology only makes sense for the GPU_VIPER protocol;
        # bail out early for any other build.
        if buildEnv["PROTOCOL"] != "GPU_VIPER":
            fatal("This ruby config only supports the GPU_VIPER protocol")
        super().__init__()

    def create(self, options, system, piobus, dma_devices):
        """Construct and wire both Ruby networks and all controllers.

        :param options: parsed command-line options (network type, cache
            line size, CPU count, backing-store flag, ...)
        :param system: the System object being configured
        :param piobus: IO crossbar; acts as the default target for
            unknown address ranges (e.g. PCIe) from CPU sequencers
        :param dma_devices: DMA-capable devices to attach; GPU-type
            devices are routed onto the GPU network
        """
        # Disjoint network topology: build two independent networks of
        # the requested flavor (Garnet or simple).
        if "garnet" in options.network:
            self.network_cpu = DisjointGarnet(self)
            self.network_gpu = DisjointGarnet(self)
        else:
            self.network_cpu = DisjointSimple(self)
            self.network_gpu = DisjointSimple(self)

        # Keep the Ruby block size in sync with the system cache line
        # size; a mismatch causes assertion failures (see commit notes).
        self.block_size_bytes = options.cacheline_size

        # Construct CPU controllers
        cpu_dir_nodes = construct_dirs(options, system, self, self.network_cpu)
        (cp_sequencers, cp_cntrl_nodes) = construct_corepairs(
            options, system, self, self.network_cpu
        )

        # Construct GPU controllers
        (tcp_sequencers, tcp_cntrl_nodes) = construct_tcps(
            options, system, self, self.network_gpu
        )
        (sqc_sequencers, sqc_cntrl_nodes) = construct_sqcs(
            options, system, self, self.network_gpu
        )
        (scalar_sequencers, scalar_cntrl_nodes) = construct_scalars(
            options, system, self, self.network_gpu
        )
        tcc_cntrl_nodes = construct_tccs(
            options, system, self, self.network_gpu
        )

        # Construct CPU memories
        Ruby.setup_memory_controllers(system, self, cpu_dir_nodes, options)

        # Construct GPU memories
        (gpu_dir_nodes, gpu_mem_ctrls) = construct_gpudirs(
            options, system, self, self.network_gpu
        )

        # Configure the directories based on which network they are in
        for cpu_dir_node in cpu_dir_nodes:
            cpu_dir_node.CPUonly = True
            cpu_dir_node.GPUonly = False
        for gpu_dir_node in gpu_dir_nodes:
            gpu_dir_node.CPUonly = False
            gpu_dir_node.GPUonly = True

        # Set access backing store if specified
        if options.access_backing_store:
            self.access_backing_store = True

        # Assign the memory controllers to the system: CPU DRAM hangs
        # off the system, GPU DRAM off the GPU device behind the south
        # bridge.
        cpu_abstract_mems = []
        for mem_ctrl in system.mem_ctrls:
            cpu_abstract_mems.append(mem_ctrl.dram)
        system.memories = cpu_abstract_mems

        gpu_abstract_mems = []
        for mem_ctrl in gpu_mem_ctrls:
            gpu_abstract_mems.append(mem_ctrl.dram)
        system.pc.south_bridge.gpu.memories = gpu_abstract_mems

        # Setup DMA controllers. Devices of these types belong to the
        # GPU and are therefore attached to the GPU network.
        gpu_dma_types = ["VegaPagetableWalker", "AMDGPUMemoryManager"]
        cpu_dma_ctrls = []
        gpu_dma_ctrls = []
        dma_cntrls = []
        for i, dma_device in enumerate(dma_devices):
            dma_seq = DMASequencer(version=i, ruby_system=self)
            dma_cntrl = DMA_Controller(
                version=i, dma_sequencer=dma_seq, ruby_system=self
            )

            # Handle inconsistently named ports on various DMA devices:
            if not hasattr(dma_device, "type"):
                # IDE doesn't have a .type but seems like everything else does.
                dma_seq.in_ports = dma_device
            elif dma_device.type in gpu_dma_types:
                dma_seq.in_ports = dma_device.port
            else:
                dma_seq.in_ports = dma_device.dma

            # Wire the controller's message buffers to the GPU network
            # for GPU-type devices, otherwise to the CPU network.
            if (
                hasattr(dma_device, "type")
                and dma_device.type in gpu_dma_types
            ):
                dma_cntrl.requestToDir = MessageBuffer(buffer_size=0)
                dma_cntrl.requestToDir.out_port = self.network_gpu.in_port
                dma_cntrl.responseFromDir = MessageBuffer(buffer_size=0)
                dma_cntrl.responseFromDir.in_port = self.network_gpu.out_port
                dma_cntrl.mandatoryQueue = MessageBuffer(buffer_size=0)
                gpu_dma_ctrls.append(dma_cntrl)
            else:
                dma_cntrl.requestToDir = MessageBuffer(buffer_size=0)
                dma_cntrl.requestToDir.out_port = self.network_cpu.in_port
                dma_cntrl.responseFromDir = MessageBuffer(buffer_size=0)
                dma_cntrl.responseFromDir.in_port = self.network_cpu.out_port
                dma_cntrl.mandatoryQueue = MessageBuffer(buffer_size=0)
                cpu_dma_ctrls.append(dma_cntrl)

            dma_cntrls.append(dma_cntrl)

        system.dma_cntrls = dma_cntrls

        # Collect CPU and GPU controllers into separate lists
        cpu_cntrls = cpu_dir_nodes + cp_cntrl_nodes + cpu_dma_ctrls
        gpu_cntrls = (
            tcp_cntrl_nodes
            + sqc_cntrl_nodes
            + scalar_cntrl_nodes
            + tcc_cntrl_nodes
            + gpu_dma_ctrls
            + gpu_dir_nodes
        )

        # Setup number of vnets (must match on the system and both
        # networks).
        self.number_of_virtual_networks = 11
        self.network_cpu.number_of_virtual_networks = 11
        self.network_gpu.number_of_virtual_networks = 11

        # Set up the disjoint topology
        self.network_cpu.connectCPU(options, cpu_cntrls)
        self.network_gpu.connectGPU(options, gpu_cntrls)

        # Create port proxy for connecting system port. System port is used
        # for loading from outside guest, e.g., binaries like vmlinux.
        system.sys_port_proxy = RubyPortProxy(ruby_system=self)
        system.sys_port_proxy.pio_request_port = piobus.cpu_side_ports
        system.system_port = system.sys_port_proxy.in_ports

        # Only CPU sequencers connect to PIO bus. This acts as the "default"
        # destination for unknown address ranges. PCIe requests fall under
        # this category.
        for i in range(len(cp_sequencers)):
            cp_sequencers[i].pio_request_port = piobus.cpu_side_ports
            cp_sequencers[i].mem_request_port = piobus.cpu_side_ports

            # The CorePairs in MOESI_AMD_Base round up when constructing
            # sequencers, but if the CPU does not exit there would be no
            # sequencer to send a range change, leading to assert.
            if i < options.num_cpus:
                cp_sequencers[i].pio_response_port = piobus.mem_side_ports

        # Setup ruby port. Both CPU and GPU are actually connected here.
        all_sequencers = (
            cp_sequencers + tcp_sequencers + sqc_sequencers + scalar_sequencers
        )
        self._cpu_ports = all_sequencers
        self.num_of_sequencers = len(all_sequencers)