This commit contains the rest of the base 2 vs base 10 cache/memory size clarifications. It also changes the warning message to use warn(). With these changes, the warning should no longer appear during a fresh compilation of gem5.

Change-Id: Ia63f841bdf045b76473437f41548fab27dc19631
# Copyright (c) 2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from common import (
    GPUTLBConfig,
    ObjectList,
    Simulation,
)
from common.Benchmarks import *
from common.FSConfig import *
from example.gpufs.Disjoint_VIPER import *
from ruby import Ruby
from system.amdgpu import *

from m5.util import panic


def makeGpuFSSystem(args):
    # Boot options are standard gem5 options plus:
    # - Framebuffer device emulation set to 0 to reduce driver code paths.
    # - Blacklist amdgpu as it cannot (currently) load in a KVM CPU.
    # - Blacklist psmouse as the amdgpu driver adds proprietary commands that
    #   cause gem5 to panic.
    boot_options = [
        "earlyprintk=ttyS0",
        "console=ttyS0,9600",
        "lpj=7999923",
        f"root={args.root_partition}",
        "drm_kms_helper.fbdev_emulation=0",
        "modprobe.blacklist=amdgpu",
        "modprobe.blacklist=psmouse",
    ]
    cmdline = " ".join(boot_options)

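    # "2GiB" below is a binary (base 2) size: 2 * 2**30 bytes.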
    if MemorySize(args.mem_size) < MemorySize("2GiB"):
        panic("Need at least 2GiB of system memory to load amdgpu module")

    # Use the common FSConfig to setup a Linux X86 System
    (TestCPUClass, test_mem_mode) = Simulation.getCPUClass(args.cpu_type)
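    # With Ruby, atomic accesses must bypass the modeled caches and go
    # directly to memory, hence atomic_noncaching.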
    if test_mem_mode == "atomic":
        test_mem_mode = "atomic_noncaching"
    disks = [args.disk_image]
    if args.second_disk is not None:
        disks.extend([args.second_disk])
    bm = SysConfig(disks=disks, mem=args.mem_size)
    system = makeLinuxX86System(
        test_mem_mode, args.num_cpus, bm, True, cmdline=cmdline
    )
    system.workload.object_file = binary(args.kernel)

    # Set the cache line size for the entire system.
    system.cache_line_size = args.cacheline_size

    # Create a top-level voltage and clock domain.
    system.voltage_domain = VoltageDomain(voltage=args.sys_voltage)
    system.clk_domain = SrcClockDomain(
        clock=args.sys_clock, voltage_domain=system.voltage_domain
    )

    # Create a CPU voltage and clock domain.
    system.cpu_voltage_domain = VoltageDomain()
    system.cpu_clk_domain = SrcClockDomain(
        clock=args.cpu_clock, voltage_domain=system.cpu_voltage_domain
    )

    # Setup VGA ROM region
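    # 0xC0000 is the legacy VGA option ROM base address; "128KiB" is a
    # binary (base 2) size, i.e. 0x20000 bytes.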
    system.shadow_rom_ranges = [AddrRange(0xC0000, size=Addr("128KiB"))]

    # Create specified number of CPUs. GPUFS really only needs one.
    system.cpu = [
        TestCPUClass(clk_domain=system.cpu_clk_domain, cpu_id=i)
        for i in range(args.num_cpus)
    ]
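    # KVM-based CPUs additionally need a KvmVM object to manage the host
    # virtual machine.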
    if ObjectList.is_kvm_cpu(TestCPUClass):
        system.kvm_vm = KvmVM()

    # Create AMDGPU and attach to southbridge
    shader = createGPU(system, args)
    connectGPU(system, args)

    # The shader is appended after the CPU cores, so its index is num_cpus.
    shader_idx = args.num_cpus
    system.cpu.append(shader)

    # This arbitrary address is something in the X86 I/O hole
    hsapp_gpu_map_paddr = 0xE0000000
    hsapp_pt_walker = VegaPagetableWalker()
    gpu_hsapp = HSAPacketProcessor(
        pioAddr=hsapp_gpu_map_paddr,
        numHWQueues=args.num_hw_queues,
        walker=hsapp_pt_walker,
    )
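    # Request kernel exit events from the dispatcher when either the
    # exit_at_gpu_task or exit_after_gpu_kernel option is set (>= 0).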
    dispatcher_exit_events = False
    if args.exit_at_gpu_task > -1:
        dispatcher_exit_events = True
    if args.exit_after_gpu_kernel > -1:
        dispatcher_exit_events = True
    dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events)
    cp_pt_walker = VegaPagetableWalker()
    target_kernel = args.skip_until_gpu_kernel
    gpu_cmd_proc = GPUCommandProcessor(
        hsapp=gpu_hsapp,
        dispatcher=dispatcher,
        walker=cp_pt_walker,
        target_non_blit_kernel_id=target_kernel,
    )
    shader.dispatcher = dispatcher
    shader.gpu_cmd_proc = gpu_cmd_proc

    system.pc.south_bridge.gpu.cp = gpu_cmd_proc

    # GPU Interrupt Handler
    device_ih = AMDGPUInterruptHandler()
    system.pc.south_bridge.gpu.device_ih = device_ih

    # Setup the SDMA engines depending on device. The MMIO base addresses
    # can be found in the driver code under:
    # include/asic_reg/sdmaX/sdmaX_Y_Z_offset.h
    num_sdmas = 2
    sdma_bases = []
    sdma_sizes = []
    if args.gpu_device == "Vega10":
        num_sdmas = 2
        sdma_bases = [0x4980, 0x5180]
        sdma_sizes = [0x800] * 2
    elif args.gpu_device == "MI100":
        num_sdmas = 8
        sdma_bases = [
            0x4980,
            0x6180,
            0x78000,
            0x79000,
            0x7A000,
            0x7B000,
            0x7C000,
            0x7D000,
        ]
        sdma_sizes = [0x1000] * 8
    elif args.gpu_device == "MI200" or args.gpu_device == "MI300X":
        num_sdmas = 5
        sdma_bases = [
            0x4980,
            0x6180,
            0x78000,
            0x79000,
            0x7A000,
        ]
        sdma_sizes = [0x1000] * 5
    else:
        panic(f"Unknown GPU device {args.gpu_device}")

    sdma_pt_walkers = []
    sdma_engines = []
    for sdma_idx in range(num_sdmas):
        sdma_pt_walker = VegaPagetableWalker()
        sdma_engine = SDMAEngine(
            walker=sdma_pt_walker,
            mmio_base=sdma_bases[sdma_idx],
            mmio_size=sdma_sizes[sdma_idx],
        )
        sdma_pt_walkers.append(sdma_pt_walker)
        sdma_engines.append(sdma_engine)

    system.pc.south_bridge.gpu.sdmas = sdma_engines

    # Setup PM4 packet processors
    pm4_procs = []
    pm4_procs.append(
        PM4PacketProcessor(
            ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
        )
    )

    system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs

    # GPU data path
    gpu_mem_mgr = AMDGPUMemoryManager()
    system.pc.south_bridge.gpu.memory_manager = gpu_mem_mgr

    # CPU data path (SystemHub)
    system_hub = AMDGPUSystemHub()
    shader.system_hub = system_hub

    # GPU, HSAPP, and GPUCommandProc are DMA devices
    system._dma_ports.append(gpu_hsapp)
    system._dma_ports.append(gpu_cmd_proc)
    system._dma_ports.append(system.pc.south_bridge.gpu)
    for sdma in sdma_engines:
        system._dma_ports.append(sdma)
    system._dma_ports.append(device_ih)
    for pm4_proc in pm4_procs:
        system._dma_ports.append(pm4_proc)
    system._dma_ports.append(system_hub)
    system._dma_ports.append(gpu_mem_mgr)
    system._dma_ports.append(hsapp_pt_walker)
    system._dma_ports.append(cp_pt_walker)
    for sdma_pt_walker in sdma_pt_walkers:
        system._dma_ports.append(sdma_pt_walker)

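    # Connect each device's PIO (MMIO) port to the IO bus.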
    gpu_hsapp.pio = system.iobus.mem_side_ports
    gpu_cmd_proc.pio = system.iobus.mem_side_ports
    system.pc.south_bridge.gpu.pio = system.iobus.mem_side_ports
    for sdma in sdma_engines:
        sdma.pio = system.iobus.mem_side_ports
    device_ih.pio = system.iobus.mem_side_ports
    for pm4_proc in pm4_procs:
        pm4_proc.pio = system.iobus.mem_side_ports
    system_hub.pio = system.iobus.mem_side_ports

    # Full system needs special TLBs for SQC, Scalar, and vector data ports
    args.full_system = True
    GPUTLBConfig.config_tlb_hierarchy(
        args, system, shader_idx, system.pc.south_bridge.gpu, True
    )

    # Create Ruby system using disjoint VIPER topology
    system.ruby = Disjoint_VIPER()
    system.ruby.create(args, system, system.iobus, system._dma_ports)

    # Create a separate clock domain for Ruby
    system.ruby.clk_domain = SrcClockDomain(
        clock=args.ruby_clock, voltage_domain=system.voltage_domain
    )

    # If we are using a KVM CPU, enable AVX. AVX is used in some ROCm
    # libraries, such as rocBLAS, which are in turn used by higher-level
    # libraries like PyTorch.
    use_avx = False
    if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx:
        # AVX also requires CR4.osxsave to be 1. These must be set together
        # or KVM will error out.
        system.workload.enable_osxsave = 1
        use_avx = True

    # These values are taken from a real CPU and are further explained here:
    # https://sandpile.org/x86/cpuid.htm#level_0000_000Dh
    avx_extended_state = [
        0x00000007,
        0x00000340,
        0x00000000,
        0x00000340,
        0x00000000,
        0x00000340,
        0x00000000,
        0x00000000,
        0x00000100,
        0x00000240,
        0x00000000,
        0x00000040,
        0x00000000,
        0x00000000,
        0x00000000,
        0x00000000,
    ]

    # This modifies the default value for ECX only (4th in this array).
    # See: https://sandpile.org/x86/cpuid.htm#level_0000_0001h
    # Enables AVX, OSXSAVE, XSAVE, POPCNT, SSE4.2, SSE4.1, CMPXCHG16B,
    # and FMA.
    avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C803209]

    for i, cpu in enumerate(system.cpu):
        # Break once we reach the shader "CPU"
        if i == args.num_cpus:
            break

        #
        # Tie the cpu ports to the correct ruby system ports
        #
        cpu.clk_domain = system.cpu_clk_domain
        cpu.createThreads()
        cpu.createInterruptController()

        system.ruby._cpu_ports[i].connectCpuPorts(cpu)

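        # Advertise an AMD vendor string to the guest and, when AVX is
        # enabled, expose it through the CPUID extended state and feature
        # bits.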
        for j in range(len(system.cpu[i].isa)):
            system.cpu[i].isa[j].vendor_string = "AuthenticAMD"
            if use_avx:
                system.cpu[i].isa[j].ExtendedState = avx_extended_state
                system.cpu[i].isa[j].FamilyModelStepping = avx_cpu_features

    if args.host_parallel:
        # To get the KVM CPUs to run on different host CPUs, specify a
        # different event queue for each CPU. The last CPU is a GPU
        # shader and should be skipped.
        for i, cpu in enumerate(system.cpu[:-1]):
            for obj in cpu.descendants():
                obj.eventq_index = 0
            cpu.eventq_index = i + 1

    # Only enable KVM perf counters if explicitly set, as this is more
    # restrictive.
    if ObjectList.is_kvm_cpu(TestCPUClass):
        if args.kvm_perf:
            for i, cpu in enumerate(system.cpu[:-1]):
                cpu.usePerf = True
        else:
            for i, cpu in enumerate(system.cpu[:-1]):
                cpu.usePerf = False

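    # The GPU sequencers follow the CPU sequencers in _cpu_ports: one per CU
    # (TCP), SQC, and scalar cache, plus two per command processor. Back up
    # past them to find the index of the first CU port.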
    gpu_port_idx = (
        len(system.ruby._cpu_ports)
        - args.num_compute_units
        - args.num_sqc
        - args.num_scalar_cache
    )
    gpu_port_idx = gpu_port_idx - args.num_cp * 2

    # Connect token ports. For this we need to search through the list of all
    # sequencers, since the TCP coalescers will not necessarily be first. Only
    # TCP coalescers use a token port for back pressure.
    token_port_idx = 0
    for i in range(len(system.ruby._cpu_ports)):
        if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
            system.cpu[shader_idx].CUs[
                token_port_idx
            ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
            token_port_idx += 1

    wavefront_size = args.wf_size
    for i in range(args.num_compute_units):
        # The pipeline issues wavefront_size number of uncoalesced requests
        # in one GPU issue cycle. Hence wavefront_size mem ports.
        for j in range(wavefront_size):
            system.cpu[shader_idx].CUs[i].memory_port[
                j
            ] = system.ruby._cpu_ports[gpu_port_idx].in_ports[j]
        gpu_port_idx += 1

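    # Each SQC (instruction cache) sequencer is shared by cu_per_sqc CUs, so
    # advance to the next SQC port only at sharing boundaries.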
    for i in range(args.num_compute_units):
        if i > 0 and not i % args.cu_per_sqc:
            gpu_port_idx += 1
        system.cpu[shader_idx].CUs[i].sqc_port = system.ruby._cpu_ports[
            gpu_port_idx
        ].in_ports
    gpu_port_idx = gpu_port_idx + 1

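    # Scalar caches are likewise shared by groups of cu_per_scalar_cache CUs.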
    for i in range(args.num_compute_units):
        if i > 0 and not i % args.cu_per_scalar_cache:
            gpu_port_idx += 1
        system.cpu[shader_idx].CUs[i].scalar_port = system.ruby._cpu_ports[
            gpu_port_idx
        ].in_ports
    gpu_port_idx = gpu_port_idx + 1

    return system