This is the version for MI300. For the most part, it is the same as MI200 with the exception of architected flat scratch (not yet implemented in gem5) and therefore a new version enum is required. Change-Id: Id18cd7b57c4eebd467c010a3f61e3117beb8d58a
313 lines
10 KiB
Python
313 lines
10 KiB
Python
# Copyright (c) 2021 Advanced Micro Devices, Inc.
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright notice,
|
|
# this list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
#
|
|
# 3. Neither the name of the copyright holder nor the names of its
|
|
# contributors may be used to endorse or promote products derived from this
|
|
# software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
# System includes
|
|
import argparse
|
|
import hashlib
|
|
import math
|
|
|
|
# gem5 related
|
|
import m5
|
|
from m5.objects import *
|
|
from m5.util import addToPath
|
|
|
|
# gem5 options and objects
|
|
addToPath("../../")
|
|
from amd import AmdGPUOptions
|
|
from common import (
|
|
GPUTLBConfig,
|
|
GPUTLBOptions,
|
|
ObjectList,
|
|
Options,
|
|
Simulation,
|
|
)
|
|
from ruby import Ruby
|
|
|
|
# GPU FS related
|
|
from system.system import makeGpuFSSystem
|
|
|
|
|
|
def addRunFSOptions(parser):
    """
    Register the options specific to GPU full-system runs on ``parser``.

    ``parser`` only needs to provide ``add_argument``; the main block of this
    script passes an ``argparse.ArgumentParser``. The parser is modified in
    place and nothing is returned.
    """
    # Guest software: script, disks, kernel and checkpoint restore.
    parser.add_argument(
        "--script",
        default=None,
        help="Script to execute in the simulated system",
    )
    parser.add_argument(
        "--host-parallel",
        default=False,
        action="store_true",
        help="Run multiple host threads in KVM mode",
    )
    parser.add_argument(
        "--restore-dir",
        type=str,
        default=None,
        help="Directory to restore checkpoints from",
    )
    parser.add_argument(
        "--disk-image",
        default="",
        help="The boot disk image to mount (/dev/sda)",
    )
    parser.add_argument(
        "--second-disk",
        default=None,
        help="The second disk image to mount (/dev/sdb)",
    )
    parser.add_argument("--kernel", default=None, help="Linux kernel to boot")

    # GPU firmware / driver bring-up inputs.
    parser.add_argument("--gpu-rom", default=None, help="GPU BIOS to load")
    parser.add_argument(
        "--gpu-mmio-trace", default=None, help="GPU MMIO trace to load"
    )
    parser.add_argument(
        "--checkpoint-before-mmios",
        default=False,
        action="store_true",
        help="Take a checkpoint before driver sends MMIOs. "
        "This is used to switch out of KVM mode and into "
        "timing mode required to read the VGA ROM on boot.",
    )

    # Interconnect topologies for the CPU and GPU sides.
    parser.add_argument(
        "--cpu-topology",
        type=str,
        default="Crossbar",
        help="Network topology to use for CPU side. "
        "Check configs/topologies for complete set",
    )
    parser.add_argument(
        "--gpu-topology",
        type=str,
        default="Crossbar",
        help="Network topology to use for GPU side. "
        "Check configs/topologies for complete set",
    )

    # dGPU memory configuration.
    parser.add_argument(
        "--dgpu-mem-size",
        action="store",
        type=str,
        default="16GB",
        help="Specify the dGPU physical memory size",
    )
    parser.add_argument(
        "--dgpu-num-dirs",
        type=int,
        default=1,
        # The closing parenthesis was missing from this help text.
        help="Set the number of dGPU directories (memory controllers)",
    )
    parser.add_argument(
        "--dgpu-mem-type",
        default="HBM_1000_4H_1x128",
        choices=ObjectList.mem_list.get_names(),
        help="type of memory to use",
    )

    # These are the models that are both supported in gem5 and supported
    # by the versions of ROCm supported by gem5 in full system mode. For
    # other gfx versions there is some support in syscall emulation mode.
    parser.add_argument(
        "--gpu-device",
        default="Vega10",
        choices=["Vega10", "MI100", "MI200", "MI300X"],
        help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 "
        "(gfx90a), or MI300X (gfx942).",
    )

    # Task / kernel based debugging and early-exit controls. A value of -1
    # never matches the non-negative task and kernel counters kept by
    # runGpuFSSystem, so it leaves the corresponding feature off.
    parser.add_argument(
        "--debug-at-gpu-task",
        type=int,
        default=-1,
        help="Turn on debug flags starting with this task (counting both blit"
        " and non-blit kernels)",
    )

    parser.add_argument(
        "--exit-at-gpu-task",
        type=int,
        default=-1,
        help="Exit simulation after running this many tasks (counting both "
        "blit and non-blit kernels)",
    )

    parser.add_argument(
        "--exit-after-gpu-kernel",
        type=int,
        default=-1,
        help="Exit simulation after completing this (non-blit) kernel",
    )

    parser.add_argument(
        "--skip-until-gpu-kernel",
        type=int,
        default=0,
        help="Skip (non-blit) kernels until reaching this kernel. Note that "
        "this can impact correctness (the skipped kernels are completely "
        "skipped, not fast forwarded)",
    )

    parser.add_argument(
        "--root-partition",
        type=str,
        default="/dev/sda1",
        help="Root partition of disk image",
    )

    parser.add_argument(
        "--disable-avx",
        action="store_true",
        default=False,
        help="Disables AVX. AVX is used in some ROCm libraries but "
        "does not have checkpointing support yet. If simulation either "
        "creates a checkpoint or restores from one, then AVX needs to "
        "be disabled for correct functionality ",
    )

    parser.add_argument(
        "--no-kvm-perf",
        default=False,
        action="store_true",
        help="Disable KVM perf counters (use this with LSF / ETX)",
    )
|
|
|
|
|
|
def runGpuFSSystem(args):
    """
    Build, instantiate and run a GPU full-system simulation.

    This function can be called by higher level scripts designed to simulate
    specific devices. As a result the scripts typically hard code some args
    that should not be changed by the user.

    ``args`` is the parsed option namespace. It is modified in place:
    ``num_sqc`` and ``num_scalar_cache`` are always overwritten, and
    ``host_parallel`` is overwritten when a KVM CPU type is selected.

    The simulation is driven in a loop: each exit event returned by
    ``m5.simulate`` is classified by its cause string, the task and kernel
    counters are updated, and simulation resumes until a terminating cause or
    one of the exit-at/exit-after options is hit. Nothing is returned.
    """

    # GPUFS is primarily designed to use the X86 KVM CPU. This model needs to
    # use multiple event queues when more than one CPU is simulated. Force it
    # on if that is the case.
    if ObjectList.is_kvm_cpu(ObjectList.cpu_list.get(args.cpu_type)):
        args.host_parallel = args.num_cpus > 1

    # These are used by the protocols. They should not be set by the user.
    n_cu = args.num_compute_units
    args.num_sqc = int(math.ceil(float(n_cu) / args.cu_per_sqc))
    args.num_scalar_cache = int(
        math.ceil(float(n_cu) / args.cu_per_scalar_cache)
    )

    # Verify MMIO trace is valid. This is only needed for Vega10 simulations.
    # The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
    # the gem5-resources repository. By checking it here, we avoid potential
    # errors that would cause the driver not to load and simulations to fail.
    if args.gpu_device == "Vega10":
        # Without this check a missing option reaches open(None) and fails
        # with a TypeError that does not mention the option at all.
        if args.gpu_mmio_trace is None:
            m5.util.fatal(
                "--gpu-mmio-trace must be given when simulating Vega10"
            )
        # Use a context manager so the trace file is closed once hashed.
        with open(args.gpu_mmio_trace, "rb") as mmio_file:
            mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
        if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
            m5.util.panic("MMIO file does not match gem5 resources")

    system = makeGpuFSSystem(args)

    root = Root(
        full_system=True,
        system=system,
        time_sync_enable=True,
        time_sync_period="1000us",
    )

    if args.host_parallel:
        root.sim_quantum = int(1e8)

    if args.script is not None:
        system.readfile = args.script

    if args.restore_dir is None:
        m5.instantiate()
    else:
        m5.instantiate(args.restore_dir)

    print("Running the simulation")
    sim_ticks = args.abs_max_tick
    kernels_completed = 0
    tasks_completed = 0
    # When tracing should only start at a given task, keep it off until the
    # task counter reaches that value in the loop below.
    if args.debug_at_gpu_task != -1:
        m5.trace.disable()

    exit_event = m5.simulate(sim_ticks)

    # Keep executing while there is something to do
    while True:
        # Read the cause once per event; every branch below inspects it.
        cause = exit_event.getCause()

        if cause in (
            "m5_exit instruction encountered",
            "user interrupt received",
            "simulate() limit reached",
        ):
            break
        elif "checkpoint" in cause:
            assert args.checkpoint_dir is not None
            m5.checkpoint(args.checkpoint_dir)
            break
        elif "GPU Kernel Completed" in cause:
            # Compared before the increment, so kernels_completed is the
            # zero-based index of the kernel that just finished.
            if kernels_completed == args.exit_after_gpu_kernel:
                print(f"Exiting after GPU kernel {kernels_completed}")
                break
            kernels_completed += 1
            tasks_completed += 1
        elif "GPU Blit Kernel Completed" in cause:
            # Blit kernels count as tasks but not as kernels.
            tasks_completed += 1
        elif "Skipping GPU Kernel" in cause:
            print(f"Skipping GPU kernel {kernels_completed}")
            kernels_completed += 1
            tasks_completed += 1
        else:
            print(f"Unknown exit event: {cause}. Continuing...")

        # These checks use the counters as updated for the current event.
        if tasks_completed == args.debug_at_gpu_task:
            print(f"Enabling debug flags @ GPU task {tasks_completed}")
            m5.trace.enable()
        if tasks_completed == args.exit_at_gpu_task:
            print(f"Exiting @ GPU task {tasks_completed}")
            break

        # Resume for whatever remains of the absolute tick budget.
        exit_event = m5.simulate(sim_ticks - m5.curTick())

    print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}")
|
|
|
|
|
|
if __name__ == "__m5_main__":
    # Build one parser carrying every option group this script understands.
    # The groups are registered in this order: gpufs, common, ruby, amdgpu,
    # and gpu tlb args.
    parser = argparse.ArgumentParser()
    option_registrars = (
        addRunFSOptions,
        Options.addCommonOptions,
        Ruby.define_options,
        AmdGPUOptions.addAmdGPUOptions,
        GPUTLBOptions.tlb_options,
    )
    for register_options in option_registrars:
        register_options(parser)

    args = parser.parse_args()

    runGpuFSSystem(args)
|