From 2105dc47a90fa5aee07f38722dc327e932a89387 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 28 Jun 2023 16:44:37 -0700 Subject: [PATCH] stdlib: Add viper board, viper cache, and gpu components Adds GPU_VIPER protocol related caches to stdlib components: CorePair cache, TCP, SQC, TCC, Directory, and DMA controllers. Adds GPU related components in a new components/devices/gpus/ directory. Adds prebuilt GPU and CPU cache hierarchies, GPU and CPU network classes, and a board overriding the X86Board to provide helper methods for disk image root, the complex kernel parameter list, and a method to provide functionality to the current GPUFS scripts to load in applications and handle loading the GPU driver. The new GPU components can be used as follows: - Create a GPU device *before* the CPU cache hierarchy is created. - Add the GPU's CPU-side DMA controllers to the list of CPU cache controllers. - Use GPU device method to connect to an AbstractBoard. Each GPU component has its own RubySystem, PCI device ID, and address ranges for VBIOS and legacy PCI BARs. Therefore, in theory, multiple GPUs can be created. This requires PR #1453. An example of using this board is added to configs/example/gem5_library under x86-mi300x-gpu.py. It is designed to work with the disk image, kernel, and applications provided in the gem5-resources repository. 
Change-Id: Ie65ffcfee5e311d9492de935d6d0631260645cd3 --- .../example/gem5_library/x86-mi300x-gpu.py | 155 +++++++ src/python/SConscript | 32 ++ .../ruby/caches/viper/__init__.py | 0 .../ruby/caches/viper/corepair_cache.py | 124 ++++++ .../ruby/caches/viper/directory.py | 106 +++++ .../ruby/caches/viper/dma_controller.py | 68 ++++ .../cachehierarchies/ruby/caches/viper/sqc.py | 73 ++++ .../cachehierarchies/ruby/caches/viper/tcc.py | 87 ++++ .../cachehierarchies/ruby/caches/viper/tcp.py | 79 ++++ .../gem5/components/devices/__init__.py | 0 .../gem5/components/devices/gpus/__init__.py | 0 .../gem5/components/devices/gpus/amdgpu.py | 279 +++++++++++++ .../components/devices/gpus/viper_shader.py | 377 ++++++++++++++++++ src/python/gem5/prebuilt/viper/__init__.py | 0 src/python/gem5/prebuilt/viper/board.py | 123 ++++++ .../prebuilt/viper/cpu_cache_hierarchy.py | 273 +++++++++++++ .../prebuilt/viper/gpu_cache_hierarchy.py | 351 ++++++++++++++++ .../gem5/prebuilt/viper/viper_network.py | 165 ++++++++ 18 files changed, 2292 insertions(+) create mode 100644 configs/example/gem5_library/x86-mi300x-gpu.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/__init__.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/directory.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/dma_controller.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/sqc.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcc.py create mode 100644 src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcp.py create mode 100644 src/python/gem5/components/devices/__init__.py create mode 100644 src/python/gem5/components/devices/gpus/__init__.py create mode 100644 src/python/gem5/components/devices/gpus/amdgpu.py create mode 100644 
src/python/gem5/components/devices/gpus/viper_shader.py create mode 100644 src/python/gem5/prebuilt/viper/__init__.py create mode 100644 src/python/gem5/prebuilt/viper/board.py create mode 100644 src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py create mode 100644 src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py create mode 100644 src/python/gem5/prebuilt/viper/viper_network.py diff --git a/configs/example/gem5_library/x86-mi300x-gpu.py b/configs/example/gem5_library/x86-mi300x-gpu.py new file mode 100644 index 0000000000..20fa99b9d8 --- /dev/null +++ b/configs/example/gem5_library/x86-mi300x-gpu.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
"""
Script to run a full system GPU simulation.

Usage:
------

```
scons build/VEGA_X86/gem5.opt
./build/VEGA_X86/gem5.opt
    configs/example/gem5_library/x86-mi300x-gpu.py
    --image <path to disk image>
    --kernel <path to kernel>
    --app <path to GPU application>
```

Example:
--------

```
./build/VEGA_X86/gem5.opt
    configs/example/gem5_library/x86-mi300x-gpu.py
    --image ./gem5-resources/src/x86-ubuntu-gpu-ml/disk-image/x86-ubuntu-gpu-ml
    --kernel ./gem5-resources/src/x86-ubuntu-gpu-ml/vmlinux-gpu-ml
    --app ./gem5-resources/src/gpu/square/bin.default/square.default
```
"""

import argparse

from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.devices.gpus.amdgpu import MI300X
from gem5.components.memory.single_channel import SingleChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.isas import ISA
from gem5.prebuilt.viper.board import ViperBoard
from gem5.prebuilt.viper.cpu_cache_hierarchy import ViperCPUCacheHierarchy
from gem5.resources.resource import (
    DiskImageResource,
    FileResource,
)
from gem5.simulate.simulator import Simulator
from gem5.utils.requires import requires

# This script needs a gem5 build with the X86 ISA and the GPU_VIPER
# coherence protocol (e.g., build/VEGA_X86).
requires(
    isa_required=ISA.X86,
    coherence_protocol_required=CoherenceProtocol.GPU_VIPER,
)

# Kernel, disk, and applications are obtained locally.
parser = argparse.ArgumentParser()

parser.add_argument(
    "--image",
    type=str,
    required=True,
    help="Full path to the gem5-resources x86-ubuntu-gpu-ml disk-image.",
)

parser.add_argument(
    "--kernel",
    type=str,
    required=True,
    help="Full path to the gem5-resources vmlinux-gpu-ml kernel.",
)

parser.add_argument(
    "--app",
    type=str,
    required=True,
    help="Path to GPU application, python script, or bash script to run",
)

parser.add_argument(
    "--kvm-perf",
    default=False,
    action="store_true",
    help="Use KVM perf counters to give accurate GPU insts/cycles with KVM",
)

args = parser.parse_args()

# stdlib only supports up to 3GiB currently. This will need to be expanded in
# the future.
memory = SingleChannelDDR4_2400(size="3GiB")

# Note: Only KVM and ATOMIC work due to buggy MOESI_AMD_Base protocol.
processor = SimpleProcessor(cpu_type=CPUTypes.KVM, isa=ISA.X86, num_cores=2)

# Optionally enable host perf counters on each KVM core so instruction and
# cycle counts reported with KVM are accurate.
for core in processor.cores:
    if core.is_kvm_core():
        core.get_simobject().usePerf = args.kvm_perf

# The GPU must be created first so we can assign CPU-side DMA ports to the
# CPU cache hierarchy.
gpu0 = MI300X()

cache_hierarchy = ViperCPUCacheHierarchy(
    l1d_size="32KiB",
    l1d_assoc=8,
    l1i_size="32KiB",
    l1i_assoc=8,
    l2_size="1MiB",
    l2_assoc=16,
    l3_size="16MiB",
    l3_assoc=16,
)

board = ViperBoard(
    clk_freq="3GHz",
    processor=processor,
    memory=memory,
    cache_hierarchy=cache_hierarchy,
    gpus=[gpu0],
)

# Example of using a local disk image resource.
disk = DiskImageResource(local_path=args.image, root_partition="1")
kernel = FileResource(local_path=args.kernel)

# make_gpu_app() wraps the application so the board's GPUFS scripts load
# the GPU driver before running it.
board.set_kernel_disk_workload(
    kernel=kernel,
    disk_image=disk,
    readfile_contents=board.make_gpu_app(gpu0, args.app),
)

simulator = Simulator(board=board)
simulator.run()
'gem5/components/cachehierarchies/ruby/caches/viper/tcc.py') PySource('gem5.components.cachehierarchies.ruby.topologies', 'gem5/components/cachehierarchies/ruby/topologies/__init__.py') PySource('gem5.components.cachehierarchies.ruby.topologies', 'gem5/components/cachehierarchies/ruby/topologies/simple_pt2pt.py') + +PySource('gem5.components.devices', + 'gem5/components/devices/__init__.py') +PySource('gem5.components.devices.gpus', + 'gem5/components/devices/gpus/__init__.py') +PySource('gem5.components.devices.gpus', + 'gem5/components/devices/gpus/amdgpu.py') +PySource('gem5.components.devices.gpus', + 'gem5/components/devices/gpus/viper_shader.py') + PySource('gem5.components.memory', 'gem5/components/memory/__init__.py') PySource('gem5.components.memory', 'gem5/components/memory/abstract_memory_system.py') PySource('gem5.components.memory', 'gem5/components/memory/dramsim_3.py') @@ -289,6 +313,14 @@ PySource('gem5.prebuilt.riscvmatched', 'gem5/prebuilt/riscvmatched/riscvmatched_processor.py') PySource('gem5.prebuilt.riscvmatched', 'gem5/prebuilt/riscvmatched/riscvmatched_core.py') +PySource('gem5.prebuilt.viper', 'gem5/prebuilt/viper/__init__.py') +PySource('gem5.prebuilt.viper', 'gem5/prebuilt/viper/board.py') +PySource('gem5.prebuilt.viper', + 'gem5/prebuilt/viper/cpu_cache_hierarchy.py') +PySource('gem5.prebuilt.viper', + 'gem5/prebuilt/viper/gpu_cache_hierarchy.py') +PySource('gem5.prebuilt.viper', + 'gem5/prebuilt/viper/viper_network.py') PySource('gem5.resources', 'gem5/resources/__init__.py') PySource('gem5.resources', 'gem5/resources/client.py') PySource('gem5.resources', 'gem5/resources/downloader.py') diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/__init__.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py 
b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py new file mode 100644 index 0000000000..cc7dcd7c39 --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py @@ -0,0 +1,124 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
import math

from m5.objects import (
    CorePair_Controller,
    MessageBuffer,
    RubyCache,
    TreePLRURP,
)

from gem5.components.processors.abstract_core import AbstractCore


class CorePairCache(CorePair_Controller):
    def __init__(
        self,
        l1i_size: str,
        l1i_assoc: int,
        l1d_size: str,
        l1d_assoc: int,
        l2_size: str,
        l2_assoc: int,
        network,
        cache_line_size,
        core: AbstractCore,
    ):
        """Creating CorePair cache controller. Consists of both instruction
        and data cache for a pair of L1s and a single L2 cache shared between
        them.

        :param l1i_size: size of the shared L1 instruction cache.
        :param l1i_assoc: associativity of the L1 instruction cache.
        :param l1d_size: size of each per-core L1 data cache.
        :param l1d_assoc: associativity of each L1 data cache.
        :param l2_size: size of the shared L2 cache.
        :param l2_assoc: associativity of the shared L2 cache.
        :param network: Ruby network this controller's queues attach to.
        :param cache_line_size: cache line size (currently unused by this
            controller).
        :param core: core queried to decide whether evictions must be
            forwarded to the CPU model.
        """
        super().__init__()

        # Ask the core model whether it needs eviction notifications.
        self.send_evictions = core.requires_send_evicts()

        # Shared L1 instruction cache for the core pair.
        self.L1Icache = RubyCache(
            size=l1i_size,
            assoc=l1i_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=2,
            tagArrayBanks=2,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )

        # Per-core L1 data cache for core 0 of the pair.
        self.L1D0cache = RubyCache(
            size=l1d_size,
            assoc=l1d_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=2,
            tagArrayBanks=2,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )

        # Per-core L1 data cache for core 1 of the pair.
        self.L1D1cache = RubyCache(
            size=l1d_size,
            assoc=l1d_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=2,
            tagArrayBanks=2,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )

        # L2 shared by both cores of the pair.
        self.L2cache = RubyCache(
            size=l2_size,
            assoc=l2_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=16,
            tagArrayBanks=16,
        )

        self.connectQueues(network)

    def connectQueues(self, network):
        """Create this controller's message buffers and attach them to the
        Ruby network's ports.
        """
        # Controller -> network buffers.
        self.requestFromCore = MessageBuffer()
        self.requestFromCore.out_port = network.in_port

        self.responseFromCore = MessageBuffer()
        self.responseFromCore.out_port = network.in_port

        self.unblockFromCore = MessageBuffer()
        self.unblockFromCore.out_port = network.in_port

        # Network -> controller buffers.
        self.probeToCore = MessageBuffer()
        self.probeToCore.in_port = network.out_port

        self.responseToCore = MessageBuffer()
        self.responseToCore.in_port = network.out_port

        # Sequencer-to-controller and internal trigger queues; these are
        # not connected to the network.
        self.mandatoryQueue = MessageBuffer()
        self.triggerQueue = MessageBuffer(ordered=True)
from m5.objects import (
    MessageBuffer,
    RubyDirectoryMemory,
)

from ......utils.override import overrides
from ..abstract_directory import AbstractDirectory


class ViperDirectory(AbstractDirectory):
    """Base VIPER directory controller responsible for one memory range."""

    def __init__(self, network, cache_line_size, mem_range, port):
        """
        :param network: Ruby network this directory's queues attach to.
        :param cache_line_size: cache line size, used as the directory
            block size.
        :param mem_range: address range this directory serves.
        :param port: memory-side port this directory sends accesses to.
        """
        super().__init__(network, cache_line_size)
        self.addr_ranges = [mem_range]
        self.directory = RubyDirectoryMemory(
            block_size=cache_line_size,
            ruby_system=network.ruby_system,
        )
        # Connect this directory to the memory side.
        self.memory_out_port = port

        # Turn off TCC (GPU cache) related parameters
        self.noTCCdir = True
        self.TCC_select_num_bits = 0

        # Defaults which must be set; the CPU/GPU subclasses below flip
        # CPUonly / GPUonly as appropriate.
        self.CPUonly = False
        self.GPUonly = False
        self.useL3OnWT = False
        self.L2isWB = False

    @overrides(AbstractDirectory)
    def connectQueues(self, network):
        """Create the directory's message buffers and attach them to the
        Ruby network's ports.
        """
        # DMA request/response path.
        self.requestFromDMA = MessageBuffer(ordered=True)
        self.requestFromDMA.in_port = network.out_port

        self.responseToDMA = MessageBuffer()
        self.responseToDMA.out_port = network.in_port

        # Traffic arriving from the cores/caches.
        self.requestFromCores = MessageBuffer(ordered=True)
        self.requestFromCores.in_port = network.out_port

        self.responseFromCores = MessageBuffer()
        self.responseFromCores.in_port = network.out_port

        self.unblockFromCores = MessageBuffer()
        self.unblockFromCores.in_port = network.out_port

        # Probes and responses sent back toward the cores.
        self.probeToCore = MessageBuffer()
        self.probeToCore.out_port = network.in_port

        self.responseToCore = MessageBuffer()
        self.responseToCore.out_port = network.in_port

        # Internal trigger queues; not connected to the network.
        self.triggerQueue = MessageBuffer(ordered=True)
        self.L3triggerQueue = MessageBuffer(ordered=True)

        # Memory-side queues; not connected to the network.
        self.requestToMemory = MessageBuffer()
        self.responseFromMemory = MessageBuffer()


# This is intended to be used on the CPU side
class ViperCPUDirectory(ViperDirectory):
    def __init__(self, network, cache_line_size, mem_range, port):
        super().__init__(network, cache_line_size, mem_range, port)

        self.CPUonly = True
        self.GPUonly = False


# This is intended to be used on the GPU side
class ViperGPUDirectory(ViperDirectory):
    def __init__(self, network, cache_line_size, mem_range, port):
        super().__init__(network, cache_line_size, mem_range, port)

        self.CPUonly = False
        self.GPUonly = True
from m5.objects import MessageBuffer

from ......utils.override import overrides
from ..abstract_dma_controller import AbstractDMAController


# Separate controller classes exist for the CPU side and the GPU side so
# that their Ruby "version" numbers increment separately.
class ViperCPUDMAController(AbstractDMAController):
    """DMA controller used on the CPU side of the VIPER hierarchy."""

    def __init__(self, network, cache_line_size):
        super().__init__(network, cache_line_size)

    @overrides(AbstractDMAController)
    def connectQueues(self, network):
        # A buffer size of 0 means it is an infinite queue. The VIPER
        # DMA controller has not been thoroughly tested with finite buffers.
        self.mandatoryQueue = MessageBuffer(buffer_size=0)
        self.responseFromDir = MessageBuffer(buffer_size=0)
        self.responseFromDir.in_port = network.out_port
        self.requestToDir = MessageBuffer(buffer_size=0)
        self.requestToDir.out_port = network.in_port


class ViperGPUDMAController(AbstractDMAController):
    """DMA controller used on the GPU side of the VIPER hierarchy."""

    def __init__(self, network, cache_line_size):
        super().__init__(network, cache_line_size)

    @overrides(AbstractDMAController)
    def connectQueues(self, network):
        # A buffer size of 0 means it is an infinite queue. The VIPER
        # DMA controller has not been thoroughly tested with finite buffers.
        self.mandatoryQueue = MessageBuffer(buffer_size=0)
        self.responseFromDir = MessageBuffer(buffer_size=0)
        self.responseFromDir.in_port = network.out_port
        self.requestToDir = MessageBuffer(buffer_size=0)
        self.requestToDir.out_port = network.in_port
from m5.objects import (
    MessageBuffer,
    RubyCache,
    SQC_Controller,
    TreePLRURP,
)


class SQCCache(SQC_Controller):
    def __init__(
        self,
        sqc_size: str,
        sqc_assoc: int,
        network,
        cache_line_size,
    ):
        """Build an SQC controller -- the Icache for GPU devices -- and
        attach its message buffers to the given Ruby network.
        """
        super().__init__()

        # Single instruction cache, 8 banks, 1-cycle tag/data access,
        # with resource stalls modeled.
        icache = RubyCache(
            size=sqc_size,
            assoc=sqc_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=True,
            dataArrayBanks=8,
            tagArrayBanks=8,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )
        self.L1cache = icache

        self.connectQueues(network)

    def connectQueues(self, network):
        """Create the SQC message buffers and wire them to the network."""
        # Controller -> network: fetch requests.
        fetch_requests = MessageBuffer(ordered=True)
        fetch_requests.out_port = network.in_port
        self.requestFromSQC = fetch_requests

        # Network -> controller: probes and fetch responses.
        probes = MessageBuffer(ordered=True)
        probes.in_port = network.out_port
        self.probeToSQC = probes

        responses = MessageBuffer(ordered=True)
        responses.in_port = network.out_port
        self.responseToSQC = responses

        # Sequencer-to-controller queue; never attached to the network.
        self.mandatoryQueue = MessageBuffer()
from m5.objects import (
    MessageBuffer,
    RubyCache,
    TCC_Controller,
    TreePLRURP,
)


class TCCCache(TCC_Controller):
    def __init__(
        self,
        tcc_size: str,
        tcc_assoc: int,
        network,
        cache_line_size,
    ):
        """Build a TCC controller -- the L2 cache for GPU devices -- and
        attach its message buffers to the given Ruby network.
        """
        super().__init__()

        # Heavily banked L2 (256 banks) with 64 in-cache atomic ALUs and
        # zero added atomic latency; resource stalls are modeled.
        l2 = RubyCache(
            size=tcc_size,
            assoc=tcc_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=True,
            dataArrayBanks=256,
            tagArrayBanks=256,
            dataAccessLatency=8,
            tagAccessLatency=2,
            atomicLatency=0,
            atomicALUs=64,
        )
        self.L2cache = l2

        self.connectQueues(network)

    def connectQueues(self, network):
        """Create the TCC message buffers and wire them to the network."""
        # Network -> controller: TCP requests, NB probes, NB responses.
        tcp_requests = MessageBuffer(ordered=True)
        tcp_requests.in_port = network.out_port
        self.requestFromTCP = tcp_requests

        nb_probes = MessageBuffer()
        nb_probes.in_port = network.out_port
        self.probeFromNB = nb_probes

        nb_responses = MessageBuffer()
        nb_responses.in_port = network.out_port
        self.responseFromNB = nb_responses

        # Controller -> network: responses to cores and NB-side traffic.
        core_responses = MessageBuffer(ordered=True)
        core_responses.out_port = network.in_port
        self.responseToCore = core_responses

        nb_requests = MessageBuffer(ordered=True)
        nb_requests.out_port = network.in_port
        self.requestToNB = nb_requests

        to_nb_responses = MessageBuffer()
        to_nb_responses.out_port = network.in_port
        self.responseToNB = to_nb_responses

        unblocks = MessageBuffer()
        unblocks.out_port = network.in_port
        self.unblockToNB = unblocks

        # Internal trigger queue; never attached to the network.
        self.triggerQueue = MessageBuffer(ordered=True)
from m5.objects import (
    MessageBuffer,
    RubyCache,
    TCP_Controller,
    TreePLRURP,
)


class TCPCache(TCP_Controller):
    def __init__(
        self,
        tcp_size: str,
        tcp_assoc: int,
        network,
        cache_line_size,
    ):
        """Build a TCP controller -- the L1 cache for GPU devices -- and
        attach its message buffers to the given Ruby network.
        """
        super().__init__()

        # 16-bank L1 with 4-cycle data / 1-cycle tag access; resource
        # stalls are modeled.
        l1 = RubyCache(
            size=tcp_size,
            assoc=tcp_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=True,
            dataArrayBanks=16,
            tagArrayBanks=16,
            dataAccessLatency=4,
            tagAccessLatency=1,
        )
        self.L1cache = l1

        self.connectQueues(network)

    def connectQueues(self, network):
        """Create the TCP message buffers and wire them to the network."""
        # Controller -> network.
        requests = MessageBuffer(ordered=True)
        requests.out_port = network.in_port
        self.requestFromTCP = requests

        responses_out = MessageBuffer(ordered=True)
        responses_out.out_port = network.in_port
        self.responseFromTCP = responses_out

        unblocks = MessageBuffer()
        unblocks.out_port = network.in_port
        self.unblockFromCore = unblocks

        # Network -> controller.
        probes = MessageBuffer(ordered=True)
        probes.in_port = network.out_port
        self.probeToTCP = probes

        responses_in = MessageBuffer(ordered=True)
        responses_in.in_port = network.out_port
        self.responseToTCP = responses_in

        # Sequencer-to-controller queue; never attached to the network.
        self.mandatoryQueue = MessageBuffer()
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
from m5.objects import (
    AMDGPUDevice,
    SubSystem,
)

from ....components.boards.abstract_board import AbstractBoard
from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy
from .viper_shader import ViperShader


class BaseViperGPU(SubSystem):
    """Base class for VIPER-based AMD GPU devices.

    Keeps a class-wide count of created GPUs so each instance receives a
    unique PCI device number and name. Concrete subclasses (e.g., MI210,
    MI300X) are expected to set the ``self._tcp_size``-style cache
    parameters and call ``set_shader`` before the board calls
    ``connectGPU``.
    """

    # First PCI device number handed out to GPU devices.
    _base_pci_dev = 8
    # Number of GPUs created so far (shared across subclasses).
    _gpu_count = 0
    # Per-instance id; the class default is overwritten in __init__.
    _my_id = 0

    @classmethod
    def next_pci_dev(cls):
        # Hand out a unique PCI device number per created GPU.
        cls._gpu_count += 1  # Use count for this particular type
        return cls._base_pci_dev + cls._gpu_count - 1

    @classmethod
    def get_gpu_count(cls):
        # Number of GPUs created so far.
        return cls._gpu_count

    def __init__(self):
        # Setup various PCI related parameters. The instance id is the
        # count *before* next_pci_dev() increments it, so ids start at 0.
        self._my_id = self.get_gpu_count()
        pci_dev = self.next_pci_dev()

        device = AMDGPUDevice(pci_func=0, pci_dev=pci_dev, pci_bus=0)
        self._device = device

    def set_shader(self, shader: ViperShader):
        # Shader must be set by the subclass before connectGPU is called.
        self._shader = shader

    def get_cpu_dma_ports(self):
        # DMA ports that must be wired into the *CPU* cache hierarchy.
        return self._shader.get_cpu_dma_ports()

    def connectGPU(self, board: AbstractBoard) -> None:
        """Attach this GPU (shader, caches, memories) to the board.

        :param board: The board to attach to. Must have a ``pc`` with a
            south bridge (e.g., an X86-style board).
        """
        # Connect a CPU pointer. This is only used for SE mode. Any CPU will
        # work, so pick assuming there is at least one
        cpus = board.get_processor()
        self._shader.set_cpu_pointer(cpus.cores[0].core)

        # Connect all PIO buses
        self._shader.connect_iobus(board.get_io_bus())

        # The System() object in gem5 has a memories parameter which defaults
        # to Self.all. This will collect *all* AbstractMemories and connect to
        # the CPU side. To avoid this we manually assign the memories param to
        # the CPU side memories. We need the MemInterface which is called dram
        # in the MemCtrl class even though it might not be modelling dram.
        memory = board.get_memory()
        cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()]
        board.memories = cpu_abs_mems

        # Make the cache hierarchy. This will create an independent RubySystem
        # class containing only the GPU caches with no network connection to
        # the CPU cache hierarchy.
        self._device.gpu_caches = ViperGPUCacheHierarchy(
            tcp_size=self._tcp_size,
            tcp_assoc=self._tcp_assoc,
            sqc_size=self._sqc_size,
            sqc_assoc=self._sqc_assoc,
            scalar_size=self._scalar_size,
            scalar_assoc=self._scalar_assoc,
            tcc_size=self._tcc_size,
            tcc_assoc=self._tcc_assoc,
            tcc_count=self._tcc_count,
            cu_per_sqc=self._cu_per_sqc,
            num_memory_channels=self._num_memory_channels,
            cache_line_size=self._cache_line_size,
            shader=self._shader,
        )

        # Collect GPU memory controllers created in the GPU cache hierarchy.
        # First assign them as a child to the device so the SimObject unproxy.
        # The device requires the memories parameter to be set as the system
        # pointer required by the AbstractMemory class is set by AMDGPUDevice.
        self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls()
        gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls]
        self._device.memories = gpu_abs_mems

        # Finally attach to the board. PciDevices default to Parent.any for the
        # PciHost parameter. To make sure this is found we need to connect to
        # board.pc or a child of board.pc. Historically we place this in the
        # south bridge.
        board.pc.south_bridge.gpu_shader = self._shader

        # This is cosmetic so the device shows as board.pc.south_bridge.gpu###
        # instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device.
        gpu_name = f"gpu{self._my_id}"
        self._device.set_parent(board.pc.south_bridge, gpu_name)


# A scaled down MI210-like device. Defaults to ~1/4th of an MI210.
class MI210(BaseViperGPU):
    """A scaled down MI210-like GPU. Defaults to roughly 1/4 of an MI210."""

    def __init__(
        self,
        num_cus: int = 32,
        cu_per_sqc: int = 4,
        tcp_size: str = "16KiB",
        tcp_assoc: int = 16,
        sqc_size: str = "32KiB",
        sqc_assoc: int = 8,
        scalar_size: str = "32KiB",
        scalar_assoc: int = 8,
        tcc_size: str = "256KiB",
        tcc_assoc: int = 16,
        tcc_count: int = 8,
        num_memory_channels: int = 8,
        cache_line_size: int = 64,
    ):
        super().__init__()

        # Stash cache parameters for the GPU cache hierarchy, which is
        # created later in BaseViperGPU.connectGPU().
        self._cu_per_sqc = cu_per_sqc
        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._tcc_count = tcc_count
        self._num_memory_channels = num_memory_channels
        self._cache_line_size = cache_line_size

        # "MI200" is the device family name used by the driver/ROM.
        self._device.device_name = "MI200"

        self._device.DeviceID = 0x740F
        self._device.SubsystemVendorID = 0x1002
        self._device.SubsystemID = 0x0C34

        # Setup device-specific address ranges for various SoC components.
        shader = ViperShader(
            self._my_id, num_cus, cache_line_size, self._device
        )
        self.set_shader(shader)

        # Setup the SDMA engines depending on device. The MMIO base addresses
        # can be found in the driver code under:
        # include/asic_reg/sdmaX/sdmaX_Y_Z_offset.h
        sdma_bases = [0x4980, 0x6180, 0x78000, 0x79000, 0x7A000]
        sdma_sizes = [0x1000] * 5

        self._device.sdmas = shader._create_sdmas(sdma_bases, sdma_sizes)

        # Setup the Command Processor's PM4 engines.
        pm4_starts = [0xC000]
        pm4_ends = [0xD000]

        self._device.pm4_pkt_procs = shader._create_pm4s(pm4_starts, pm4_ends)

    def get_driver_command(self, debug: bool = False):
        """Return the shell commands that copy the VBIOS into memory and
        load the amdgpu driver inside the guest.

        :param debug: If True, raise the guest kernel log level first.
        """
        debug_commands = "dmesg -n8\n" if debug else ""

        driver_load_command = (
            "export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH\n"
            "export HSA_ENABLE_INTERRUPT=0\n"
            "export HCC_AMDGPU_TARGET=gfx90a\n"
            f"{debug_commands}\n"
            "dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128\n"
            "if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then\n"
            '    echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."\n'
            "    /sbin/m5 exit\n"
            "fi\n"
            "modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0\n"
        )

        return driver_load_command


class MI300X(BaseViperGPU):
    """A single "XCD" of an MI300X (i.e., 1/8th of a full MI300X)."""

    def __init__(
        self,
        num_cus: int = 40,
        cu_per_sqc: int = 4,
        tcp_size: str = "16KiB",
        tcp_assoc: int = 16,
        sqc_size: str = "32KiB",
        sqc_assoc: int = 8,
        scalar_size: str = "32KiB",
        scalar_assoc: int = 8,
        tcc_size: str = "256KiB",
        tcc_assoc: int = 16,
        tcc_count: int = 16,
        num_memory_channels: int = 16,
        cache_line_size: int = 64,
    ):
        super().__init__()

        # Stash cache parameters for the GPU cache hierarchy, which is
        # created later in BaseViperGPU.connectGPU().
        self._cu_per_sqc = cu_per_sqc
        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._tcc_count = tcc_count
        self._num_memory_channels = num_memory_channels
        self._cache_line_size = cache_line_size

        self._device.device_name = "MI300X"

        # NOTE(review): 0x740F is the MI210 device ID; presumably a
        # placeholder until MI300X-specific IDs/VBIOS are available --
        # confirm against the supported ROM.
        self._device.DeviceID = 0x740F
        self._device.SubsystemVendorID = 0x1002
        self._device.SubsystemID = 0x0C34

        # Setup device-specific address ranges for various SoC components.
        shader = ViperShader(
            self._my_id, num_cus, cache_line_size, self._device
        )
        self.set_shader(shader)

        # These currently use MI200 values until the MI300X bios is released.
        sdma_bases = [0x4980, 0x6180, 0x78000, 0x79000, 0x7A000]
        sdma_sizes = [0x1000] * 5

        self._device.sdmas = shader._create_sdmas(sdma_bases, sdma_sizes)

        # Setup the Command Processor's PM4 engines.
        pm4_starts = [0xC000]
        pm4_ends = [0xD000]

        self._device.pm4_pkt_procs = shader._create_pm4s(pm4_starts, pm4_ends)

    def get_driver_command(self, debug: bool = False):
        """Return the shell commands that copy the VBIOS into memory and
        load the amdgpu driver inside the guest.

        :param debug: If True, raise the guest kernel log level first.
        """
        debug_commands = "dmesg -n8\n" if debug else ""

        driver_load_command = (
            "export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH\n"
            "export HSA_ENABLE_INTERRUPT=0\n"
            "export HCC_AMDGPU_TARGET=gfx942\n"
            'export HSA_OVERRIDE_GFX_VERSION="9.4.2"\n'
            f"{debug_commands}\n"
            "dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128\n"
            "if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then\n"
            '    echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."\n'
            "    /sbin/m5 exit\n"
            "fi\n"
            "modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0\n"
        )

        return driver_load_command
from typing import List

from m5.objects import (
    AddrRange,
    AMDGPUDevice,
    AMDGPUInterruptHandler,
    AMDGPUMemoryManager,
    AMDGPUSystemHub,
    BaseCPU,
    BaseXBar,
    ComputeUnit,
    DynPoolManager,
    GPUCommandProcessor,
    GPUDispatcher,
    HSAPacketProcessor,
    LdsState,
    PciLegacyIoBar,
    PM4PacketProcessor,
    RegisterFileCache,
    RegisterManager,
    ScalarRegisterFile,
    SDMAEngine,
    Shader,
    VectorRegisterFile,
    VegaGPUTLB,
    VegaPagetableWalker,
    VegaTLBCoalescer,
    Wavefront,
)


class ViperCU(ComputeUnit):
    def __init__(self, cu_id: int, device: AMDGPUDevice):
        """ComputeUnit object of a gfx9-like compute unit.

        :param cu_id: Index of this CU within the shader.
        :param device: The AMDGPUDevice this CU's TLBs translate through.
        """
        super().__init__()
        self._device = device

        self.cu_id = cu_id

        # Used in multiple places below. Define variables to change once.
        self._vreg_file_size = 2048
        self._sreg_file_size = 2048

        # Latencies, etc. use defaults in src/gpu-compute/GPU.py.
        self.num_SIMDs = 4
        self.n_wf = 8

        self.localDataStore = LdsState(
            size=65536,
        )

        # One wavefront slot per (SIMD, slot) pair.
        self.wavefronts = [
            Wavefront(simdId=j, wf_slot_id=k)
            for j in range(self.num_SIMDs)
            for k in range(self.n_wf)
        ]

        # Per-SIMD register files and their caches.
        self.vector_register_file = [
            VectorRegisterFile(simd_id=i, num_regs=self._vreg_file_size)
            for i in range(self.num_SIMDs)
        ]

        self.scalar_register_file = [
            ScalarRegisterFile(simd_id=i, num_regs=self._sreg_file_size)
            for i in range(self.num_SIMDs)
        ]

        self.register_file_cache = [
            RegisterFileCache(simd_id=i) for i in range(self.num_SIMDs)
        ]

        # Static allocation policy with one pool manager per SIMD for both
        # vector and scalar register files.
        self.register_manager = RegisterManager(
            policy="static",
            vrf_pool_managers=[
                DynPoolManager(pool_size=self._vreg_file_size, min_alloc=4)
                for _ in range(self.num_SIMDs)
            ],
            srf_pool_managers=[
                DynPoolManager(pool_size=self._sreg_file_size, min_alloc=4)
                for _ in range(self.num_SIMDs)
            ],
        )

        # Route LDS accesses through the LDS bus to the local data store.
        self.ldsPort = self.ldsBus.cpu_side_port
        self.ldsBus.mem_side_port = self.localDataStore.cuPort

        self._create_tlbs()

    def _create_tlbs(self):
        """Create the per-CU L1 TLBs (vector, scalar, instruction), each
        fronted by a coalescer.
        """
        # Vector memory TLB
        self.l1_tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.l1_coalescer = VegaTLBCoalescer(tlb_level=1)

        self.translation_port = self.l1_coalescer.cpu_side_ports
        self.l1_coalescer.mem_side_ports = self.l1_tlb.cpu_side_ports

        # Scalar memory TLB
        self.scalar_tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.scalar_coalescer = VegaTLBCoalescer(tlb_level=1)

        self.scalar_tlb_port = self.scalar_coalescer.cpu_side_ports
        self.scalar_coalescer.mem_side_ports = self.scalar_tlb.cpu_side_ports

        # Instruction memory TLB
        self.sqc_tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.sqc_coalescer = VegaTLBCoalescer(tlb_level=1)

        self.sqc_tlb_port = self.sqc_coalescer.cpu_side_ports
        self.sqc_coalescer.mem_side_ports = self.sqc_tlb.cpu_side_ports

    def get_tlb_ports(self):
        """Return the mem-side ports of this CU's three L1 TLBs, for
        connection to the shared L2 TLB coalescer.
        """
        return [
            self.l1_tlb.mem_side_ports,
            self.sqc_tlb.mem_side_ports,
            self.scalar_tlb.mem_side_ports,
        ]


class ViperShader(Shader):
    def __init__(
        self,
        shader_id: int,
        num_cus: int,
        cache_line_size: int,
        device: AMDGPUDevice,
    ):
        """
        The shader defines something that represents a single software
        visible GPU (e.g., a graphics card, a chiplet on a GPU, etc.).

        :param shader_id: Unique id of this shader; used to offset VBIOS
            and legacy PCI BAR addresses for multi-GPU setups.
        :param num_cus: Number of compute units to create.
        :param cache_line_size: Cache line size in bytes (used by the
            GPU memory manager).
        :param device: The AMDGPUDevice to wire this shader into.
        """
        super().__init__()

        self._shader_id = shader_id
        self._cache_line_size = cache_line_size
        self._device = device

        self.n_wf = 8
        self.timing = True
        # used to track the (many, many) DMA ports
        self._cpu_dma_ports = []
        self._gpu_dma_ports = []

        # VIPER GPU protocol implements release consistency at GPU side. So,
        # we make their writes visible to the global memory and should read
        # from global memory during kernel boundary. The pipeline initiates
        # (or does not initiate) the acquire/release operation depending on
        # these impl_kern_launch_rel and impl_kern_end_rel flags. flag=true
        # means the pipeline initiates an acquire/release operation at kernel
        # launch/end. VIPER protocol is write-through based, and thus only
        # impl_kern_launch_acq needs to be set.
        self.impl_kern_launch_acq = True
        self.impl_kern_end_rel = False

        # Attach compute units to GPU
        self.CUs = [ViperCU(idx, device) for idx in range(num_cus)]

        self._create_tlbs(device)

        # This arbitrary address is something in the X86 I/O hole
        hsapp_gpu_map_paddr = 0xE00000000
        self.dispatcher = GPUDispatcher()
        self.gpu_cmd_proc = GPUCommandProcessor(
            hsapp=HSAPacketProcessor(
                pioAddr=hsapp_gpu_map_paddr,
                numHWQueues=10,
                walker=VegaPagetableWalker(),
            ),
            dispatcher=self.dispatcher,
            walker=VegaPagetableWalker(),
        )
        # The HSA packet processor and command processor DMA from CPU
        # memory; their page-table walkers go to GPU memory.
        self._cpu_dma_ports.append(self.gpu_cmd_proc.hsapp.dma)
        self._cpu_dma_ports.append(self.gpu_cmd_proc.dma)

        self._gpu_dma_ports.append(self.gpu_cmd_proc.hsapp.walker.port)
        self._gpu_dma_ports.append(self.gpu_cmd_proc.walker.port)

        self.system_hub = AMDGPUSystemHub()
        self._cpu_dma_ports.append(self.system_hub.dma)

        self._setup_device(device)

    def get_compute_units(self):
        """Return the list of compute units in this shader."""
        return self.CUs

    def _setup_device(self, device: AMDGPUDevice):
        """Set the device type info on the device connected via PCI."""
        device.cp = self.gpu_cmd_proc
        device.device_ih = AMDGPUInterruptHandler()
        self._cpu_dma_ports.append(device.device_ih.dma)

        # GPU data path
        device.memory_manager = AMDGPUMemoryManager(
            cache_line_size=self._cache_line_size,
        )
        self._gpu_dma_ports.append(device.memory_manager.port)

        self._cpu_dma_ports.append(device.dma)

        # Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there
        # is a PCI capabilities list to traverse.
        device.Status = 0x0290

        # The PCI capabilities are like a linked list. The list has a memory
        # offset and a capability type ID read by the OS. Make the first
        # capability at 0x80 and set the PXCAP (PCI express) capability to
        # that address. Mark the type ID as PCI express.
        # We leave the next ID of PXCAP blank to end the list.
        device.PXCAPBaseOffset = 0x80
        device.CapabilityPtr = 0x80
        device.PXCAPCapId = 0x10

        # Set bits 7 and 8 in the second PCIe device capabilities register
        # which reports support for PCIe atomics for 32 and 64 bits
        # respectively. Bit 9 for 128-bit compare and swap is not set because
        # the amdgpu driver does not check this.
        device.PXCAPDevCap2 = 0x00000180

        # Set bit 6 to enable atomic requestor, meaning this device can
        # request atomics from other PCI devices.
        device.PXCAPDevCtrl2 = 0x00000040

        # If there are multiple GPUs in the system, make sure the VBIOS region
        # and the legacy IO bar do not overlap with the ranges from other
        # GPUs.
        if self._shader_id != 0:
            device.ExpansionROM = 0xD0000000 + (0x20000 * self._shader_id)
            bar4_addr = 0xF000 + (0x100 * self._shader_id)
            device.BAR4 = PciLegacyIoBar(addr=bar4_addr, size="256B")

    def _create_pm4s(self, pm4_starts: List[int], pm4_ends: List[int]):
        """Create PM4 packet processors.

        :param pm4_starts: MMIO range start address for each processor.
        :param pm4_ends: MMIO range end address for each processor.
        """
        num_pm4s = len(pm4_starts)

        pm4_procs = [
            PM4PacketProcessor(
                ip_id=i,
                mmio_range=AddrRange(start=pm4_starts[i], end=pm4_ends[i]),
            )
            for i in range(num_pm4s)
        ]

        # PM4 processors DMA from CPU-side memory.
        for pm4_proc in pm4_procs:
            self._cpu_dma_ports.append(pm4_proc.dma)

        return pm4_procs

    def _create_sdmas(self, sdma_bases: List[int], sdma_sizes: List[int]):
        """Create the SDMA engines.

        :param sdma_bases: MMIO base address for each engine.
        :param sdma_sizes: MMIO region size for each engine.
        """
        num_sdmas = len(sdma_bases)

        sdmas = [
            SDMAEngine(
                walker=VegaPagetableWalker(),
                mmio_base=sdma_bases[i],
                mmio_size=sdma_sizes[i],
            )
            for i in range(num_sdmas)
        ]

        # SDMA engines DMA from CPU memory; their walkers go to GPU memory.
        for sdma in sdmas:
            self._cpu_dma_ports.append(sdma.dma)
            self._gpu_dma_ports.append(sdma.walker.port)

        return sdmas

    def get_cpu_dma_ports(self):
        """Return DMA ports that must connect to the CPU cache hierarchy."""
        return self._cpu_dma_ports

    def get_gpu_dma_ports(self):
        """Return DMA ports that must connect to the GPU cache hierarchy."""
        return self._gpu_dma_ports

    def _create_tlbs(self, device: AMDGPUDevice):
        """Connect per-CU TLBs to the L2/L3 TLBs"""
        self.l2_tlb = VegaGPUTLB(
            gpu_device=device,
            size=4096,
            assoc=64,
            hitLatency=69,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.l2_coalescer = VegaTLBCoalescer(tlb_level=2)

        self.l3_tlb = VegaGPUTLB(
            gpu_device=device,
            size=8192,
            assoc=64,
            hitLatency=150,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.l3_coalescer = VegaTLBCoalescer(tlb_level=3)

        # Port flow: [L1s] -> L2 coalescer -> L2 tlb -> L3 coalescer -> L3 tlb
        for cu in self.CUs:
            for port in cu.get_tlb_ports():
                self.l2_coalescer.cpu_side_ports = port
        self.l2_coalescer.mem_side_ports = self.l2_tlb.cpu_side_ports
        self.l2_tlb.mem_side_ports = self.l3_coalescer.cpu_side_ports
        self.l3_coalescer.mem_side_ports = self.l3_tlb.cpu_side_ports

        # The final-level TLB's walker fetches translations from GPU memory.
        self._gpu_dma_ports.append(self.l3_tlb.walker.port)

    def connect_iobus(self, iobus: BaseXBar):
        """Connect the GPU objects to the IO bus."""
        self.gpu_cmd_proc.pio = iobus.mem_side_ports
        self.gpu_cmd_proc.hsapp.pio = iobus.mem_side_ports
        self.system_hub.pio = iobus.mem_side_ports
        self._device.pio = iobus.mem_side_ports
        self._device.device_ih.pio = iobus.mem_side_ports
        for sdma in self._device.sdmas:
            sdma.pio = iobus.mem_side_ports
        for pm4_proc in self._device.pm4_pkt_procs:
            pm4_proc.pio = iobus.mem_side_ports

    def set_cpu_pointer(self, cpu: BaseCPU):
        """Set the CPU pointer for the Shader."""
        self.cpu_pointer = cpu
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
import base64
import os
from typing import (
    List,
    Optional,
)

from ...components.boards.abstract_board import AbstractBoard
from ...components.boards.kernel_disk_workload import KernelDiskWorkload
from ...components.boards.x86_board import X86Board
from ...components.cachehierarchies.abstract_cache_hierarchy import (
    AbstractCacheHierarchy,
)
from ...components.devices.gpus.amdgpu import BaseViperGPU
from ...components.memory.abstract_memory_system import AbstractMemorySystem
from ...components.processors.abstract_processor import AbstractProcessor
from ...utils.override import overrides


class ViperBoard(X86Board):
    """
    A derivative of X86Board capable of full system simulation for X86 with a
    GPU device. Provides all the functionality of the X86Board with helper
    methods specific to booting a disk with GPU libraries installed.
    """

    def __init__(
        self,
        clk_freq: str,
        processor: AbstractProcessor,
        memory: AbstractMemorySystem,
        cache_hierarchy: AbstractCacheHierarchy,
        gpus: Optional[List[BaseViperGPU]] = None,
    ) -> None:
        """
        :param clk_freq: Board clock frequency (e.g., "3GHz").
        :param processor: The CPU processor for the board.
        :param memory: The CPU-side memory system.
        :param cache_hierarchy: The CPU cache hierarchy.
        :param gpus: Optional list of GPU devices to attach to the board.
        """
        super().__init__(
            clk_freq=clk_freq,
            processor=processor,
            memory=memory,
            cache_hierarchy=cache_hierarchy,
        )

        self._gpus = gpus

    def get_devices(self):
        """Return the list of attached GPU devices (or None)."""
        return self._gpus

    @overrides(AbstractBoard)
    def _connect_things(self) -> None:
        super()._connect_things()

        # GPUs are connected after the board is otherwise assembled so the
        # IO bus, processor, and memory are available to hook into.
        if self._gpus is not None:
            for gpu in self._gpus:
                gpu.connectGPU(self)

    @overrides(KernelDiskWorkload)
    def get_disk_device(self) -> str:
        return "/dev/sda"

    @overrides(KernelDiskWorkload)
    def get_default_kernel_args(self) -> List[str]:
        # The regular parameters used with gem5 plus (1) fbdev_emulation=0
        # to disable having to implement this functionality, (2) blacklist
        # amdgpu because we need to copy the VBIOS into memory first, and (3)
        # blacklist psmouse as amdgpu driver adds new mouse commands which
        # gem5 does not implement and they do not seem to be documented.
        return [
            "earlyprintk=ttyS0",
            "console=ttyS0",
            "lpj=7999923",
            "root={root_value}",
            "drm_kms_helper.fbdev_emulation=0",
            "modprobe.blacklist=amdgpu",
            "modprobe.blacklist=psmouse",
        ]

    # Replicate the capability of the old GPUFS config, which embed a binary
    # application or script into a bash script setting up the environment and
    # loading the GPU driver.
    def make_gpu_app(self, gpu: BaseViperGPU, app: str, debug: bool = False):
        """Build a guest-side script that loads the GPU driver and runs
        the given application.

        The application binary is base64-encoded into the script and
        decoded/executed inside the guest.

        :param gpu: GPU device whose driver-load commands to prepend.
        :param app: Host path of the application binary to embed.
        :param debug: Passed through to the GPU's driver command.
        """
        driver_load_command = gpu.get_driver_command(debug=debug)

        with open(os.path.abspath(app), "rb") as binfile:
            encodedBin = base64.b64encode(binfile.read()).decode()

        # NOTE(review): the "{}" below is a plain (non-f) string, so the
        # app is invoked with a literal "{}" argument -- looks like a
        # leftover .format() placeholder; confirm intended arguments.
        application_command = (
            f'echo "{encodedBin}" | base64 -d > myapp\n'
            "chmod +x myapp\n"
            "./myapp {}\n"
            "/sbin/m5 exit\n"
        )

        return driver_load_command + application_command
import math

from m5.objects import (
    DMASequencer,
    RubyCache,
    RubyPortProxy,
    RubySequencer,
    RubySystem,
    SimpleMemory,
    TreePLRURP,
)

from ...coherence_protocol import CoherenceProtocol
from ...components.cachehierarchies.abstract_cache_hierarchy import (
    AbstractCacheHierarchy,
)
from ...components.cachehierarchies.ruby.abstract_ruby_cache_hierarchy import (
    AbstractRubyCacheHierarchy,
)
from ...components.cachehierarchies.ruby.caches.viper.corepair_cache import (
    CorePairCache,
)
from ...components.cachehierarchies.ruby.caches.viper.directory import (
    ViperCPUDirectory,
)
from ...components.cachehierarchies.ruby.caches.viper.dma_controller import (
    ViperCPUDMAController,
)
from ...prebuilt.viper.board import ViperBoard
from ...utils.override import overrides
from ...utils.requires import requires
from .viper_network import SimplePt2Pt


class ViperCPUCacheHierarchy(AbstractRubyCacheHierarchy):
    """
    The VIPER CPU cache hierarchy creates CPU-side Ruby caches and connects
    the nodes using a simple point-to-point topology.
    """

    def __init__(
        self,
        l1d_size: str,
        l1d_assoc: int,
        l1i_size: str,
        l1i_assoc: int,
        l2_size: str,
        l2_assoc: int,
        l3_size: str,
        l3_assoc: int,
    ):
        """
        :param l1d_size: Size of each L1 data cache (per core).
        :param l1d_assoc: Associativity of each L1 data cache.
        :param l1i_size: Size of each L1 instruction cache (per core pair).
        :param l1i_assoc: Associativity of each L1 instruction cache.
        :param l2_size: Size of each L2 cache (per core pair).
        :param l2_assoc: Associativity of each L2 cache.
        :param l3_size: Size of the L3 cache held by each directory.
        :param l3_assoc: Associativity of the L3 cache.
        """
        super().__init__()

        self._l1d_size = l1d_size
        self._l1d_assoc = l1d_assoc
        self._l1i_size = l1i_size
        self._l1i_assoc = l1i_assoc
        self._l2_size = l2_size
        self._l2_assoc = l2_assoc
        self._l3_size = l3_size
        self._l3_assoc = l3_assoc

        self.ruby_system = RubySystem()

    @overrides(AbstractCacheHierarchy)
    def incorporate_cache(self, board: ViperBoard) -> None:
        """Create and wire all CPU-side Ruby controllers for the board.

        Creates one CorePair cache per pair of cores, one directory per
        memory port, and DMA controllers for board DMA ports and any GPU
        device DMA ports, then connects everything with a point-to-point
        network.
        """
        requires(coherence_protocol_required=CoherenceProtocol.GPU_VIPER)

        # Ruby networks for CPU
        self.ruby_system.network = SimplePt2Pt(self.ruby_system)

        # MOESI_AMD_Base uses 5 virtual networks.
        self.ruby_system.number_of_virtual_networks = 5
        self.ruby_system.network.number_of_virtual_networks = 5

        # There is a single local list of all of the controllers to make it
        # easier to connect everything to the CPU network. This can be
        # customized depending on the topology/network requirements.
        # Create one controller for each L1 cache (and the cache mem obj.)
        # Create a single directory controller (Really the memory cntrl).
        self._controllers = []

        cores = board.get_processor().get_cores()
        num_cores = len(cores)
        # Cores are grouped into pairs sharing an L1I and an L2; each core
        # in a pair gets its own L1D and sequencer.
        for i in range(0, num_cores, 2):
            cache = CorePairCache(
                l1d_size=self._l1d_size,
                l1d_assoc=self._l1d_assoc,
                l1i_size=self._l1i_size,
                l1i_assoc=self._l1i_assoc,
                l2_size=self._l2_size,
                l2_assoc=self._l2_assoc,
                network=self.ruby_system.network,
                cache_line_size=board.get_cache_line_size(),
                core=cores[i],
            )

            cache.version = i // 2
            cache.ruby_system = self.ruby_system
            cache.clk_domain = board.get_clock_domain()

            cache.sequencer = RubySequencer(
                version=i,
                dcache=cache.L1D0cache,
                ruby_system=self.ruby_system,
                coreid=0,
                is_cpu_sequencer=True,
                clk_domain=board.get_clock_domain(),
            )

            cache.sequencer1 = RubySequencer(
                version=i + 1,
                dcache=cache.L1D1cache,
                ruby_system=self.ruby_system,
                coreid=1,
                is_cpu_sequencer=True,
                clk_domain=board.get_clock_domain(),
            )

            cache.sequencer.connectIOPorts(board.get_io_bus())
            cache.sequencer1.connectIOPorts(board.get_io_bus())

            cores[i].connect_icache(cache.sequencer.in_ports)
            cores[i].connect_dcache(cache.sequencer.in_ports)

            cores[i].connect_walker_ports(
                cache.sequencer.in_ports, cache.sequencer.in_ports
            )

            # Connect the interrupt ports
            int_req_port = cache.sequencer.interrupt_out_port
            int_resp_port = cache.sequencer.in_ports
            cores[i].connect_interrupt(int_req_port, int_resp_port)

            # The second core of the pair only exists for even core counts.
            if i + 1 < num_cores:
                cores[i + 1].connect_icache(cache.sequencer1.in_ports)
                cores[i + 1].connect_dcache(cache.sequencer1.in_ports)

                cores[i + 1].connect_walker_ports(
                    cache.sequencer.in_ports, cache.sequencer1.in_ports
                )

                # Connect the interrupt ports
                cores[i + 1].connect_interrupt(int_req_port, int_resp_port)

            self._controllers.append(cache)

        # Create the CPU directory controllers, one per memory port. Each
        # directory also holds a slice of the shared L3.
        self._directory_controllers = []

        for addr_range, port in board.get_mem_ports():
            dir = ViperCPUDirectory(
                self.ruby_system.network,
                board.get_cache_line_size(),
                addr_range,
                port,
            )
            dir.ruby_system = self.ruby_system
            dir.version = len(self._directory_controllers)
            self._directory_controllers.append(dir)

            dir.L3CacheMemory = RubyCache(
                size=self._l3_size,
                assoc=self._l3_assoc,
                replacement_policy=TreePLRURP(),
                resourceStalls=False,
                dataArrayBanks=16,
                tagArrayBanks=16,
                dataAccessLatency=20,
                tagAccessLatency=15,
            )

        # Create the DMA Controllers, if required.
        self._dma_controllers = []
        if board.has_dma_ports():
            dma_ports = board.get_dma_ports()
            for i, port in enumerate(dma_ports):
                ctrl = ViperCPUDMAController(
                    self.ruby_system.network, board.get_cache_line_size()
                )
                ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)

                ctrl.ruby_system = self.ruby_system
                ctrl.dma_sequencer.ruby_system = self.ruby_system

                self._dma_controllers.append(ctrl)

        # Create DMA Controllers required for any devices in the system.
        device_dmas = []
        if board.get_devices() is not None:
            for device in board.get_devices():
                device_dmas += device.get_cpu_dma_ports()

        # Continue DMASequencer version numbering after the board DMAs.
        for port in device_dmas:
            ctrl = ViperCPUDMAController(
                self.ruby_system.network, board.get_cache_line_size()
            )
            ctrl.dma_sequencer = DMASequencer(
                version=len(self._dma_controllers), in_ports=port
            )

            ctrl.ruby_system = self.ruby_system
            ctrl.dma_sequencer.ruby_system = self.ruby_system

            self._dma_controllers.append(ctrl)

        # Number of sequencers = one per core pair + one per DMA
        self.ruby_system.num_of_sequencers = len(self._controllers) * 2 + len(
            self._dma_controllers
        )

        # Assign the controllers to their parent objects.
        self.ruby_system.controllers = self._controllers
        self.ruby_system.directory_controllers = self._directory_controllers

        if len(self._dma_controllers) != 0:
            self.ruby_system.dma_controllers = self._dma_controllers

        # Connect the controllers using the network topology
        self.ruby_system.network.connect(
            self._controllers
            + self._directory_controllers
            + self._dma_controllers
        )
        self.ruby_system.network.setup_buffers()

        # Set up a proxy port for the system_port. Used for load binaries and
        # other functional-only things.
        self.ruby_system.sys_port_proxy = RubyPortProxy(
            ruby_system=self.ruby_system
        )
        board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +import math + +from m5.objects import ( + AddrRange, + DMASequencer, + HBM_2000_4H_1x64, + MemCtrl, + RubyCache, + RubySequencer, + RubySystem, + SrcClockDomain, + TreePLRURP, + VIPERCoalescer, + VoltageDomain, +) + +from ...coherence_protocol import CoherenceProtocol +from ...components.cachehierarchies.ruby.abstract_ruby_cache_hierarchy import ( + AbstractRubyCacheHierarchy, +) +from ...components.cachehierarchies.ruby.caches.viper.directory import ( + ViperGPUDirectory, +) +from ...components.cachehierarchies.ruby.caches.viper.dma_controller import ( + ViperGPUDMAController, +) +from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache +from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache +from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache +from ...components.devices.gpus.viper_shader import ViperShader +from ...utils.requires import requires +from .viper_network import ( + SimpleDoubleCrossbar, + SimplePt2Pt, +) + + +class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): + _seqs = 0 + + @classmethod + def seqCount(cls): + # Use SeqCount not class since we need global count + cls._seqs += 1 + return cls._seqs - 1 + + def __init__( + self, + tcp_size: str, + tcp_assoc: int, + sqc_size: str, + sqc_assoc: int, + scalar_size: str, + scalar_assoc: int, + tcc_size: str, + tcc_assoc: int, + tcc_count: int, + cu_per_sqc: int, + num_memory_channels: int, + cache_line_size: int, + shader: ViperShader, + ): + """ + :param size: The size of each cache in the heirarchy. + :param assoc: The associativity of each cache. 
+ """ + super().__init__() + + self._tcp_size = tcp_size + self._tcp_assoc = tcp_assoc + self._sqc_size = sqc_size + self._sqc_assoc = sqc_assoc + self._scalar_size = scalar_size + self._scalar_assoc = scalar_assoc + self._tcc_size = tcc_size + self._tcc_assoc = tcc_assoc + self._cache_line_size = cache_line_size + + # We have everything we need to know to create the GPU cache hierarchy + # immediately. Therefore, an incorporate_cache method is not part of + # this cache hierarchy. Go ahead and incorporate everything now. + requires(coherence_protocol_required=CoherenceProtocol.GPU_VIPER) + + self.ruby_gpu = RubySystem() + self.ruby_gpu.block_size_bytes = cache_line_size + + # Ruby network for this GPU + self.ruby_gpu.network = SimpleDoubleCrossbar(self.ruby_gpu) + + # VIPER uses 6 virtual networks. + self.ruby_gpu.number_of_virtual_networks = 6 + self.ruby_gpu.network.number_of_virtual_networks = 6 + + # There is a single local list of all of the controllers to make it + # easier to connect everything to the GPU network. This can be + # customized depending on the topology/network requirements. 
+ self._controllers = [] + self._directory_controllers = [] + self._dma_controllers = [] + self._mem_ctrls = [] + + self.clk_domain = SrcClockDomain( + clock="1801MHz", + voltage_domain=VoltageDomain(), + ) + + # Variables used by multiple objects are defined once here + tcc_bits = int(math.log(tcc_count, 2)) + deadlock_threshold = 500000 + + # Create one TCP per CU + compute_units = shader.get_compute_units() + for idx, cu in enumerate(compute_units): + tcp = TCPCache( + tcp_size=self._tcp_size, + tcp_assoc=self._tcp_assoc, + network=self.ruby_gpu.network, + cache_line_size=self._cache_line_size, + ) + + tcp.version = idx + + tcp.sequencer = RubySequencer( + version=self.seqCount(), + dcache=tcp.L1cache, + ruby_system=self.ruby_gpu, + is_cpu_sequencer=True, + ) + + tcp.coalescer = VIPERCoalescer( + version=self.seqCount(), + icache=tcp.L1cache, + dcache=tcp.L1cache, + ruby_system=self.ruby_gpu, + support_inst_reqs=False, + is_cpu_sequencer=False, + deadlock_threshold=deadlock_threshold, + max_coalesces_per_cycle=1, + gmTokenPort=cu.gmTokenPort, + ) + + for port_idx in range(cu.wf_size): + cu.memory_port[port_idx] = tcp.coalescer.in_ports + + tcp.ruby_system = self.ruby_gpu + tcp.TCC_select_num_bits = tcc_bits + tcp.use_seq_not_coal = False + tcp.issue_latency = 1 + tcp.clk_domain = self.clk_domain + tcp.recycle_latency = 10 + tcp.WB = False + tcp.disableL1 = False + + self._controllers.append(tcp) + + # This check ensures there are a same number of CUs with shared SQC + # and Scalar caches. 
+ num_cus = len(shader.get_compute_units()) + assert (num_cus % cu_per_sqc) == 0 + num_sqcs = num_cus // cu_per_sqc + + for idx in range(num_sqcs): + sqc = SQCCache( + sqc_size=self._sqc_size, + sqc_assoc=self._sqc_assoc, + network=self.ruby_gpu.network, + cache_line_size=self._cache_line_size, + ) + + sqc.version = idx + + sqc.sequencer = RubySequencer( + version=self.seqCount(), + dcache=sqc.L1cache, + ruby_system=self.ruby_gpu, + support_data_reqs=False, + is_cpu_sequencer=False, + deadlock_threshold=deadlock_threshold, + ) + + # SQC is shared across {cu_per_sqc} CUs. + cu_base = cu_per_sqc * idx + for cu_num in range(cu_per_sqc): + cu_id = cu_base + cu_num + compute_units[cu_id].sqc_port = sqc.sequencer.in_ports + + sqc.ruby_system = self.ruby_gpu + sqc.TCC_select_num_bits = tcc_bits + sqc.clk_domain = self.clk_domain + sqc.recycle_latency = 10 + + self._controllers.append(sqc) + + num_scalars = num_sqcs + for idx in range(num_scalars): + scalar = SQCCache( + sqc_size=self._scalar_size, + sqc_assoc=self._scalar_assoc, + network=self.ruby_gpu.network, + cache_line_size=self._cache_line_size, + ) + + # Scalar uses same controller as SQC, so add SQC count + scalar.version = idx + num_sqcs + + scalar.sequencer = RubySequencer( + version=self.seqCount(), + dcache=scalar.L1cache, + ruby_system=self.ruby_gpu, + support_data_reqs=False, + is_cpu_sequencer=False, + deadlock_threshold=deadlock_threshold, + ) + + # Scalar cache is shared across {cu_per_sqc} CUs. 
+ cu_base = cu_per_sqc * idx + for cu_num in range(cu_per_sqc): + cu_id = cu_base + cu_num + compute_units[cu_id].scalar_port = scalar.sequencer.in_ports + + scalar.ruby_system = self.ruby_gpu + scalar.TCC_select_num_bits = tcc_bits + scalar.clk_domain = self.clk_domain + scalar.recycle_latency = 10 + + self._controllers.append(scalar) + + # Create TCCs (GPU L2 cache) + for idx in range(tcc_count): + tcc = TCCCache( + tcc_size=self._tcc_size, + tcc_assoc=self._tcc_assoc, + network=self.ruby_gpu.network, + cache_line_size=self._cache_line_size, + ) + + tcc.version = idx + + tcc.ruby_system = self.ruby_gpu + tcc.WB = False + tcc.clk_domain = self.clk_domain + tcc.recycle_latency = 10 + + self._controllers.append(tcc) + + # Create DMA controllers + for i, port in enumerate(shader.get_gpu_dma_ports()): + ctrl = ViperGPUDMAController( + self.ruby_gpu.network, self._cache_line_size + ) + ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port) + + ctrl.ruby_system = self.ruby_gpu + ctrl.dma_sequencer.ruby_system = self.ruby_gpu + + self._dma_controllers.append(ctrl) + + # Create GPU memories. Currently fixed to HBM2. + mem_type_cls = HBM_2000_4H_1x64 + + # AMDGPUDevice currently tells the driver there is 16GiB for memory. + # Until that is a parameter, this need to be fixed to 16GiB. 
+ gpu_mem_range = AddrRange(0, size="16GiB") + intlv_low_bit = int(math.log(self._cache_line_size, 2)) + intlv_bits = int(math.log(num_memory_channels, 2)) + + for idx in range(num_memory_channels): + addr_range = AddrRange( + gpu_mem_range.start, + size=gpu_mem_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + intlvBits=intlv_bits, + intlvMatch=idx, + xorHighBit=0, + ) + + mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range)) + self._mem_ctrls.append(mem_ctrl) + + dir = ViperGPUDirectory( + self.ruby_gpu.network, + self._cache_line_size, + addr_range, + self._mem_ctrls[idx].port, + ) + + dir.ruby_system = self.ruby_gpu + dir.TCC_select_num_bits = tcc_bits + dir.version = len(self._directory_controllers) + self._directory_controllers.append(dir) + + dir.L3CacheMemory = RubyCache( + size="16MiB", + assoc=16, + atomicALUs=64, + replacement_policy=TreePLRURP(), + resourceStalls=False, + dataArrayBanks=16, + tagArrayBanks=16, + dataAccessLatency=20, + tagAccessLatency=15, + ) + + # Number of sequencers = one per TCP, SQC, and Scalar + one per DMA. + self.ruby_gpu.num_of_sequencers = len(self._controllers) + len( + self._dma_controllers + ) + + # Assign the controllers to their parent objects. + self.ruby_gpu.controllers = self._controllers + self.ruby_gpu.directory_controllers = self._directory_controllers + + # Connect the controllers using the network topology + self.ruby_gpu.network.connect( + self._controllers + + self._directory_controllers + + self._dma_controllers + ) + self.ruby_gpu.network.setup_buffers() + + def get_mem_ctrls(self): + return self._mem_ctrls diff --git a/src/python/gem5/prebuilt/viper/viper_network.py b/src/python/gem5/prebuilt/viper/viper_network.py new file mode 100644 index 0000000000..e22a330126 --- /dev/null +++ b/src/python/gem5/prebuilt/viper/viper_network.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 The Regents of the University of California. +# All Rights Reserved +# +# Copyright (c) 2024 Advanced Micro Devices, Inc. 
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from m5.objects import (
    SimpleExtLink,
    SimpleIntLink,
    SimpleNetwork,
    Switch,
)


class SimplePt2Pt(SimpleNetwork):
    """A fully-connected point-to-point topology built on SimpleNetwork.

    Every controller gets its own router, and every ordered pair of distinct
    routers is joined by an internal link. This does not use garnet.
    """

    def __init__(self, ruby_system):
        super().__init__()
        self.netifs = []

        # TODO: These should be in a base class
        # https://gem5.atlassian.net/browse/GEM5-1039
        self.ruby_system = ruby_system

    def connect(self, controllers):
        """Give each controller its own router, then fully connect the
        routers to each other.
        """
        # One switch per controller; external link ids mirror the
        # controller order.
        self.routers = [
            Switch(router_id=rid) for rid in range(len(controllers))
        ]

        self.ext_links = [
            SimpleExtLink(link_id=eid, ext_node=ctrl, int_node=router)
            for eid, (ctrl, router) in enumerate(
                zip(controllers, self.routers)
            )
        ]

        # Join every ordered pair of distinct routers. Internal link ids
        # start at 1, preserving the historical numbering.
        links = []
        next_id = 0
        for src in self.routers:
            for dst in self.routers:
                if src == dst:
                    # Never link a router back to itself.
                    continue
                next_id += 1
                links.append(
                    SimpleIntLink(
                        link_id=next_id, src_node=src, dst_node=dst
                    )
                )
        self.int_links = links


class SimpleDoubleCrossbar(SimpleNetwork):
    """
    GPU network with crossbars between CU caches and L2 caches and between L2
    caches and directories/memory controllers/DMAs using SimpleNetwork.
    """

    def __init__(self, ruby_system):
        super().__init__()
        self.netifs = []

        self.ruby_system = ruby_system

    def connect(self, controllers):
        """Wire every controller to one of two crossbar switches.

        TCP/SQC/TCC controllers attach to an "L2" crossbar, DMA and
        directory controllers attach to an "SoC" crossbar, and the two
        crossbars are bridged in both directions.
        """
        gpu_side = ("TCP_Controller", "SQC_Controller", "TCC_Controller")
        soc_side = ("DMA_Controller", "Directory_Controller")

        # One router per controller, plus one extra switch per crossbar;
        # the crossbars take the two highest router ids.
        switches = [
            Switch(router_id=rid) for rid in range(len(controllers))
        ]
        l2_xbar_id = len(switches)
        switches.append(Switch(router_id=l2_xbar_id))
        soc_xbar_id = len(switches)
        switches.append(Switch(router_id=soc_xbar_id))
        self.routers = switches

        # Each controller connects externally to its own router.
        self.ext_links = [
            SimpleExtLink(
                link_id=eid, ext_node=ctrl, int_node=self.routers[eid]
            )
            for eid, ctrl in enumerate(controllers)
        ]

        links = []

        def couple(node_a, node_b):
            # Add one internal link in each direction between two routers,
            # numbering links in creation order.
            links.append(
                SimpleIntLink(
                    link_id=len(links), src_node=node_a, dst_node=node_b
                )
            )
            links.append(
                SimpleIntLink(
                    link_id=len(links), src_node=node_b, dst_node=node_a
                )
            )

        # Attach every controller's router to the crossbar that matches its
        # controller type.
        for link in self.ext_links:
            ctrl_type = link.ext_node.type
            if ctrl_type in gpu_side:
                couple(link.int_node, self.routers[l2_xbar_id])
            elif ctrl_type in soc_side:
                couple(link.int_node, self.routers[soc_xbar_id])

        # Bridge the two crossbars in both directions.
        couple(self.routers[l2_xbar_id], self.routers[soc_xbar_id])

        # Finalize network int_links for unproxy
        self.int_links = links