diff --git a/configs/example/gem5_library/x86-mi300x-gpu.py b/configs/example/gem5_library/x86-mi300x-gpu.py new file mode 100644 index 0000000000..20fa99b9d8 --- /dev/null +++ b/configs/example/gem5_library/x86-mi300x-gpu.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" +Script to run a full system GPU simulation. 
+ +Usage: +------ +``` +scons build/VEGA_X86/gem5.opt +./build/VEGA_X86/gem5.opt + configs/example/gem5_library/x86-viper-gpu.py + --image + --kernel + --app +``` + +Example: +-------- +``` +./build/VEGA_X86/gem5.opt + configs/example/gem5_library/x86-viper-gpu.py + --image ./gem5-resources/src/x86-ubuntu-gpu-ml/disk-image/x86-ubuntu-gpu-ml + --kernel ./gem5-resources/src/x86-ubuntu-gpu-ml/vmlinux-gpu-ml + --app ./gem5-resources/src/gpu/square/bin.default/square.default +``` +""" + +import argparse + +from gem5.coherence_protocol import CoherenceProtocol +from gem5.components.devices.gpus.amdgpu import MI300X +from gem5.components.memory.single_channel import SingleChannelDDR4_2400 +from gem5.components.processors.cpu_types import CPUTypes +from gem5.components.processors.simple_processor import SimpleProcessor +from gem5.isas import ISA +from gem5.prebuilt.viper.board import ViperBoard +from gem5.prebuilt.viper.cpu_cache_hierarchy import ViperCPUCacheHierarchy +from gem5.resources.resource import ( + DiskImageResource, + FileResource, +) +from gem5.simulate.simulator import Simulator +from gem5.utils.requires import requires + +requires( + isa_required=ISA.X86, + coherence_protocol_required=CoherenceProtocol.GPU_VIPER, +) + +# Kernel, disk, and applications are obtained locally. +parser = argparse.ArgumentParser() + +parser.add_argument( + "--image", + type=str, + required=True, + help="Full path to the gem5-resources x86-ubuntu-gpu-ml disk-image.", +) + +parser.add_argument( + "--kernel", + type=str, + required=True, + help="Full path to the gem5-resources vmlinux-gpu-ml kernel.", +) + +parser.add_argument( + "--app", + type=str, + required=True, + help="Path to GPU application, python script, or bash script to run", +) + +parser.add_argument( + "--kvm-perf", + default=False, + action="store_true", + help="Use KVM perf counters to give accurate GPU insts/cycles with KVM", +) + +args = parser.parse_args() + +# stdlib only supports up to 3GiB currently. 
This will need to be expanded in +# the future. +memory = SingleChannelDDR4_2400(size="3GiB") + +# Note: Only KVM and ATOMIC work due to buggy MOESI_AMD_Base protocol. +processor = SimpleProcessor(cpu_type=CPUTypes.KVM, isa=ISA.X86, num_cores=2) + +for core in processor.cores: + if core.is_kvm_core(): + core.get_simobject().usePerf = args.kvm_perf + +# The GPU must be created first so we can assign CPU-side DMA ports to the +# CPU cache hierarchy. +gpu0 = MI300X() + +cache_hierarchy = ViperCPUCacheHierarchy( + l1d_size="32KiB", + l1d_assoc=8, + l1i_size="32KiB", + l1i_assoc=8, + l2_size="1MiB", + l2_assoc=16, + l3_size="16MiB", + l3_assoc=16, +) + +board = ViperBoard( + clk_freq="3GHz", + processor=processor, + memory=memory, + cache_hierarchy=cache_hierarchy, + gpus=[gpu0], +) + +# Example of using a local disk image resource +disk = DiskImageResource(local_path=args.image, root_partition="1") +kernel = FileResource(local_path=args.kernel) + +board.set_kernel_disk_workload( + kernel=kernel, + disk_image=disk, + readfile_contents=board.make_gpu_app(gpu0, args.app), +) + +simulator = Simulator(board=board) +simulator.run() diff --git a/src/python/SConscript b/src/python/SConscript index b7a40c30c8..ab711fb668 100644 --- a/src/python/SConscript +++ b/src/python/SConscript @@ -175,10 +175,34 @@ PySource('gem5.components.cachehierarchies.ruby.caches.mi_example', 'dma_controller.py') PySource('gem5.components.cachehierarchies.ruby.caches.mi_example', 'gem5/components/cachehierarchies/ruby/caches/mi_example/l1_cache.py') +PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/__init__.py') +PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py') +PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/directory.py') 
+PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/dma_controller.py') +PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/tcp.py') +PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/sqc.py') +PySource('gem5.components.cachehierarchies.ruby.caches.viper', + 'gem5/components/cachehierarchies/ruby/caches/viper/tcc.py') PySource('gem5.components.cachehierarchies.ruby.topologies', 'gem5/components/cachehierarchies/ruby/topologies/__init__.py') PySource('gem5.components.cachehierarchies.ruby.topologies', 'gem5/components/cachehierarchies/ruby/topologies/simple_pt2pt.py') + +PySource('gem5.components.devices', + 'gem5/components/devices/__init__.py') +PySource('gem5.components.devices.gpus', + 'gem5/components/devices/gpus/__init__.py') +PySource('gem5.components.devices.gpus', + 'gem5/components/devices/gpus/amdgpu.py') +PySource('gem5.components.devices.gpus', + 'gem5/components/devices/gpus/viper_shader.py') + PySource('gem5.components.memory', 'gem5/components/memory/__init__.py') PySource('gem5.components.memory', 'gem5/components/memory/abstract_memory_system.py') PySource('gem5.components.memory', 'gem5/components/memory/dramsim_3.py') @@ -289,6 +313,14 @@ PySource('gem5.prebuilt.riscvmatched', 'gem5/prebuilt/riscvmatched/riscvmatched_processor.py') PySource('gem5.prebuilt.riscvmatched', 'gem5/prebuilt/riscvmatched/riscvmatched_core.py') +PySource('gem5.prebuilt.viper', 'gem5/prebuilt/viper/__init__.py') +PySource('gem5.prebuilt.viper', 'gem5/prebuilt/viper/board.py') +PySource('gem5.prebuilt.viper', + 'gem5/prebuilt/viper/cpu_cache_hierarchy.py') +PySource('gem5.prebuilt.viper', + 'gem5/prebuilt/viper/gpu_cache_hierarchy.py') +PySource('gem5.prebuilt.viper', + 'gem5/prebuilt/viper/viper_network.py') PySource('gem5.resources', 'gem5/resources/__init__.py') 
PySource('gem5.resources', 'gem5/resources/client.py') PySource('gem5.resources', 'gem5/resources/downloader.py') diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/__init__.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py new file mode 100644 index 0000000000..cc7dcd7c39 --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py @@ -0,0 +1,124 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import math + +from m5.objects import ( + CorePair_Controller, + MessageBuffer, + RubyCache, + TreePLRURP, +) + +from gem5.components.processors.abstract_core import AbstractCore + + +class CorePairCache(CorePair_Controller): + def __init__( + self, + l1i_size: str, + l1i_assoc: int, + l1d_size: str, + l1d_assoc: int, + l2_size: str, + l2_assoc: int, + network, + cache_line_size, + core: AbstractCore, + ): + """Creating CorePair cache controller. Consist of both instruction + and data cache for a pair of L1s and a single L2 cache shared between + them. 
+ """ + super().__init__() + + self.send_evictions = core.requires_send_evicts() + + self.L1Icache = RubyCache( + size=l1i_size, + assoc=l1i_assoc, + replacement_policy=TreePLRURP(), + resourceStalls=False, + dataArrayBanks=2, + tagArrayBanks=2, + dataAccessLatency=1, + tagAccessLatency=1, + ) + + self.L1D0cache = RubyCache( + size=l1d_size, + assoc=l1d_assoc, + replacement_policy=TreePLRURP(), + resourceStalls=False, + dataArrayBanks=2, + tagArrayBanks=2, + dataAccessLatency=1, + tagAccessLatency=1, + ) + + self.L1D1cache = RubyCache( + size=l1d_size, + assoc=l1d_assoc, + replacement_policy=TreePLRURP(), + resourceStalls=False, + dataArrayBanks=2, + tagArrayBanks=2, + dataAccessLatency=1, + tagAccessLatency=1, + ) + + self.L2cache = RubyCache( + size=l2_size, + assoc=l2_assoc, + replacement_policy=TreePLRURP(), + resourceStalls=False, + dataArrayBanks=16, + tagArrayBanks=16, + ) + + self.connectQueues(network) + + def connectQueues(self, network): + self.requestFromCore = MessageBuffer() + self.requestFromCore.out_port = network.in_port + + self.responseFromCore = MessageBuffer() + self.responseFromCore.out_port = network.in_port + + self.unblockFromCore = MessageBuffer() + self.unblockFromCore.out_port = network.in_port + + self.probeToCore = MessageBuffer() + self.probeToCore.in_port = network.out_port + + self.responseToCore = MessageBuffer() + self.responseToCore.in_port = network.out_port + + self.mandatoryQueue = MessageBuffer() + self.triggerQueue = MessageBuffer(ordered=True) diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/directory.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/directory.py new file mode 100644 index 0000000000..2e5c9c1f95 --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/directory.py @@ -0,0 +1,106 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +from m5.objects import ( + MessageBuffer, + RubyDirectoryMemory, +) + +from ......utils.override import overrides +from ..abstract_directory import AbstractDirectory + + +class ViperDirectory(AbstractDirectory): + def __init__(self, network, cache_line_size, mem_range, port): + super().__init__(network, cache_line_size) + self.addr_ranges = [mem_range] + self.directory = RubyDirectoryMemory( + block_size=cache_line_size, + ruby_system=network.ruby_system, + ) + # Connect this directory to the memory side. + self.memory_out_port = port + + # Turn off TCC (GPU cache) related parameters + self.noTCCdir = True + self.TCC_select_num_bits = 0 + + # Defaults which must be set + self.CPUonly = False + self.GPUonly = False + self.useL3OnWT = False + self.L2isWB = False + + @overrides(AbstractDirectory) + def connectQueues(self, network): + self.requestFromDMA = MessageBuffer(ordered=True) + self.requestFromDMA.in_port = network.out_port + + self.responseToDMA = MessageBuffer() + self.responseToDMA.out_port = network.in_port + + self.requestFromCores = MessageBuffer(ordered=True) + self.requestFromCores.in_port = network.out_port + + self.responseFromCores = MessageBuffer() + self.responseFromCores.in_port = network.out_port + + self.unblockFromCores = MessageBuffer() + self.unblockFromCores.in_port = network.out_port + + self.probeToCore = MessageBuffer() + self.probeToCore.out_port = network.in_port + + self.responseToCore = MessageBuffer() + self.responseToCore.out_port = network.in_port + + self.triggerQueue = MessageBuffer(ordered=True) + self.L3triggerQueue = MessageBuffer(ordered=True) + + self.requestToMemory = MessageBuffer() + self.responseFromMemory = MessageBuffer() + + +# This is intended to be used on the CPU side +class ViperCPUDirectory(ViperDirectory): + def __init__(self, network, cache_line_size, mem_range, port): + super().__init__(network, cache_line_size, mem_range, port) + + self.CPUonly = True + self.GPUonly = False + + +# This is intended to be 
used on the GPU side +class ViperGPUDirectory(ViperDirectory): + def __init__(self, network, cache_line_size, mem_range, port): + super().__init__(network, cache_line_size, mem_range, port) + + self.CPUonly = False + self.GPUonly = True diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/dma_controller.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/dma_controller.py new file mode 100644 index 0000000000..f163840c3c --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/dma_controller.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +from m5.objects import MessageBuffer + +from ......utils.override import overrides +from ..abstract_dma_controller import AbstractDMAController + + +# There is a controller for GPU and GPU to keep the "version" numbers +# incrementing seperately +class ViperCPUDMAController(AbstractDMAController): + def __init__(self, network, cache_line_size): + super().__init__(network, cache_line_size) + + @overrides(AbstractDMAController) + def connectQueues(self, network): + # A buffer size of 0 means it is an infinite queue. The VIPER + # DMA controller has not been thoroughly tested with finite buffers. + # Test + self.mandatoryQueue = MessageBuffer(buffer_size=0) + self.responseFromDir = MessageBuffer(buffer_size=0) + self.responseFromDir.in_port = network.out_port + self.requestToDir = MessageBuffer(buffer_size=0) + self.requestToDir.out_port = network.in_port + + +class ViperGPUDMAController(AbstractDMAController): + def __init__(self, network, cache_line_size): + super().__init__(network, cache_line_size) + + @overrides(AbstractDMAController) + def connectQueues(self, network): + # A buffer size of 0 means it is an infinite queue. The VIPER + # DMA controller has not been thoroughly tested with finite buffers. 
+ # Test + self.mandatoryQueue = MessageBuffer(buffer_size=0) + self.responseFromDir = MessageBuffer(buffer_size=0) + self.responseFromDir.in_port = network.out_port + self.requestToDir = MessageBuffer(buffer_size=0) + self.requestToDir.out_port = network.in_port diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/sqc.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/sqc.py new file mode 100644 index 0000000000..835434ff16 --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/sqc.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from m5.objects import ( + MessageBuffer, + RubyCache, + SQC_Controller, + TreePLRURP, +) + + +class SQCCache(SQC_Controller): + def __init__( + self, + sqc_size: str, + sqc_assoc: int, + network, + cache_line_size, + ): + """Creating SQC cache controller. This is the Icache for GPU devices.""" + + super().__init__() + + self.L1cache = RubyCache( + size=sqc_size, + assoc=sqc_assoc, + dataArrayBanks=8, + tagArrayBanks=8, + dataAccessLatency=1, + tagAccessLatency=1, + resourceStalls=True, + replacement_policy=TreePLRURP(), + ) + + self.connectQueues(network) + + def connectQueues(self, network): + self.requestFromSQC = MessageBuffer(ordered=True) + self.requestFromSQC.out_port = network.in_port + + self.probeToSQC = MessageBuffer(ordered=True) + self.probeToSQC.in_port = network.out_port + + self.responseToSQC = MessageBuffer(ordered=True) + self.responseToSQC.in_port = network.out_port + + self.mandatoryQueue = MessageBuffer() diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcc.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcc.py new file mode 100644 index 0000000000..f38c34bf84 --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcc.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from m5.objects import ( + MessageBuffer, + RubyCache, + TCC_Controller, + TreePLRURP, +) + + +class TCCCache(TCC_Controller): + def __init__( + self, + tcc_size: str, + tcc_assoc: int, + network, + cache_line_size, + ): + """Creating TCC cache controller. 
This is the L2 cache for GPU devices.""" + + super().__init__() + + self.L2cache = RubyCache( + size=tcc_size, + assoc=tcc_assoc, + dataArrayBanks=256, + tagArrayBanks=256, + dataAccessLatency=8, + tagAccessLatency=2, + resourceStalls=True, + replacement_policy=TreePLRURP(), + atomicLatency=0, + atomicALUs=64, + ) + + self.connectQueues(network) + + def connectQueues(self, network): + self.requestFromTCP = MessageBuffer(ordered=True) + self.requestFromTCP.in_port = network.out_port + + self.responseToCore = MessageBuffer(ordered=True) + self.responseToCore.out_port = network.in_port + + self.probeFromNB = MessageBuffer() + self.probeFromNB.in_port = network.out_port + + self.responseFromNB = MessageBuffer() + self.responseFromNB.in_port = network.out_port + + self.requestToNB = MessageBuffer(ordered=True) + self.requestToNB.out_port = network.in_port + + self.responseToNB = MessageBuffer() + self.responseToNB.out_port = network.in_port + + self.unblockToNB = MessageBuffer() + self.unblockToNB.out_port = network.in_port + + self.triggerQueue = MessageBuffer(ordered=True) diff --git a/src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcp.py b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcp.py new file mode 100644 index 0000000000..8323eda6fe --- /dev/null +++ b/src/python/gem5/components/cachehierarchies/ruby/caches/viper/tcp.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from m5.objects import ( + MessageBuffer, + RubyCache, + TCP_Controller, + TreePLRURP, +) + + +class TCPCache(TCP_Controller): + def __init__( + self, + tcp_size: str, + tcp_assoc: int, + network, + cache_line_size, + ): + """Creating TCP cache controller. 
This is the L1 cache for GPU devices.""" + + super().__init__() + + self.L1cache = RubyCache( + size=tcp_size, + assoc=tcp_assoc, + dataArrayBanks=16, + tagArrayBanks=16, + dataAccessLatency=4, + tagAccessLatency=1, + resourceStalls=True, + replacement_policy=TreePLRURP(), + ) + + self.connectQueues(network) + + def connectQueues(self, network): + self.requestFromTCP = MessageBuffer(ordered=True) + self.requestFromTCP.out_port = network.in_port + + self.responseFromTCP = MessageBuffer(ordered=True) + self.responseFromTCP.out_port = network.in_port + + self.unblockFromCore = MessageBuffer() + self.unblockFromCore.out_port = network.in_port + + self.probeToTCP = MessageBuffer(ordered=True) + self.probeToTCP.in_port = network.out_port + + self.responseToTCP = MessageBuffer(ordered=True) + self.responseToTCP.in_port = network.out_port + + self.mandatoryQueue = MessageBuffer() diff --git a/src/python/gem5/components/devices/__init__.py b/src/python/gem5/components/devices/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/python/gem5/components/devices/gpus/__init__.py b/src/python/gem5/components/devices/gpus/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/python/gem5/components/devices/gpus/amdgpu.py b/src/python/gem5/components/devices/gpus/amdgpu.py new file mode 100644 index 0000000000..67ecc6c2c6 --- /dev/null +++ b/src/python/gem5/components/devices/gpus/amdgpu.py @@ -0,0 +1,279 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
class BaseViperGPU(SubSystem):
    """Base class for a software-visible VIPER GPU (e.g., a graphics card or
    one chiplet of a larger GPU package).

    Each instantiated GPU draws a unique PCI device number and GPU ID from a
    class-wide counter. Concrete subclasses (e.g., MI210, MI300X) must set
    the cache-geometry attributes (``_tcp_size``, ``_tcp_assoc``, ...) read
    by :meth:`connectGPU`, and must call :meth:`set_shader` before the GPU
    is connected to a board.
    """

    # First PCI device number handed out; GPUs occupy pci_dev 8, 9, ...
    _base_pci_dev = 8
    # Class-wide count of GPUs created so far (across all subclasses).
    _gpu_count = 0
    # Default GPU ID; shadowed by an instance attribute in __init__.
    _my_id = 0

    @classmethod
    def next_pci_dev(cls):
        """Reserve and return the next free PCI device number."""
        cls._gpu_count += 1  # Use count for this particular type
        return cls._base_pci_dev + cls._gpu_count - 1

    @classmethod
    def get_gpu_count(cls):
        """Return the number of GPUs created so far."""
        return cls._gpu_count

    def __init__(self):
        # Initialize the underlying SubSystem SimObject before attaching any
        # children to it. (Fix: the original skipped the super() call, so
        # SubSystem/SimObject initialization never ran.)
        super().__init__()

        # Setup various PCI related parameters. The ID is sampled *before*
        # next_pci_dev() increments the counter, so IDs start at 0 while PCI
        # device numbers start at _base_pci_dev.
        self._my_id = self.get_gpu_count()
        pci_dev = self.next_pci_dev()

        device = AMDGPUDevice(pci_func=0, pci_dev=pci_dev, pci_bus=0)
        self._device = device

    def set_shader(self, shader: ViperShader):
        """Attach the shader (CUs, TLBs, command processor) for this GPU."""
        self._shader = shader

    def get_cpu_dma_ports(self):
        """Return DMA ports that must be wired into the CPU-side Ruby."""
        return self._shader.get_cpu_dma_ports()

    def connectGPU(self, board: AbstractBoard) -> None:
        """Wire this GPU (shader, caches, memories) into ``board``.

        Must be called after set_shader() and after the concrete subclass
        has populated the cache-geometry attributes used below.
        """
        # Connect a CPU pointer. This is only used for SE mode. Any CPU will
        # work, so pick assuming there is at least one
        cpus = board.get_processor()
        self._shader.set_cpu_pointer(cpus.cores[0].core)

        # Connect all PIO buses
        self._shader.connect_iobus(board.get_io_bus())

        # The System() object in gem5 has a memories parameter which defaults
        # to Self.all. This will collect *all* AbstractMemories and connect to
        # the CPU side. To avoid this we manually assign the memories param to
        # the CPU side memories. We need the MemInterface which is called dram
        # in the MemCtrl class even though it might not be modelling dram.
        memory = board.get_memory()
        cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()]
        board.memories = cpu_abs_mems

        # Make the cache hierarchy. This will create an independent RubySystem
        # class containing only the GPU caches with no network connection to
        # the CPU cache hierarchy.
        # NOTE(review): the _tcp_size/.../_cache_line_size attributes are set
        # only by concrete subclasses; BaseViperGPU itself never defines them.
        self._device.gpu_caches = ViperGPUCacheHierarchy(
            tcp_size=self._tcp_size,
            tcp_assoc=self._tcp_assoc,
            sqc_size=self._sqc_size,
            sqc_assoc=self._sqc_assoc,
            scalar_size=self._scalar_size,
            scalar_assoc=self._scalar_assoc,
            tcc_size=self._tcc_size,
            tcc_assoc=self._tcc_assoc,
            tcc_count=self._tcc_count,
            cu_per_sqc=self._cu_per_sqc,
            num_memory_channels=self._num_memory_channels,
            cache_line_size=self._cache_line_size,
            shader=self._shader,
        )

        # Collect GPU memory controllers created in the GPU cache hierarchy.
        # First assign them as a child to the device so the SimObject unproxy.
        # The device requires the memories parameter to be set as the system
        # pointer required by the AbstractMemory class is set by AMDGPUDevice.
        self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls()
        gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls]
        self._device.memories = gpu_abs_mems

        # Finally attach to the board. PciDevices default to Parent.any for the
        # PciHost parameter. To make sure this is found we need to connect to
        # board.pc or a child of board.pc. Historically we place this in the
        # south bridge.
        board.pc.south_bridge.gpu_shader = self._shader

        # This is cosmetic so the device shows as board.pc.south_bridge.gpu###
        # instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device.
        gpu_name = f"gpu{self._my_id}"
        self._device.set_parent(board.pc.south_bridge, gpu_name)
# A scaled down MI210-like device. Defaults to ~1/4th of an MI210.
class MI210(BaseViperGPU):
    """A scaled-down MI210-like GPU device (defaults to ~1/4 of an MI210)."""

    def __init__(
        self,
        num_cus: int = 32,
        cu_per_sqc: int = 4,
        tcp_size: str = "16KiB",
        tcp_assoc: int = 16,
        sqc_size: str = "32KiB",
        sqc_assoc: int = 8,
        scalar_size: str = "32KiB",
        scalar_assoc: int = 8,
        tcc_size: str = "256KiB",
        tcc_assoc: int = 16,
        tcc_count: int = 8,
        num_memory_channels: int = 8,
        cache_line_size: int = 64,
    ):
        super().__init__()

        # Cache geometry consumed later by BaseViperGPU.connectGPU().
        self._cu_per_sqc = cu_per_sqc
        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._tcc_count = tcc_count
        self._num_memory_channels = num_memory_channels
        self._cache_line_size = cache_line_size

        self._device.device_name = "MI200"

        # PCI IDs for an MI200-family device.
        self._device.DeviceID = 0x740F
        self._device.SubsystemVendorID = 0x1002
        self._device.SubsystemID = 0x0C34

        # Setup device-specific address ranges for various SoC components.
        shader = ViperShader(
            self._my_id, num_cus, cache_line_size, self._device
        )
        self.set_shader(shader)

        # Setup the SDMA engines depending on device. The MMIO base addresses
        # can be found in the driver code under:
        # include/asic_reg/sdmaX/sdmaX_Y_Z_offset.h
        num_sdmas = 5
        sdma_bases = [0x4980, 0x6180, 0x78000, 0x79000, 0x7A000]
        # Fix: size the list from num_sdmas instead of a repeated magic 5.
        sdma_sizes = [0x1000] * num_sdmas

        self._device.sdmas = shader._create_sdmas(sdma_bases, sdma_sizes)

        # Setup the Command Processor's PM4 engines.
        pm4_starts = [0xC000]
        pm4_ends = [0xD000]

        self._device.pm4_pkt_procs = shader._create_pm4s(pm4_starts, pm4_ends)

    def get_driver_command(self, debug: bool = False):
        """Return the shell commands that load the amdgpu driver in the guest.

        :param debug: When True, raise the guest kernel log level first.
        """
        debug_commands = "dmesg -n8\n" if debug else ""

        driver_load_command = (
            "export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH\n"
            "export HSA_ENABLE_INTERRUPT=0\n"
            "export HCC_AMDGPU_TARGET=gfx90a\n"
            f"{debug_commands}\n"
            "dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128\n"
            "if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then\n"
            ' echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."\n'
            " /sbin/m5 exit\n"
            "fi\n"
            "modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0\n"
        )

        return driver_load_command


# Defaults to a single "XCD" (i.e., 1/8th of a full MI300X).
class MI300X(BaseViperGPU):
    """An MI300X-like GPU device (defaults to one XCD, 1/8 of an MI300X)."""

    def __init__(
        self,
        num_cus: int = 40,
        cu_per_sqc: int = 4,
        tcp_size: str = "16KiB",
        tcp_assoc: int = 16,
        sqc_size: str = "32KiB",
        sqc_assoc: int = 8,
        scalar_size: str = "32KiB",
        scalar_assoc: int = 8,
        tcc_size: str = "256KiB",
        tcc_assoc: int = 16,
        tcc_count: int = 16,
        num_memory_channels: int = 16,
        cache_line_size: int = 64,
    ):
        super().__init__()

        # Cache geometry consumed later by BaseViperGPU.connectGPU().
        self._cu_per_sqc = cu_per_sqc
        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._tcc_count = tcc_count
        self._num_memory_channels = num_memory_channels
        self._cache_line_size = cache_line_size

        self._device.device_name = "MI300X"

        # NOTE(review): these are the MI200-family PCI IDs (same as MI210),
        # presumably intentional until MI300X VBIOS support lands — confirm.
        self._device.DeviceID = 0x740F
        self._device.SubsystemVendorID = 0x1002
        self._device.SubsystemID = 0x0C34

        # Setup device-specific address ranges for various SoC components.
        shader = ViperShader(
            self._my_id, num_cus, cache_line_size, self._device
        )
        self.set_shader(shader)

        # These currently use MI200 values until the MI300X bios is released.
        num_sdmas = 5
        sdma_bases = [0x4980, 0x6180, 0x78000, 0x79000, 0x7A000]
        # Fix: size the list from num_sdmas instead of a repeated magic 5.
        sdma_sizes = [0x1000] * num_sdmas

        self._device.sdmas = shader._create_sdmas(sdma_bases, sdma_sizes)

        # Setup the Command Processor's PM4 engines.
        pm4_starts = [0xC000]
        pm4_ends = [0xD000]

        self._device.pm4_pkt_procs = shader._create_pm4s(pm4_starts, pm4_ends)

    def get_driver_command(self, debug: bool = False):
        """Return the shell commands that load the amdgpu driver in the guest.

        :param debug: When True, raise the guest kernel log level first.
        """
        debug_commands = "dmesg -n8\n" if debug else ""

        driver_load_command = (
            "export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH\n"
            "export HSA_ENABLE_INTERRUPT=0\n"
            "export HCC_AMDGPU_TARGET=gfx942\n"
            'export HSA_OVERRIDE_GFX_VERSION="9.4.2"\n'
            f"{debug_commands}\n"
            "dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128\n"
            "if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then\n"
            ' echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."\n'
            " /sbin/m5 exit\n"
            "fi\n"
            "modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0\n"
        )

        return driver_load_command
class ViperCU(ComputeUnit):
    """A gfx9-like compute unit: 4 SIMDs, register files/caches, an LDS, and
    private vector/scalar/instruction L1 TLBs."""

    def __init__(self, cu_id: int, device: AMDGPUDevice):
        """ComputeUnit object of a gfx9-like compute unit."""
        super().__init__()
        self._device = device

        self.cu_id = cu_id

        # Use in multiple places. Define variables to change once.
        self._vreg_file_size = 2048
        self._sreg_file_size = 2048

        # Latencies, etc. use defaults in src/gpu-compute/GPU.py.
        self.num_SIMDs = 4
        self.n_wf = 8

        # 64 KiB local data store shared by the wavefronts on this CU.
        self.localDataStore = LdsState(
            size=65536,
        )

        # One Wavefront object per (SIMD, wavefront slot) pair.
        self.wavefronts = [
            Wavefront(simdId=j, wf_slot_id=k)
            for j in range(self.num_SIMDs)
            for k in range(self.n_wf)
        ]

        # Per-SIMD vector and scalar register files plus a register cache.
        self.vector_register_file = [
            VectorRegisterFile(simd_id=i, num_regs=self._vreg_file_size)
            for i in range(self.num_SIMDs)
        ]

        self.scalar_register_file = [
            ScalarRegisterFile(simd_id=i, num_regs=self._sreg_file_size)
            for i in range(self.num_SIMDs)
        ]

        self.register_file_cache = [
            RegisterFileCache(simd_id=i) for i in range(self.num_SIMDs)
        ]

        # Static allocation policy; one pool manager per SIMD for each of the
        # vector and scalar register files.
        self.register_manager = RegisterManager(
            policy="static",
            vrf_pool_managers=[
                DynPoolManager(pool_size=self._vreg_file_size, min_alloc=4)
                for _ in range(self.num_SIMDs)
            ],
            srf_pool_managers=[
                DynPoolManager(pool_size=self._sreg_file_size, min_alloc=4)
                for _ in range(self.num_SIMDs)
            ],
        )

        # Route LDS accesses: CU -> ldsBus -> local data store.
        self.ldsPort = self.ldsBus.cpu_side_port
        self.ldsBus.mem_side_port = self.localDataStore.cuPort

        self._create_tlbs()

    def _create_tlbs(self):
        """Create the three private L1 TLBs (vector, scalar, instruction),
        each fronted by its own coalescer."""
        # Vector memory TLB
        self.l1_tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.l1_coalescer = VegaTLBCoalescer(tlb_level=1)

        # CU translation requests go through the coalescer into the TLB.
        self.translation_port = self.l1_coalescer.cpu_side_ports
        self.l1_coalescer.mem_side_ports = self.l1_tlb.cpu_side_ports

        # Scalar memory TLB
        self.scalar_tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.scalar_coalescer = VegaTLBCoalescer(tlb_level=1)

        self.scalar_tlb_port = self.scalar_coalescer.cpu_side_ports
        self.scalar_coalescer.mem_side_ports = self.scalar_tlb.cpu_side_ports

        # Instruction memory TLB
        self.sqc_tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.sqc_coalescer = VegaTLBCoalescer(tlb_level=1)

        self.sqc_tlb_port = self.sqc_coalescer.cpu_side_ports
        self.sqc_coalescer.mem_side_ports = self.sqc_tlb.cpu_side_ports

    def get_tlb_ports(self):
        """Return the mem-side ports of all three L1 TLBs, for connection to
        the shader-level L2 TLB coalescer."""
        return [
            self.l1_tlb.mem_side_ports,
            self.sqc_tlb.mem_side_ports,
            self.scalar_tlb.mem_side_ports,
        ]
class ViperShader(Shader):
    def __init__(
        self,
        shader_id: int,
        num_cus: int,
        cache_line_size: int,
        device: AMDGPUDevice,
    ):
        """
        The shader represents a single software-visible GPU (e.g., a
        graphics card, a chiplet on a GPU, etc.).

        :param shader_id: Index of this GPU; non-zero IDs shift the VBIOS
            and legacy-IO BAR addresses so multiple GPUs do not overlap.
        :param num_cus: Number of compute units to instantiate.
        :param cache_line_size: Cache line size forwarded to the memory
            manager.
        :param device: The PCI-facing AMDGPUDevice this shader backs.
        """
        super().__init__()

        self._shader_id = shader_id
        self._cache_line_size = cache_line_size
        self._device = device

        self.n_wf = 8
        self.timing = True
        # used to track the (many, many) DMA ports
        self._cpu_dma_ports = []
        self._gpu_dma_ports = []

        # The VIPER protocol implements release consistency on the GPU side:
        # writes must become visible to global memory, and reads must come
        # from global memory, at kernel boundaries. These flags control
        # whether the pipeline issues an acquire at kernel launch and a
        # release at kernel end. Because VIPER is write-through, only the
        # kernel-launch acquire needs to be set.
        self.impl_kern_launch_acq = True
        self.impl_kern_end_rel = False

        # Attach compute units to GPU
        self.CUs = [ViperCU(idx, device) for idx in range(num_cus)]

        self._create_tlbs(device)

        # This arbitrary address is something in the X86 I/O hole
        hsapp_gpu_map_paddr = 0xE00000000
        self.dispatcher = GPUDispatcher()
        self.gpu_cmd_proc = GPUCommandProcessor(
            hsapp=HSAPacketProcessor(
                pioAddr=hsapp_gpu_map_paddr,
                numHWQueues=10,
                walker=VegaPagetableWalker(),
            ),
            dispatcher=self.dispatcher,
            walker=VegaPagetableWalker(),
        )
        # Command processor and HSA packet processor DMA into host memory;
        # their page-table walkers translate on the GPU side.
        self._cpu_dma_ports.append(self.gpu_cmd_proc.hsapp.dma)
        self._cpu_dma_ports.append(self.gpu_cmd_proc.dma)

        self._gpu_dma_ports.append(self.gpu_cmd_proc.hsapp.walker.port)
        self._gpu_dma_ports.append(self.gpu_cmd_proc.walker.port)

        self.system_hub = AMDGPUSystemHub()
        self._cpu_dma_ports.append(self.system_hub.dma)

        self._setup_device(device)

    def get_compute_units(self):
        """Return the list of ViperCU objects in this shader."""
        return self.CUs

    def _setup_device(self, device: AMDGPUDevice):
        """Set the device type info on the device connected via PCI."""
        device.cp = self.gpu_cmd_proc
        device.device_ih = AMDGPUInterruptHandler()
        self._cpu_dma_ports.append(device.device_ih.dma)

        # GPU data path
        device.memory_manager = AMDGPUMemoryManager(
            cache_line_size=self._cache_line_size,
        )
        self._gpu_dma_ports.append(device.memory_manager.port)

        self._cpu_dma_ports.append(device.dma)

        # Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is
        # a PCI capabilities list to traverse.
        device.Status = 0x0290

        # The PCI capabilities are like a linked list. The list has a memory
        # offset and a capability type ID read by the OS. Make the first
        # capability at 0x80 and set the PXCAP (PCI express) capability to
        # that address. Mark the type ID as PCI express.
        # We leave the next ID of PXCAP blank to end the list.
        device.PXCAPBaseOffset = 0x80
        device.CapabilityPtr = 0x80
        device.PXCAPCapId = 0x10

        # Set bits 7 and 8 in the second PCIe device capabilities register which
        # reports support for PCIe atomics for 32 and 64 bits respectively.
        # Bit 9 for 128-bit compare and swap is not set because the amdgpu driver
        # does not check this.
        device.PXCAPDevCap2 = 0x00000180

        # Set bit 6 to enable atomic requestor, meaning this device can request
        # atomics from other PCI devices.
        device.PXCAPDevCtrl2 = 0x00000040

        # If there are multiple GPUs in the system, make sure the VBIOS region
        # and the legacy IO bar do not overlap with the ranges from other GPUs.
        if self._shader_id != 0:
            device.ExpansionROM = 0xD0000000 + (0x20000 * self._shader_id)
            bar4_addr = 0xF000 + (0x100 * self._shader_id)
            device.BAR4 = PciLegacyIoBar(addr=bar4_addr, size="256B")

    def _create_pm4s(self, pm4_starts: List[int], pm4_ends: List[int]):
        """Create PM4 packet processors, one per (start, end) MMIO range."""
        num_pm4s = len(pm4_starts)

        pm4_procs = [
            PM4PacketProcessor(
                ip_id=i,
                mmio_range=AddrRange(start=pm4_starts[i], end=pm4_ends[i]),
            )
            for i in range(num_pm4s)
        ]

        # PM4 processors DMA into host (CPU-side) memory.
        for pm4_proc in pm4_procs:
            self._cpu_dma_ports.append(pm4_proc.dma)

        return pm4_procs

    def _create_sdmas(self, sdma_bases: List[int], sdma_sizes: List[int]):
        """Create the SDMA engines, one per (mmio_base, mmio_size) pair."""
        num_sdmas = len(sdma_bases)

        sdmas = [
            SDMAEngine(
                walker=VegaPagetableWalker(),
                mmio_base=sdma_bases[i],
                mmio_size=sdma_sizes[i],
            )
            for i in range(num_sdmas)
        ]

        # SDMA engines DMA into host memory; their walkers are GPU-side.
        for sdma in sdmas:
            self._cpu_dma_ports.append(sdma.dma)
            self._gpu_dma_ports.append(sdma.walker.port)

        return sdmas

    def get_cpu_dma_ports(self):
        """Return the DMA ports that target CPU-side (host) memory."""
        return self._cpu_dma_ports

    def get_gpu_dma_ports(self):
        """Return the DMA ports that target GPU-side memory."""
        return self._gpu_dma_ports

    def _create_tlbs(self, device: AMDGPUDevice):
        """Connect per-CU TLBs to the L2/L3 TLBs"""
        self.l2_tlb = VegaGPUTLB(
            gpu_device=device,
            size=4096,
            assoc=64,
            hitLatency=69,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.l2_coalescer = VegaTLBCoalescer(tlb_level=2)

        self.l3_tlb = VegaGPUTLB(
            gpu_device=device,
            size=8192,
            assoc=64,
            hitLatency=150,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )

        self.l3_coalescer = VegaTLBCoalescer(tlb_level=3)

        # Port flow: [L1s] -> L2 coalescer -> L2 tlb -> L3 coalescer -> L3 tlb
        for cu in self.CUs:
            for port in cu.get_tlb_ports():
                self.l2_coalescer.cpu_side_ports = port
        self.l2_coalescer.mem_side_ports = self.l2_tlb.cpu_side_ports
        self.l2_tlb.mem_side_ports = self.l3_coalescer.cpu_side_ports
        self.l3_coalescer.mem_side_ports = self.l3_tlb.cpu_side_ports

        # The L3 TLB's walker performs the final GPU-side translation fetch.
        self._gpu_dma_ports.append(self.l3_tlb.walker.port)

    def connect_iobus(self, iobus: BaseXBar):
        """Connect the GPU objects to the IO bus."""
        self.gpu_cmd_proc.pio = iobus.mem_side_ports
        self.gpu_cmd_proc.hsapp.pio = iobus.mem_side_ports
        self.system_hub.pio = iobus.mem_side_ports
        self._device.pio = iobus.mem_side_ports
        self._device.device_ih.pio = iobus.mem_side_ports
        for sdma in self._device.sdmas:
            sdma.pio = iobus.mem_side_ports
        for pm4_proc in self._device.pm4_pkt_procs:
            pm4_proc.pio = iobus.mem_side_ports

    def set_cpu_pointer(self, cpu: BaseCPU):
        """Set the CPU pointer for the Shader."""
        self.cpu_pointer = cpu
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
class ViperBoard(X86Board):
    """
    A derivative of X86Board capable of full system simulation for X86 with a
    GPU device. Provides all the functionality of the X86Board with helper
    methods specific to booting a disk with GPU libraries installed.
    """

    def __init__(
        self,
        clk_freq: str,
        processor: AbstractProcessor,
        memory: AbstractMemorySystem,
        cache_hierarchy: AbstractCacheHierarchy,
        gpus: Optional[List[BaseViperGPU]] = None,
    ) -> None:
        super().__init__(
            clk_freq=clk_freq,
            processor=processor,
            memory=memory,
            cache_hierarchy=cache_hierarchy,
        )
        # (Fix: removed leftover debug print statements.)
        self._gpus = gpus

    def get_devices(self):
        """Return the list of attached GPUs (or None if there are none)."""
        return self._gpus

    @overrides(AbstractBoard)
    def _connect_things(self) -> None:
        super()._connect_things()

        # Wire each GPU into the board after the base board is connected.
        if self._gpus is not None:
            for gpu in self._gpus:
                gpu.connectGPU(self)

    @overrides(KernelDiskWorkload)
    def get_disk_device(self):
        return "/dev/sda"

    @overrides(KernelDiskWorkload)
    def get_default_kernel_args(self) -> List[str]:
        # The regular parameters used with gem5 plus (1) fbdev_emulation=0
        # to disable having to implement this functionality, (2) blacklist
        # amdgpu because we need to copy the VBIOS into memory first, and (3)
        # blacklist psmouse as amdgpu driver adds new mouse commands which
        # gem5 does not implement and they do not seem to be documented.
        return [
            "earlyprintk=ttyS0",
            "console=ttyS0",
            "lpj=7999923",
            "root={root_value}",
            "drm_kms_helper.fbdev_emulation=0",
            "modprobe.blacklist=amdgpu",
            "modprobe.blacklist=psmouse",
        ]

    # Replicate the capability of the old GPUFS config, which embeds a binary
    # application or script into a bash script setting up the environment and
    # loading the GPU driver.
    def make_gpu_app(self, gpu: BaseViperGPU, app: str, debug: bool = False):
        """Build a guest shell script that loads the GPU driver, decodes the
        base64-embedded application, runs it, and exits gem5.

        :param gpu: The GPU whose driver-load commands should be prepended.
        :param app: Host path to the application/script to embed.
        :param debug: Forwarded to the GPU's get_driver_command().
        """
        driver_load_command = gpu.get_driver_command(debug=debug)

        with open(os.path.abspath(app), "rb") as binfile:
            encoded_bin = base64.b64encode(binfile.read()).decode()

        # NOTE(review): the literal "{}" below is passed verbatim to the app
        # as an argument; it looks like a leftover .format() placeholder from
        # the old GPUFS config — confirm whether app options were intended.
        application_command = (
            f'echo "{encoded_bin}" | base64 -d > myapp\n'
            "chmod +x myapp\n"
            "./myapp {}\n"
            "/sbin/m5 exit\n"
        )

        return driver_load_command + application_command
class ViperCPUCacheHierarchy(AbstractRubyCacheHierarchy):
    """
    The VIPER CPU cache hierarchy creates CPU-side Ruby caches and connects
    the nodes using a simple point-to-point topology.
    """

    def __init__(
        self,
        l1d_size: str,
        l1d_assoc: int,
        l1i_size: str,
        l1i_assoc: int,
        l2_size: str,
        l2_assoc: int,
        l3_size: str,
        l3_assoc: int,
    ):
        """
        :param l1d_size: Size of each per-core-pair L1 data cache.
        :param l1d_assoc: Associativity of the L1 data caches.
        :param l1i_size: Size of each L1 instruction cache.
        :param l1i_assoc: Associativity of the L1 instruction caches.
        :param l2_size: Size of each core-pair L2 cache.
        :param l2_assoc: Associativity of the L2 caches.
        :param l3_size: Size of the L3 cache attached to each directory.
        :param l3_assoc: Associativity of the L3 caches.
        """
        super().__init__()

        self._l1d_size = l1d_size
        self._l1d_assoc = l1d_assoc
        self._l1i_size = l1i_size
        self._l1i_assoc = l1i_assoc
        self._l2_size = l2_size
        self._l2_assoc = l2_assoc
        self._l3_size = l3_size
        self._l3_assoc = l3_assoc

        self.ruby_system = RubySystem()

    @overrides(AbstractCacheHierarchy)
    def incorporate_cache(self, board: ViperBoard) -> None:
        requires(coherence_protocol_required=CoherenceProtocol.GPU_VIPER)

        # Ruby networks for CPU
        self.ruby_system.network = SimplePt2Pt(self.ruby_system)

        # MOESI_AMD_Base uses 5 virtual networks.
        self.ruby_system.number_of_virtual_networks = 5
        self.ruby_system.network.number_of_virtual_networks = 5

        # There is a single local list of all of the controllers to make it
        # easier to connect everything to the CPU network. This can be
        # customized depending on the topology/network requirements.
        # Create one controller for each L1 cache (and the cache mem obj.)
        # Create a single directory controller (Really the memory cntrl).
        self._controllers = []

        cores = board.get_processor().get_cores()
        num_cores = len(cores)
        # Cores are grouped into pairs, each pair sharing one CorePairCache.
        for i in range(0, num_cores, 2):
            cache = CorePairCache(
                l1d_size=self._l1d_size,
                l1d_assoc=self._l1d_assoc,
                l1i_size=self._l1i_size,
                l1i_assoc=self._l1i_assoc,
                l2_size=self._l2_size,
                l2_assoc=self._l2_assoc,
                network=self.ruby_system.network,
                cache_line_size=board.get_cache_line_size(),
                core=cores[i],
            )

            cache.version = i // 2
            cache.ruby_system = self.ruby_system
            cache.clk_domain = board.get_clock_domain()

            # One sequencer per core in the pair, each backed by its own
            # L1D bank of the pair cache.
            cache.sequencer = RubySequencer(
                version=i,
                dcache=cache.L1D0cache,
                ruby_system=self.ruby_system,
                coreid=0,
                is_cpu_sequencer=True,
                clk_domain=board.get_clock_domain(),
            )

            cache.sequencer1 = RubySequencer(
                version=i + 1,
                dcache=cache.L1D1cache,
                ruby_system=self.ruby_system,
                coreid=1,
                is_cpu_sequencer=True,
                clk_domain=board.get_clock_domain(),
            )

            cache.sequencer.connectIOPorts(board.get_io_bus())
            cache.sequencer1.connectIOPorts(board.get_io_bus())

            cores[i].connect_icache(cache.sequencer.in_ports)
            cores[i].connect_dcache(cache.sequencer.in_ports)

            cores[i].connect_walker_ports(
                cache.sequencer.in_ports, cache.sequencer.in_ports
            )

            # Connect the interrupt ports
            int_req_port = cache.sequencer.interrupt_out_port
            int_resp_port = cache.sequencer.in_ports
            cores[i].connect_interrupt(int_req_port, int_resp_port)

            # The second core of the pair only exists when num_cores is even
            # (or this is not the trailing pair).
            if i + 1 < num_cores:
                cores[i + 1].connect_icache(cache.sequencer1.in_ports)
                cores[i + 1].connect_dcache(cache.sequencer1.in_ports)

                # NOTE(review): the first walker port of the odd core is
                # connected to sequencer (core 0's), not sequencer1 —
                # confirm this asymmetry is intended.
                cores[i + 1].connect_walker_ports(
                    cache.sequencer.in_ports, cache.sequencer1.in_ports
                )

                # Connect the interrupt ports
                cores[i + 1].connect_interrupt(int_req_port, int_resp_port)

            self._controllers.append(cache)

        # Create the CPU directory controllers
        self._directory_controllers = []

        # Automatically determine the numa bit. This can be changed to
        # increase the number of bytes to each memory channel before
        # going to the next channels.
        # NOTE(review): neither value is consumed below yet — presumably
        # intended for directory address interleaving. (Fix: the block-size
        # computation was a natural log; use log2 like dir_bits.)
        dir_bits = int(math.log(len(board.get_mem_ports()), 2))
        block_size_bits = int(math.log2(board.get_cache_line_size()))

        for addr_range, port in board.get_mem_ports():
            directory = ViperCPUDirectory(
                self.ruby_system.network,
                board.get_cache_line_size(),
                addr_range,
                port,
            )
            directory.ruby_system = self.ruby_system
            directory.version = len(self._directory_controllers)
            self._directory_controllers.append(directory)

            directory.L3CacheMemory = RubyCache(
                size=self._l3_size,
                assoc=self._l3_assoc,
                replacement_policy=TreePLRURP(),
                resourceStalls=False,
                dataArrayBanks=16,
                tagArrayBanks=16,
                dataAccessLatency=20,
                tagAccessLatency=15,
            )

        # Create the DMA Controllers, if required.
        self._dma_controllers = []
        if board.has_dma_ports():
            dma_ports = board.get_dma_ports()
            for i, port in enumerate(dma_ports):
                ctrl = ViperCPUDMAController(
                    self.ruby_system.network, board.get_cache_line_size()
                )
                ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)

                ctrl.ruby_system = self.ruby_system
                ctrl.dma_sequencer.ruby_system = self.ruby_system

                self._dma_controllers.append(ctrl)

        # Create DMA Controllers required for any devices in the system.
        device_dmas = []
        if board.get_devices() is not None:
            for device in board.get_devices():
                device_dmas += device.get_cpu_dma_ports()

        for port in device_dmas:
            ctrl = ViperCPUDMAController(
                self.ruby_system.network, board.get_cache_line_size()
            )
            # Versions continue after the board DMA sequencers created above.
            ctrl.dma_sequencer = DMASequencer(
                version=len(self._dma_controllers), in_ports=port
            )

            ctrl.ruby_system = self.ruby_system
            ctrl.dma_sequencer.ruby_system = self.ruby_system

            self._dma_controllers.append(ctrl)

        # Number of sequencers = one per core pair + one per DMA
        self.ruby_system.num_of_sequencers = len(self._controllers) * 2 + len(
            self._dma_controllers
        )

        # Assign the controllers to their parent objects.
        self.ruby_system.controllers = self._controllers
        self.ruby_system.directory_controllers = self._directory_controllers

        if len(self._dma_controllers) != 0:
            self.ruby_system.dma_controllers = self._dma_controllers

        # Connect the controllers using the network topology
        self.ruby_system.network.connect(
            self._controllers
            + self._directory_controllers
            + self._dma_controllers
        )
        self.ruby_system.network.setup_buffers()

        # Set up a proxy port for the system_port. Used for load binaries and
        # other functional-only things.
        self.ruby_system.sys_port_proxy = RubyPortProxy(
            ruby_system=self.ruby_system
        )
        board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
    """The VIPER GPU-side Ruby cache hierarchy for the prebuilt Viper board.

    Builds the complete GPU protocol side of the system:

    * one TCP (vector L1 data cache) per compute unit,
    * one SQC (instruction cache) and one scalar cache shared by every
      ``cu_per_sqc`` compute units,
    * ``tcc_count`` TCC (GPU L2) banks,
    * one directory (with an attached L3 cache) plus one HBM2 memory
      controller per memory channel, and
    * one DMA controller per shader DMA port,

    all joined by a two-level crossbar network (``SimpleDoubleCrossbar``).
    """

    # Class-level counter: sequencer versions must be globally unique
    # across every sequencer/coalescer created by this hierarchy.
    _seqs = 0

    @classmethod
    def seqCount(cls):
        """Return the next globally-unique sequencer version number."""
        # Use seqCount, not a per-instance counter, since we need a global
        # count across all controllers.
        cls._seqs += 1
        return cls._seqs - 1

    def __init__(
        self,
        tcp_size: str,
        tcp_assoc: int,
        sqc_size: str,
        sqc_assoc: int,
        scalar_size: str,
        scalar_assoc: int,
        tcc_size: str,
        tcc_assoc: int,
        tcc_count: int,
        cu_per_sqc: int,
        num_memory_channels: int,
        cache_line_size: int,
        shader: ViperShader,
    ):
        """
        :param tcp_size: Size of each per-CU TCP (vector L1D) cache.
        :param tcp_assoc: Associativity of each TCP cache.
        :param sqc_size: Size of each SQC (instruction) cache.
        :param sqc_assoc: Associativity of each SQC cache.
        :param scalar_size: Size of each scalar cache.
        :param scalar_assoc: Associativity of each scalar cache.
        :param tcc_size: Size of each TCC (GPU L2) bank.
        :param tcc_assoc: Associativity of each TCC bank.
        :param tcc_count: Number of TCC banks; assumed to be a power of
            two (used to compute ``TCC_select_num_bits``).
        :param cu_per_sqc: Number of CUs sharing one SQC/scalar cache;
            must evenly divide the shader's CU count.
        :param num_memory_channels: Number of GPU memory channels;
            assumed to be a power of two (used for address interleaving).
        :param cache_line_size: Cache line size in bytes for this GPU.
        :param shader: The shader whose CUs and DMA ports are wired up.
        """
        super().__init__()

        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._cache_line_size = cache_line_size

        # We have everything we need to know to create the GPU cache
        # hierarchy immediately. Therefore, an incorporate_cache method is
        # not part of this cache hierarchy. Go ahead and incorporate
        # everything now.
        requires(coherence_protocol_required=CoherenceProtocol.GPU_VIPER)

        self.ruby_gpu = RubySystem()
        self.ruby_gpu.block_size_bytes = cache_line_size

        # Ruby network for this GPU.
        self.ruby_gpu.network = SimpleDoubleCrossbar(self.ruby_gpu)

        # VIPER uses 6 virtual networks.
        self.ruby_gpu.number_of_virtual_networks = 6
        self.ruby_gpu.network.number_of_virtual_networks = 6

        # There is a single local list of all of the controllers to make it
        # easier to connect everything to the GPU network. This can be
        # customized depending on the topology/network requirements.
        self._controllers = []
        self._directory_controllers = []
        self._dma_controllers = []
        self._mem_ctrls = []

        self.clk_domain = SrcClockDomain(
            clock="1801MHz",
            voltage_domain=VoltageDomain(),
        )

        # Variables used by multiple objects are defined once here.
        # math.log2 is exact for powers of two, unlike math.log(x, 2),
        # which may round the result down after int() truncation.
        tcc_bits = int(math.log2(tcc_count))
        deadlock_threshold = 500000

        # Create one TCP per CU.
        compute_units = shader.get_compute_units()
        for idx, cu in enumerate(compute_units):
            tcp = TCPCache(
                tcp_size=self._tcp_size,
                tcp_assoc=self._tcp_assoc,
                network=self.ruby_gpu.network,
                cache_line_size=self._cache_line_size,
            )

            tcp.version = idx

            tcp.sequencer = RubySequencer(
                version=self.seqCount(),
                dcache=tcp.L1cache,
                ruby_system=self.ruby_gpu,
                is_cpu_sequencer=True,
            )

            tcp.coalescer = VIPERCoalescer(
                version=self.seqCount(),
                icache=tcp.L1cache,
                dcache=tcp.L1cache,
                ruby_system=self.ruby_gpu,
                support_inst_reqs=False,
                is_cpu_sequencer=False,
                deadlock_threshold=deadlock_threshold,
                max_coalesces_per_cycle=1,
                gmTokenPort=cu.gmTokenPort,
            )

            # Every lane (wf_size ports) of the CU issues memory requests
            # through the coalescer.
            for port_idx in range(cu.wf_size):
                cu.memory_port[port_idx] = tcp.coalescer.in_ports

            tcp.ruby_system = self.ruby_gpu
            tcp.TCC_select_num_bits = tcc_bits
            tcp.use_seq_not_coal = False
            tcp.issue_latency = 1
            tcp.clk_domain = self.clk_domain
            tcp.recycle_latency = 10
            tcp.WB = False
            tcp.disableL1 = False

            self._controllers.append(tcp)

        # This check ensures there are a same number of CUs with shared SQC
        # and Scalar caches. Raise (rather than assert) so the check still
        # fires when Python runs with assertions disabled (-O).
        num_cus = len(compute_units)
        if num_cus % cu_per_sqc != 0:
            raise ValueError(
                f"Number of CUs ({num_cus}) must be a multiple of "
                f"cu_per_sqc ({cu_per_sqc})"
            )
        num_sqcs = num_cus // cu_per_sqc

        for idx in range(num_sqcs):
            sqc = SQCCache(
                sqc_size=self._sqc_size,
                sqc_assoc=self._sqc_assoc,
                network=self.ruby_gpu.network,
                cache_line_size=self._cache_line_size,
            )

            sqc.version = idx

            sqc.sequencer = RubySequencer(
                version=self.seqCount(),
                dcache=sqc.L1cache,
                ruby_system=self.ruby_gpu,
                support_data_reqs=False,
                is_cpu_sequencer=False,
                deadlock_threshold=deadlock_threshold,
            )

            # SQC is shared across {cu_per_sqc} CUs.
            cu_base = cu_per_sqc * idx
            for cu_num in range(cu_per_sqc):
                cu_id = cu_base + cu_num
                compute_units[cu_id].sqc_port = sqc.sequencer.in_ports

            sqc.ruby_system = self.ruby_gpu
            sqc.TCC_select_num_bits = tcc_bits
            sqc.clk_domain = self.clk_domain
            sqc.recycle_latency = 10

            self._controllers.append(sqc)

        # One scalar cache per SQC; scalar caches reuse the SQC controller
        # type.
        num_scalars = num_sqcs
        for idx in range(num_scalars):
            scalar = SQCCache(
                sqc_size=self._scalar_size,
                sqc_assoc=self._scalar_assoc,
                network=self.ruby_gpu.network,
                cache_line_size=self._cache_line_size,
            )

            # Scalar uses same controller as SQC, so add SQC count to keep
            # controller versions unique.
            scalar.version = idx + num_sqcs

            scalar.sequencer = RubySequencer(
                version=self.seqCount(),
                dcache=scalar.L1cache,
                ruby_system=self.ruby_gpu,
                support_data_reqs=False,
                is_cpu_sequencer=False,
                deadlock_threshold=deadlock_threshold,
            )

            # Scalar cache is shared across {cu_per_sqc} CUs.
            cu_base = cu_per_sqc * idx
            for cu_num in range(cu_per_sqc):
                cu_id = cu_base + cu_num
                compute_units[cu_id].scalar_port = scalar.sequencer.in_ports

            scalar.ruby_system = self.ruby_gpu
            scalar.TCC_select_num_bits = tcc_bits
            scalar.clk_domain = self.clk_domain
            scalar.recycle_latency = 10

            self._controllers.append(scalar)

        # Create TCCs (GPU L2 cache).
        for idx in range(tcc_count):
            tcc = TCCCache(
                tcc_size=self._tcc_size,
                tcc_assoc=self._tcc_assoc,
                network=self.ruby_gpu.network,
                cache_line_size=self._cache_line_size,
            )

            tcc.version = idx

            tcc.ruby_system = self.ruby_gpu
            tcc.WB = False
            tcc.clk_domain = self.clk_domain
            tcc.recycle_latency = 10

            self._controllers.append(tcc)

        # Create DMA controllers, one per shader DMA port.
        for i, port in enumerate(shader.get_gpu_dma_ports()):
            ctrl = ViperGPUDMAController(
                self.ruby_gpu.network, self._cache_line_size
            )
            ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)

            ctrl.ruby_system = self.ruby_gpu
            ctrl.dma_sequencer.ruby_system = self.ruby_gpu

            self._dma_controllers.append(ctrl)

        # Create GPU memories. Currently fixed to HBM2.
        mem_type_cls = HBM_2000_4H_1x64

        # AMDGPUDevice currently tells the driver there is 16GiB of memory.
        # Until that is a parameter, this needs to be fixed to 16GiB.
        gpu_mem_range = AddrRange(0, size="16GiB")
        intlv_low_bit = int(math.log2(self._cache_line_size))
        intlv_bits = int(math.log2(num_memory_channels))

        for idx in range(num_memory_channels):
            # Each channel covers the whole range, interleaved at
            # cache-line granularity across channels.
            addr_range = AddrRange(
                gpu_mem_range.start,
                size=gpu_mem_range.size(),
                intlvHighBit=intlv_low_bit + intlv_bits - 1,
                intlvBits=intlv_bits,
                intlvMatch=idx,
                xorHighBit=0,
            )

            mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range))
            self._mem_ctrls.append(mem_ctrl)

            # "directory" rather than "dir" to avoid shadowing the builtin.
            directory = ViperGPUDirectory(
                self.ruby_gpu.network,
                self._cache_line_size,
                addr_range,
                self._mem_ctrls[idx].port,
            )

            directory.ruby_system = self.ruby_gpu
            directory.TCC_select_num_bits = tcc_bits
            directory.version = len(self._directory_controllers)
            self._directory_controllers.append(directory)

            # Directory-attached L3 cache.
            directory.L3CacheMemory = RubyCache(
                size="16MiB",
                assoc=16,
                atomicALUs=64,
                replacement_policy=TreePLRURP(),
                resourceStalls=False,
                dataArrayBanks=16,
                tagArrayBanks=16,
                dataAccessLatency=20,
                tagAccessLatency=15,
            )

        # Number of sequencers = one per TCP, SQC, and Scalar + one per DMA.
        self.ruby_gpu.num_of_sequencers = len(self._controllers) + len(
            self._dma_controllers
        )

        # Assign the controllers to their parent objects.
        self.ruby_gpu.controllers = self._controllers
        self.ruby_gpu.directory_controllers = self._directory_controllers

        # Parent the DMA controllers as well, mirroring the CPU-side
        # hierarchy; without this assignment they are never attached to
        # the SimObject tree.
        if len(self._dma_controllers) != 0:
            self.ruby_gpu.dma_controllers = self._dma_controllers

        # Connect the controllers using the network topology.
        self.ruby_gpu.network.connect(
            self._controllers
            + self._directory_controllers
            + self._dma_controllers
        )
        self.ruby_gpu.network.setup_buffers()

    def get_mem_ctrls(self):
        """Return the GPU memory controllers (one per memory channel)."""
        return self._mem_ctrls
class SimplePt2Pt(SimpleNetwork):
    """Fully-connected point-to-point topology on SimpleNetwork.

    Every controller gets a dedicated switch, and each ordered pair of
    switches is joined by its own internal link. This does not use garnet.
    """

    def __init__(self, ruby_system):
        super().__init__()
        self.netifs = []

        # TODO: These should be in a base class
        # https://gem5.atlassian.net/browse/GEM5-1039
        self.ruby_system = ruby_system

    def connect(self, controllers):
        """Attach each controller to its own router, then fully mesh the
        routers with directed internal links.
        """
        # One switch per controller; router_id mirrors the controller index.
        self.routers = [
            Switch(router_id=num) for num in range(len(controllers))
        ]

        # External links: each controller hangs off its dedicated router.
        self.ext_links = [
            SimpleExtLink(link_id=num, ext_node=ctrl, int_node=router)
            for num, (ctrl, router) in enumerate(
                zip(controllers, self.routers)
            )
        ]

        # Internal links: one directed link per ordered pair of distinct
        # routers. Link ids start at 1, matching the original counter.
        ordered_pairs = (
            (src, dst)
            for src in self.routers
            for dst in self.routers
            if src != dst  # don't connect a router to itself
        )
        self.int_links = [
            SimpleIntLink(link_id=num, src_node=src, dst_node=dst)
            for num, (src, dst) in enumerate(ordered_pairs, start=1)
        ]
class SimpleDoubleCrossbar(SimpleNetwork):
    """
    GPU network with two crossbars built on SimpleNetwork: one between the
    CU-side caches (TCP/SQC) and the L2s (TCC), and one between the L2s
    and the directories/memory controllers/DMAs. The two crossbars are
    bridged by a pair of directed links.
    """

    def __init__(self, ruby_system):
        super().__init__()
        self.netifs = []

        self.ruby_system = ruby_system

    def connect(self, controllers):
        """Wire each controller to its own router and each router to the
        crossbar matching its controller type.
        """
        # Which controller types hang off which crossbar.
        l2_xbar_types = ("TCP_Controller", "SQC_Controller", "TCC_Controller")
        soc_xbar_types = ("DMA_Controller", "Directory_Controller")

        num_ctrls = len(controllers)

        # One router per controller, plus two extra switches acting as the
        # L2 crossbar and the SoC crossbar respectively.
        self.routers = [
            Switch(router_id=rid) for rid in range(num_ctrls + 2)
        ]
        l2_xbar = self.routers[num_ctrls]
        soc_xbar = self.routers[num_ctrls + 1]

        # Routers 0 .. N-1 each host exactly one controller.
        self.ext_links = [
            SimpleExtLink(
                link_id=num, ext_node=ctrl, int_node=self.routers[num]
            )
            for num, ctrl in enumerate(controllers)
        ]

        links = []

        def add_bidir(node_a, node_b):
            # Append a directed link pair; link ids are assigned
            # sequentially from the current list length.
            links.append(
                SimpleIntLink(
                    link_id=len(links), src_node=node_a, dst_node=node_b
                )
            )
            links.append(
                SimpleIntLink(
                    link_id=len(links), src_node=node_b, dst_node=node_a
                )
            )

        # Connect each controller's router to its crossbar in both
        # directions, in ext_link order.
        for ext_link in self.ext_links:
            ctrl_type = ext_link.ext_node.type
            if ctrl_type in l2_xbar_types:
                add_bidir(ext_link.int_node, l2_xbar)
            elif ctrl_type in soc_xbar_types:
                add_bidir(ext_link.int_node, soc_xbar)

        # Bridge the L2 crossbar and the SoC crossbar.
        add_bidir(l2_xbar, soc_xbar)

        # Finalize network int_links for unproxy
        self.int_links = links