stdlib: Add viper board, viper cache, and gpu components

Adds GPU_VIPER protocol related caches to stdlib components: CorePair
cache, TCP, SQC, TCC, Directory, and DMA controllers. Adds GPU related
components in a new components/devices/gpus/ directory. Adds prebuilt
GPU and CPU cache hierarchies, GPU and CPU network classes, and a board
overriding the X86Board to provide helper methods for disk image root,
the complex kernel parameter list, and method to provide functionality
to the current GPUFS scripts to load in applications and handle loading
the GPU driver.

The new GPU components can be used as follows:
 - Create a GPU device *before* the CPU cache hierarchy is created.
 - Add the GPU's CPU-side DMA controllers to the list of CPU cache
   controllers.
 - Use GPU device method to connect to an AbstractBoard.

Each GPU component has its own RubySystem, PCI device ID, and address
ranges for VBIOS and legacy PCI BARs. Therefore, in theory, multiple
GPUs can be created. This requires PR #1453.

An example of using this board is added to configs/example/gem5_library
under x86-mi300x-gpu.py. It is designed to work with the disk image,
kernel, and applications provided in the gem5-resources repository.

Change-Id: Ie65ffcfee5e311d9492de935d6d0631260645cd3
This commit is contained in:
Matthew Poremba
2023-06-28 16:44:37 -07:00
committed by Bobby R. Bruce
parent 44b8f5f422
commit 2105dc47a9
18 changed files with 2292 additions and 0 deletions

View File

@@ -0,0 +1,155 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
Script to run a full system GPU simulation.
Usage:
------
```
scons build/VEGA_X86/gem5.opt
./build/VEGA_X86/gem5.opt
configs/example/gem5_library/x86-viper-gpu.py
--image <disk image>
--kernel <kernel>
--app <gpu application>
```
Example:
--------
```
./build/VEGA_X86/gem5.opt
configs/example/gem5_library/x86-viper-gpu.py
--image ./gem5-resources/src/x86-ubuntu-gpu-ml/disk-image/x86-ubuntu-gpu-ml
--kernel ./gem5-resources/src/x86-ubuntu-gpu-ml/vmlinux-gpu-ml
--app ./gem5-resources/src/gpu/square/bin.default/square.default
```
"""
import argparse
from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.devices.gpus.amdgpu import MI300X
from gem5.components.memory.single_channel import SingleChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_processor import SimpleProcessor
from gem5.isas import ISA
from gem5.prebuilt.viper.board import ViperBoard
from gem5.prebuilt.viper.cpu_cache_hierarchy import ViperCPUCacheHierarchy
from gem5.resources.resource import (
DiskImageResource,
FileResource,
)
from gem5.simulate.simulator import Simulator
from gem5.utils.requires import requires
# Sanity-check the build: this workload needs the X86 ISA compiled with the
# GPU_VIPER Ruby coherence protocol (e.g., the VEGA_X86 build target).
requires(
    isa_required=ISA.X86,
    coherence_protocol_required=CoherenceProtocol.GPU_VIPER,
)

# All inputs (kernel, disk image, and GPU application) are supplied as local
# file paths rather than fetched from gem5-resources.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    "--image",
    type=str,
    required=True,
    help="Full path to the gem5-resources x86-ubuntu-gpu-ml disk-image.",
)
arg_parser.add_argument(
    "--kernel",
    type=str,
    required=True,
    help="Full path to the gem5-resources vmlinux-gpu-ml kernel.",
)
arg_parser.add_argument(
    "--app",
    type=str,
    required=True,
    help="Path to GPU application, python script, or bash script to run",
)
arg_parser.add_argument(
    "--kvm-perf",
    default=False,
    action="store_true",
    help="Use KVM perf counters to give accurate GPU insts/cycles with KVM",
)
args = arg_parser.parse_args()

# stdlib only supports up to 3GiB currently. This will need to be expanded in
# the future.
system_memory = SingleChannelDDR4_2400(size="3GiB")

# Note: Only KVM and ATOMIC work due to buggy MOESI_AMD_Base protocol.
cpu = SimpleProcessor(cpu_type=CPUTypes.KVM, isa=ISA.X86, num_cores=2)
for cpu_core in cpu.cores:
    # Perf counters are optional for KVM cores; enable only when requested.
    if cpu_core.is_kvm_core():
        cpu_core.get_simobject().usePerf = args.kvm_perf

# The GPU must be created first so we can assign CPU-side DMA ports to the
# CPU cache hierarchy.
mi300x = MI300X()

cpu_caches = ViperCPUCacheHierarchy(
    l1d_size="32KiB",
    l1d_assoc=8,
    l1i_size="32KiB",
    l1i_assoc=8,
    l2_size="1MiB",
    l2_assoc=16,
    l3_size="16MiB",
    l3_assoc=16,
)

board = ViperBoard(
    clk_freq="3GHz",
    processor=cpu,
    memory=system_memory,
    cache_hierarchy=cpu_caches,
    gpus=[mi300x],
)

# Example of using a local disk image resource.
disk_image = DiskImageResource(local_path=args.image, root_partition="1")
linux_kernel = FileResource(local_path=args.kernel)

board.set_kernel_disk_workload(
    kernel=linux_kernel,
    disk_image=disk_image,
    readfile_contents=board.make_gpu_app(mi300x, args.app),
)

Simulator(board=board).run()

View File

@@ -175,10 +175,34 @@ PySource('gem5.components.cachehierarchies.ruby.caches.mi_example',
'dma_controller.py')
PySource('gem5.components.cachehierarchies.ruby.caches.mi_example',
'gem5/components/cachehierarchies/ruby/caches/mi_example/l1_cache.py')
# GPU_VIPER (viper) Ruby cache controller sources.
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/__init__.py')
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/corepair_cache.py')
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/directory.py')
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/dma_controller.py')
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/tcp.py')
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/sqc.py')
PySource('gem5.components.cachehierarchies.ruby.caches.viper',
    'gem5/components/cachehierarchies/ruby/caches/viper/tcc.py')
# Ruby network topology sources used by the viper hierarchies.
PySource('gem5.components.cachehierarchies.ruby.topologies',
    'gem5/components/cachehierarchies/ruby/topologies/__init__.py')
PySource('gem5.components.cachehierarchies.ruby.topologies',
    'gem5/components/cachehierarchies/ruby/topologies/simple_pt2pt.py')
# GPU device model sources (AMD GPU device and shader wrappers).
PySource('gem5.components.devices',
    'gem5/components/devices/__init__.py')
PySource('gem5.components.devices.gpus',
    'gem5/components/devices/gpus/__init__.py')
PySource('gem5.components.devices.gpus',
    'gem5/components/devices/gpus/amdgpu.py')
PySource('gem5.components.devices.gpus',
    'gem5/components/devices/gpus/viper_shader.py')
PySource('gem5.components.memory', 'gem5/components/memory/__init__.py')
PySource('gem5.components.memory', 'gem5/components/memory/abstract_memory_system.py')
PySource('gem5.components.memory', 'gem5/components/memory/dramsim_3.py')
@@ -289,6 +313,14 @@ PySource('gem5.prebuilt.riscvmatched',
'gem5/prebuilt/riscvmatched/riscvmatched_processor.py')
PySource('gem5.prebuilt.riscvmatched',
'gem5/prebuilt/riscvmatched/riscvmatched_core.py')
# Prebuilt viper board, CPU/GPU cache hierarchies, and network sources.
PySource('gem5.prebuilt.viper', 'gem5/prebuilt/viper/__init__.py')
PySource('gem5.prebuilt.viper', 'gem5/prebuilt/viper/board.py')
PySource('gem5.prebuilt.viper',
    'gem5/prebuilt/viper/cpu_cache_hierarchy.py')
PySource('gem5.prebuilt.viper',
    'gem5/prebuilt/viper/gpu_cache_hierarchy.py')
PySource('gem5.prebuilt.viper',
    'gem5/prebuilt/viper/viper_network.py')
PySource('gem5.resources', 'gem5/resources/__init__.py')
PySource('gem5.resources', 'gem5/resources/client.py')
PySource('gem5.resources', 'gem5/resources/downloader.py')

View File

@@ -0,0 +1,124 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import math
from m5.objects import (
CorePair_Controller,
MessageBuffer,
RubyCache,
TreePLRURP,
)
from gem5.components.processors.abstract_core import AbstractCore
class CorePairCache(CorePair_Controller):
    def __init__(
        self,
        l1i_size: str,
        l1i_assoc: int,
        l1d_size: str,
        l1d_assoc: int,
        l2_size: str,
        l2_assoc: int,
        network,
        cache_line_size,
        core: AbstractCore,
    ):
        """Creating CorePair cache controller. Consists of both instruction
        and data cache for a pair of L1s and a single L2 cache shared between
        them.

        :param l1i_size: Size of the shared L1 instruction cache.
        :param l1i_assoc: Associativity of the L1 instruction cache.
        :param l1d_size: Size of each per-core L1 data cache.
        :param l1d_assoc: Associativity of the L1 data caches.
        :param l2_size: Size of the L2 cache shared by the core pair.
        :param l2_assoc: Associativity of the L2 cache.
        :param network: Ruby network this controller's queues attach to.
        :param cache_line_size: Cache line size in bytes.
            NOTE(review): not referenced in this block — presumably kept for
            interface consistency with the other viper controllers; confirm.
        :param core: Core used to decide whether evictions must be forwarded.
        """
        super().__init__()

        # Some CPU models (e.g., out-of-order x86) require eviction
        # notifications; query the core rather than hard-coding it.
        self.send_evictions = core.requires_send_evicts()

        # Shared L1 instruction cache for the core pair.
        self.L1Icache = RubyCache(
            size=l1i_size,
            assoc=l1i_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=2,
            tagArrayBanks=2,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )

        # Per-core L1 data cache for core 0 of the pair.
        self.L1D0cache = RubyCache(
            size=l1d_size,
            assoc=l1d_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=2,
            tagArrayBanks=2,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )

        # Per-core L1 data cache for core 1 of the pair.
        self.L1D1cache = RubyCache(
            size=l1d_size,
            assoc=l1d_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=2,
            tagArrayBanks=2,
            dataAccessLatency=1,
            tagAccessLatency=1,
        )

        # L2 cache shared between the two cores of the pair. Access latencies
        # are left at the RubyCache defaults here.
        self.L2cache = RubyCache(
            size=l2_size,
            assoc=l2_assoc,
            replacement_policy=TreePLRURP(),
            resourceStalls=False,
            dataArrayBanks=16,
            tagArrayBanks=16,
        )

        self.connectQueues(network)

    def connectQueues(self, network):
        """Create this controller's message buffers and wire them to the
        Ruby network (out_port -> network.in_port, in_port <- network.out_port).
        """
        # Core-to-network traffic: requests, responses, and unblocks.
        self.requestFromCore = MessageBuffer()
        self.requestFromCore.out_port = network.in_port
        self.responseFromCore = MessageBuffer()
        self.responseFromCore.out_port = network.in_port
        self.unblockFromCore = MessageBuffer()
        self.unblockFromCore.out_port = network.in_port

        # Network-to-core traffic: probes and responses.
        self.probeToCore = MessageBuffer()
        self.probeToCore.in_port = network.out_port
        self.responseToCore = MessageBuffer()
        self.responseToCore.in_port = network.out_port

        # Local (non-networked) queues used inside the controller.
        self.mandatoryQueue = MessageBuffer()
        self.triggerQueue = MessageBuffer(ordered=True)

View File

@@ -0,0 +1,106 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import (
MessageBuffer,
RubyDirectoryMemory,
)
from ......utils.override import overrides
from ..abstract_directory import AbstractDirectory
class ViperDirectory(AbstractDirectory):
    def __init__(self, network, cache_line_size, mem_range, port):
        """Directory controller for the GPU_VIPER protocol.

        :param network: Ruby network this directory attaches to.
        :param cache_line_size: Cache line (block) size in bytes.
        :param mem_range: Address range served by this directory.
        :param port: Memory-side port this directory forwards to.
        """
        super().__init__(network, cache_line_size)
        self.addr_ranges = [mem_range]
        self.directory = RubyDirectoryMemory(
            block_size=cache_line_size,
            ruby_system=network.ruby_system,
        )
        # Connect this directory to the memory side.
        self.memory_out_port = port

        # Turn off TCC (GPU cache) related parameters
        self.noTCCdir = True
        self.TCC_select_num_bits = 0

        # Defaults which must be set. The CPU-/GPU-side subclasses override
        # CPUonly/GPUonly after calling this constructor.
        self.CPUonly = False
        self.GPUonly = False
        self.useL3OnWT = False
        self.L2isWB = False

    @overrides(AbstractDirectory)
    def connectQueues(self, network):
        """Create and wire this directory's message buffers to the network."""
        # DMA request/response path.
        self.requestFromDMA = MessageBuffer(ordered=True)
        self.requestFromDMA.in_port = network.out_port
        self.responseToDMA = MessageBuffer()
        self.responseToDMA.out_port = network.in_port

        # Inbound traffic from the cores: requests, responses, unblocks.
        self.requestFromCores = MessageBuffer(ordered=True)
        self.requestFromCores.in_port = network.out_port
        self.responseFromCores = MessageBuffer()
        self.responseFromCores.in_port = network.out_port
        self.unblockFromCores = MessageBuffer()
        self.unblockFromCores.in_port = network.out_port

        # Outbound traffic toward the cores: probes and responses.
        self.probeToCore = MessageBuffer()
        self.probeToCore.out_port = network.in_port
        self.responseToCore = MessageBuffer()
        self.responseToCore.out_port = network.in_port

        # Internal trigger queues and the memory-side buffers (not networked).
        self.triggerQueue = MessageBuffer(ordered=True)
        self.L3triggerQueue = MessageBuffer(ordered=True)
        self.requestToMemory = MessageBuffer()
        self.responseFromMemory = MessageBuffer()
class ViperCPUDirectory(ViperDirectory):
    """Viper directory intended for the CPU side of the system."""

    def __init__(self, network, cache_line_size, mem_range, port):
        super().__init__(network, cache_line_size, mem_range, port)
        # Serve CPU traffic only; explicitly clear the GPU-only flag.
        self.GPUonly = False
        self.CPUonly = True
class ViperGPUDirectory(ViperDirectory):
    """Viper directory intended for the GPU side of the system."""

    def __init__(self, network, cache_line_size, mem_range, port):
        super().__init__(network, cache_line_size, mem_range, port)
        # Serve GPU traffic only; explicitly clear the CPU-only flag.
        self.CPUonly = False
        self.GPUonly = True

View File

@@ -0,0 +1,68 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import MessageBuffer
from ......utils.override import overrides
from ..abstract_dma_controller import AbstractDMAController
# There is a separate controller for the CPU and the GPU to keep the
# "version" numbers incrementing separately
class ViperCPUDMAController(AbstractDMAController):
    """DMA controller attached to the CPU side of the VIPER hierarchy."""

    def __init__(self, network, cache_line_size):
        super().__init__(network, cache_line_size)

    @overrides(AbstractDMAController)
    def connectQueues(self, network):
        # A buffer size of 0 means it is an infinite queue. The VIPER
        # DMA controller has not been thoroughly tested with finite buffers.
        self.mandatoryQueue = MessageBuffer(buffer_size=0)
        self.responseFromDir = MessageBuffer(buffer_size=0)
        self.responseFromDir.in_port = network.out_port
        self.requestToDir = MessageBuffer(buffer_size=0)
        self.requestToDir.out_port = network.in_port
class ViperGPUDMAController(AbstractDMAController):
    """DMA controller attached to the GPU side of the VIPER hierarchy."""

    def __init__(self, network, cache_line_size):
        super().__init__(network, cache_line_size)

    @overrides(AbstractDMAController)
    def connectQueues(self, network):
        # A buffer size of 0 means it is an infinite queue. The VIPER
        # DMA controller has not been thoroughly tested with finite buffers.
        self.mandatoryQueue = MessageBuffer(buffer_size=0)
        self.responseFromDir = MessageBuffer(buffer_size=0)
        self.responseFromDir.in_port = network.out_port
        self.requestToDir = MessageBuffer(buffer_size=0)
        self.requestToDir.out_port = network.in_port

View File

@@ -0,0 +1,73 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import (
MessageBuffer,
RubyCache,
SQC_Controller,
TreePLRURP,
)
class SQCCache(SQC_Controller):
    def __init__(
        self,
        sqc_size: str,
        sqc_assoc: int,
        network,
        cache_line_size,
    ):
        """SQC cache controller: the instruction cache for GPU devices,
        shared by a group of compute units.
        """
        super().__init__()

        # Backing instruction cache for this SQC controller.
        self.L1cache = RubyCache(
            replacement_policy=TreePLRURP(),
            size=sqc_size,
            assoc=sqc_assoc,
            tagAccessLatency=1,
            dataAccessLatency=1,
            tagArrayBanks=8,
            dataArrayBanks=8,
            resourceStalls=True,
        )
        self.connectQueues(network)

    def connectQueues(self, network):
        """Create the SQC message buffers and wire them to the network."""
        # Outbound: instruction fetch requests into the network.
        self.requestFromSQC = MessageBuffer(ordered=True)
        self.requestFromSQC.out_port = network.in_port

        # Inbound: probes and fetch responses from the network.
        self.probeToSQC = MessageBuffer(ordered=True)
        self.responseToSQC = MessageBuffer(ordered=True)
        self.probeToSQC.in_port = network.out_port
        self.responseToSQC.in_port = network.out_port

        # Local mandatory queue (not networked).
        self.mandatoryQueue = MessageBuffer()

View File

@@ -0,0 +1,87 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import (
MessageBuffer,
RubyCache,
TCC_Controller,
TreePLRURP,
)
class TCCCache(TCC_Controller):
    def __init__(
        self,
        tcc_size: str,
        tcc_assoc: int,
        network,
        cache_line_size,
    ):
        """Creating TCC cache controller. This is the L2 cache for GPU devices.

        :param tcc_size: Size of this TCC slice.
        :param tcc_assoc: Associativity of the TCC.
        :param network: Ruby network this controller's queues attach to.
        :param cache_line_size: Cache line size in bytes.
            NOTE(review): not referenced in this block — presumably kept for
            interface consistency with the other viper controllers; confirm.
        """
        super().__init__()

        # Shared GPU L2. atomicALUs/atomicLatency configure in-cache atomic
        # operation handling at the TCC.
        self.L2cache = RubyCache(
            size=tcc_size,
            assoc=tcc_assoc,
            dataArrayBanks=256,
            tagArrayBanks=256,
            dataAccessLatency=8,
            tagAccessLatency=2,
            resourceStalls=True,
            replacement_policy=TreePLRURP(),
            atomicLatency=0,
            atomicALUs=64,
        )

        self.connectQueues(network)

    def connectQueues(self, network):
        """Create the TCC message buffers and wire them to the network."""
        # TCP-facing side: requests arrive from the TCPs, responses go back.
        self.requestFromTCP = MessageBuffer(ordered=True)
        self.requestFromTCP.in_port = network.out_port
        self.responseToCore = MessageBuffer(ordered=True)
        self.responseToCore.out_port = network.in_port

        # Northbridge/directory-facing side: probes and responses in,
        # requests, responses, and unblocks out.
        self.probeFromNB = MessageBuffer()
        self.probeFromNB.in_port = network.out_port
        self.responseFromNB = MessageBuffer()
        self.responseFromNB.in_port = network.out_port
        self.requestToNB = MessageBuffer(ordered=True)
        self.requestToNB.out_port = network.in_port
        self.responseToNB = MessageBuffer()
        self.responseToNB.out_port = network.in_port
        self.unblockToNB = MessageBuffer()
        self.unblockToNB.out_port = network.in_port

        # Local trigger queue (not networked).
        self.triggerQueue = MessageBuffer(ordered=True)

View File

@@ -0,0 +1,79 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import (
MessageBuffer,
RubyCache,
TCP_Controller,
TreePLRURP,
)
class TCPCache(TCP_Controller):
    def __init__(
        self,
        tcp_size: str,
        tcp_assoc: int,
        network,
        cache_line_size,
    ):
        """TCP cache controller: the per-compute-unit L1 cache for GPU
        devices.
        """
        super().__init__()

        # Backing L1 cache for this TCP controller.
        self.L1cache = RubyCache(
            replacement_policy=TreePLRURP(),
            size=tcp_size,
            assoc=tcp_assoc,
            tagAccessLatency=1,
            dataAccessLatency=4,
            tagArrayBanks=16,
            dataArrayBanks=16,
            resourceStalls=True,
        )
        self.connectQueues(network)

    def connectQueues(self, network):
        """Create the TCP message buffers and wire them to the network."""
        # Outbound: requests, responses, and unblocks into the network.
        self.requestFromTCP = MessageBuffer(ordered=True)
        self.responseFromTCP = MessageBuffer(ordered=True)
        self.unblockFromCore = MessageBuffer()
        self.requestFromTCP.out_port = network.in_port
        self.responseFromTCP.out_port = network.in_port
        self.unblockFromCore.out_port = network.in_port

        # Inbound: probes and responses from the network.
        self.probeToTCP = MessageBuffer(ordered=True)
        self.responseToTCP = MessageBuffer(ordered=True)
        self.probeToTCP.in_port = network.out_port
        self.responseToTCP.in_port = network.out_port

        # Local mandatory queue (not networked).
        self.mandatoryQueue = MessageBuffer()

View File

@@ -0,0 +1,279 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import (
AMDGPUDevice,
SubSystem,
)
from ....components.boards.abstract_board import AbstractBoard
from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy
from .viper_shader import ViperShader
class BaseViperGPU(SubSystem):
    """Base class for VIPER GPU devices.

    Wraps an AMDGPUDevice plus a shader, assigns a unique PCI device number
    per GPU instance, and knows how to attach itself (caches, memories, PCI
    placement) to an AbstractBoard via connectGPU().
    """

    # First PCI device number used for GPUs; each GPU takes the next slot.
    _base_pci_dev = 8
    # Class-wide count of GPUs created so far (shared across instances).
    _gpu_count = 0
    # Per-instance 0-based GPU index (set in __init__).
    _my_id = 0

    @classmethod
    def next_pci_dev(cls):
        """Bump the GPU count and return the next PCI device number."""
        cls._gpu_count += 1  # Use count for this particular type
        return cls._base_pci_dev + cls._gpu_count - 1

    @classmethod
    def get_gpu_count(cls):
        """Return how many GPUs have been created so far."""
        return cls._gpu_count

    def __init__(self):
        # NOTE(review): no super().__init__() call here, unlike the other
        # viper classes — confirm SubSystem tolerates this.
        # Setup various PCI related parameters. _my_id is read before
        # next_pci_dev() increments the count, so the first GPU gets id 0.
        self._my_id = self.get_gpu_count()
        pci_dev = self.next_pci_dev()
        device = AMDGPUDevice(pci_func=0, pci_dev=pci_dev, pci_bus=0)
        self._device = device

    def set_shader(self, shader: ViperShader):
        # Shader is created by the concrete subclass (device-specific).
        self._shader = shader

    def get_cpu_dma_ports(self):
        """Return the CPU-side DMA ports that the CPU cache hierarchy must
        incorporate (see module usage notes: create the GPU first)."""
        return self._shader.get_cpu_dma_ports()

    def connectGPU(self, board: AbstractBoard) -> None:
        """Attach this GPU (caches, memories, PCI placement) to *board*."""
        # Connect a CPU pointer. This is only used for SE mode. Any CPU will
        # work, so pick assuming there is at least one
        cpus = board.get_processor()
        self._shader.set_cpu_pointer(cpus.cores[0].core)

        # Connect all PIO buses
        self._shader.connect_iobus(board.get_io_bus())

        # The System() object in gem5 has a memories parameter which defaults
        # to Self.all. This will collect *all* AbstractMemories and connect to
        # the CPU side. To avoid this we manually assign the memories param to
        # the CPU side memories. We need the MemInterface which is called dram
        # in the MemCtrl class even though it might not be modelling dram.
        memory = board.get_memory()
        cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()]
        board.memories = cpu_abs_mems

        # Make the cache hierarchy. This will create an independent RubySystem
        # class containing only the GPU caches with no network connection to
        # the CPU cache hierarchy.
        self._device.gpu_caches = ViperGPUCacheHierarchy(
            tcp_size=self._tcp_size,
            tcp_assoc=self._tcp_assoc,
            sqc_size=self._sqc_size,
            sqc_assoc=self._sqc_assoc,
            scalar_size=self._scalar_size,
            scalar_assoc=self._scalar_assoc,
            tcc_size=self._tcc_size,
            tcc_assoc=self._tcc_assoc,
            tcc_count=self._tcc_count,
            cu_per_sqc=self._cu_per_sqc,
            num_memory_channels=self._num_memory_channels,
            cache_line_size=self._cache_line_size,
            shader=self._shader,
        )

        # Collect GPU memory controllers created in the GPU cache hierarchy.
        # First assign them as a child to the device so the SimObject unproxy.
        # The device requires the memories parameter to be set as the system
        # pointer required by the AbstractMemory class is set by AMDGPUDevice.
        self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls()
        gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls]
        self._device.memories = gpu_abs_mems

        # Finally attach to the board. PciDevices default to Parent.any for the
        # PciHost parameter. To make sure this is found we need to connect to
        # board.pc or a child of board.pc. Historically we place this in the
        # south bridge.
        board.pc.south_bridge.gpu_shader = self._shader

        # This is cosmetic so the device shows as board.pc.south_bridge.gpu###
        # instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device.
        gpu_name = f"gpu{self._my_id}"
        self._device.set_parent(board.pc.south_bridge, gpu_name)
# A scaled down MI210-like device. Defaults to ~1/4th of an MI210.
class MI210(BaseViperGPU):
    """A scaled-down MI210-like GPU device.

    The default parameters model roughly one quarter of a real MI210
    (32 CUs, 8 TCCs, 8 memory channels). All cache parameters are stashed
    on ``self`` for the base class to build the GPU cache hierarchy when
    the device is connected to a board.
    """

    def __init__(
        self,
        num_cus: int = 32,
        cu_per_sqc: int = 4,
        tcp_size: str = "16KiB",
        tcp_assoc: int = 16,
        sqc_size: str = "32KiB",
        sqc_assoc: int = 8,
        scalar_size: str = "32KiB",
        scalar_assoc: int = 8,
        tcc_size: str = "256KiB",
        tcc_assoc: int = 16,
        tcc_count: int = 8,
        num_memory_channels: int = 8,
        cache_line_size: int = 64,
    ):
        super().__init__()

        # Stash cache hierarchy parameters. BaseViperGPU consumes these
        # when it builds the ViperGPUCacheHierarchy during connectGPU().
        self._cu_per_sqc = cu_per_sqc
        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._tcc_count = tcc_count
        self._num_memory_channels = num_memory_channels
        self._cache_line_size = cache_line_size

        # PCI identification. The device reports itself as an
        # MI200-family part (DeviceID 0x740F) to the amdgpu driver.
        self._device.device_name = "MI200"
        self._device.DeviceID = 0x740F
        self._device.SubsystemVendorID = 0x1002
        self._device.SubsystemID = 0x0C34

        # Setup device-specific address ranges for various SoC components.
        shader = ViperShader(
            self._my_id, num_cus, cache_line_size, self._device
        )
        self.set_shader(shader)

        # Setup the SDMA engines depending on device. The MMIO base addresses
        # can be found in the driver code under:
        # include/asic_reg/sdmaX/sdmaX_Y_Z_offset.h
        # The sizes list is derived from the bases list rather than using a
        # separately maintained count (the original hard-coded 5 twice).
        sdma_bases = [0x4980, 0x6180, 0x78000, 0x79000, 0x7A000]
        sdma_sizes = [0x1000] * len(sdma_bases)
        self._device.sdmas = shader._create_sdmas(sdma_bases, sdma_sizes)

        # Setup the Command Processor's PM4 engines.
        pm4_starts = [0xC000]
        pm4_ends = [0xD000]
        self._device.pm4_pkt_procs = shader._create_pm4s(pm4_starts, pm4_ends)

    def get_driver_command(self, debug: bool = False):
        """Return a shell snippet that stages the VBIOS and loads amdgpu.

        The snippet copies the MI200 VBIOS ROM into legacy memory via dd,
        checks that the DKMS amdgpu module exists for the running kernel,
        and modprobes the driver for a gfx90a target.

        :param debug: If True, raise the kernel console log level first.
        """
        debug_commands = "dmesg -n8\n" if debug else ""

        driver_load_command = (
            "export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH\n"
            "export HSA_ENABLE_INTERRUPT=0\n"
            "export HCC_AMDGPU_TARGET=gfx90a\n"
            f"{debug_commands}\n"
            "dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128\n"
            "if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then\n"
            ' echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."\n'
            " /sbin/m5 exit\n"
            "fi\n"
            "modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0\n"
        )

        return driver_load_command
# Defaults to a single "XCD" (i.e., 1/8th of a full MI300X).
class MI300X(BaseViperGPU):
    """An MI300X-like GPU device scaled down to a single XCD.

    The default parameters model one accelerator complex die (1/8th of a
    full MI300X). All cache parameters are stashed on ``self`` for the
    base class to build the GPU cache hierarchy when the device is
    connected to a board.
    """

    def __init__(
        self,
        num_cus: int = 40,
        cu_per_sqc: int = 4,
        tcp_size: str = "16KiB",
        tcp_assoc: int = 16,
        sqc_size: str = "32KiB",
        sqc_assoc: int = 8,
        scalar_size: str = "32KiB",
        scalar_assoc: int = 8,
        tcc_size: str = "256KiB",
        tcc_assoc: int = 16,
        tcc_count: int = 16,
        num_memory_channels: int = 16,
        cache_line_size: int = 64,
    ):
        super().__init__()

        # Stash cache hierarchy parameters. BaseViperGPU consumes these
        # when it builds the ViperGPUCacheHierarchy during connectGPU().
        self._cu_per_sqc = cu_per_sqc
        self._tcp_size = tcp_size
        self._tcp_assoc = tcp_assoc
        self._sqc_size = sqc_size
        self._sqc_assoc = sqc_assoc
        self._scalar_size = scalar_size
        self._scalar_assoc = scalar_assoc
        self._tcc_size = tcc_size
        self._tcc_assoc = tcc_assoc
        self._tcc_count = tcc_count
        self._num_memory_channels = num_memory_channels
        self._cache_line_size = cache_line_size

        # PCI identification. NOTE(review): DeviceID 0x740F is the
        # MI200-family ID; the MI300X is distinguished via device_name.
        self._device.device_name = "MI300X"
        self._device.DeviceID = 0x740F
        self._device.SubsystemVendorID = 0x1002
        self._device.SubsystemID = 0x0C34

        # Setup device-specific address ranges for various SoC components.
        shader = ViperShader(
            self._my_id, num_cus, cache_line_size, self._device
        )
        self.set_shader(shader)

        # These currently use MI200 values until the MI300X bios is released.
        # The sizes list is derived from the bases list rather than using a
        # separately maintained count (the original hard-coded 5 twice).
        sdma_bases = [0x4980, 0x6180, 0x78000, 0x79000, 0x7A000]
        sdma_sizes = [0x1000] * len(sdma_bases)
        self._device.sdmas = shader._create_sdmas(sdma_bases, sdma_sizes)

        # Setup the Command Processor's PM4 engines.
        pm4_starts = [0xC000]
        pm4_ends = [0xD000]
        self._device.pm4_pkt_procs = shader._create_pm4s(pm4_starts, pm4_ends)

    def get_driver_command(self, debug: bool = False):
        """Return a shell snippet that stages the VBIOS and loads amdgpu.

        Same sequence as the MI210 variant, but targets gfx942 and
        overrides the reported GFX version (still uses the MI200 VBIOS
        until an MI300X one is available).

        :param debug: If True, raise the kernel console log level first.
        """
        debug_commands = "dmesg -n8\n" if debug else ""

        driver_load_command = (
            "export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH\n"
            "export HSA_ENABLE_INTERRUPT=0\n"
            "export HCC_AMDGPU_TARGET=gfx942\n"
            'export HSA_OVERRIDE_GFX_VERSION="9.4.2"\n'
            f"{debug_commands}\n"
            "dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128\n"
            "if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then\n"
            ' echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."\n'
            " /sbin/m5 exit\n"
            "fi\n"
            "modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0\n"
        )

        return driver_load_command

View File

@@ -0,0 +1,377 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from typing import List
from m5.objects import (
AddrRange,
AMDGPUDevice,
AMDGPUInterruptHandler,
AMDGPUMemoryManager,
AMDGPUSystemHub,
BaseCPU,
BaseXBar,
ComputeUnit,
DynPoolManager,
GPUCommandProcessor,
GPUDispatcher,
HSAPacketProcessor,
LdsState,
PciLegacyIoBar,
PM4PacketProcessor,
RegisterFileCache,
RegisterManager,
ScalarRegisterFile,
SDMAEngine,
Shader,
VectorRegisterFile,
VegaGPUTLB,
VegaPagetableWalker,
VegaTLBCoalescer,
Wavefront,
)
class ViperCU(ComputeUnit):
    def __init__(self, cu_id: int, device: AMDGPUDevice):
        """ComputeUnit object of a gfx9-like compute unit.

        :param cu_id: Index of this CU within its shader.
        :param device: GPU device handle used by the CU's TLBs.
        """
        super().__init__()

        self._device = device
        self.cu_id = cu_id

        # Used in multiple places. Define variables to change once.
        self._vreg_file_size = 2048
        self._sreg_file_size = 2048

        # Latencies, etc. use defaults in src/gpu-compute/GPU.py.
        self.num_SIMDs = 4
        self.n_wf = 8
        self.localDataStore = LdsState(
            size=65536,
        )

        # One wavefront object per (SIMD, wavefront-slot) pair.
        self.wavefronts = [
            Wavefront(simdId=j, wf_slot_id=k)
            for j in range(self.num_SIMDs)
            for k in range(self.n_wf)
        ]

        # Per-SIMD register files and register-file caches.
        self.vector_register_file = [
            VectorRegisterFile(simd_id=i, num_regs=self._vreg_file_size)
            for i in range(self.num_SIMDs)
        ]
        self.scalar_register_file = [
            ScalarRegisterFile(simd_id=i, num_regs=self._sreg_file_size)
            for i in range(self.num_SIMDs)
        ]
        self.register_file_cache = [
            RegisterFileCache(simd_id=i) for i in range(self.num_SIMDs)
        ]
        self.register_manager = RegisterManager(
            policy="static",
            vrf_pool_managers=[
                DynPoolManager(pool_size=self._vreg_file_size, min_alloc=4)
                for _ in range(self.num_SIMDs)
            ],
            srf_pool_managers=[
                DynPoolManager(pool_size=self._sreg_file_size, min_alloc=4)
                for _ in range(self.num_SIMDs)
            ],
        )

        # Connect the CU to its local data store through the LDS bus.
        self.ldsPort = self.ldsBus.cpu_side_port
        self.ldsBus.mem_side_port = self.localDataStore.cuPort

        self._create_tlbs()

    def _make_l1_tlb(self):
        """Build one 64-entry L1 TLB with its coalescer already connected.

        The vector, scalar, and instruction L1 TLBs all use exactly the
        same configuration, so the construction is factored out here.
        Returns the (tlb, coalescer) pair.
        """
        tlb = VegaGPUTLB(
            gpu_device=self._device,
            size=64,
            assoc=64,
            hitLatency=1,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )
        coalescer = VegaTLBCoalescer(tlb_level=1)
        coalescer.mem_side_ports = tlb.cpu_side_ports
        return tlb, coalescer

    def _create_tlbs(self):
        """Create the vector, scalar, and instruction L1 TLBs."""
        # Vector memory TLB
        self.l1_tlb, self.l1_coalescer = self._make_l1_tlb()
        self.translation_port = self.l1_coalescer.cpu_side_ports

        # Scalar memory TLB
        self.scalar_tlb, self.scalar_coalescer = self._make_l1_tlb()
        self.scalar_tlb_port = self.scalar_coalescer.cpu_side_ports

        # Instruction memory TLB
        self.sqc_tlb, self.sqc_coalescer = self._make_l1_tlb()
        self.sqc_tlb_port = self.sqc_coalescer.cpu_side_ports

    def get_tlb_ports(self):
        """Return the TLB mem-side ports to connect to the shader's L2."""
        return [
            self.l1_tlb.mem_side_ports,
            self.sqc_tlb.mem_side_ports,
            self.scalar_tlb.mem_side_ports,
        ]
class ViperShader(Shader):
    """A single software-visible GPU built from ViperCU compute units.

    Owns the CUs, the GPU TLB hierarchy, the command processor, and the
    system hub, and configures the PCI device it is attached to. DMA ports
    are collected into two lists: ports appended to ``_cpu_dma_ports``
    are later connected to host memory, while ports appended to
    ``_gpu_dma_ports`` are connected to device-local memory.
    """

    def __init__(
        self,
        shader_id: int,
        num_cus: int,
        cache_line_size: int,
        device: AMDGPUDevice,
    ):
        """
        The shader defines something that represents a single software
        visible GPU (e.g., a graphics card, a chiplet on a GPU, etc.).

        :param shader_id: Index of this GPU. Non-zero IDs shift the VBIOS
            ROM and legacy IO BAR so multiple GPUs do not overlap.
        :param num_cus: Number of compute units to create.
        :param cache_line_size: Cache line size in bytes, forwarded to the
            GPU memory manager.
        :param device: The AMDGPUDevice (PCI device) this shader drives.
        """
        super().__init__()

        self._shader_id = shader_id
        self._cache_line_size = cache_line_size
        self._device = device

        self.n_wf = 8
        self.timing = True

        # Used to track the (many, many) DMA ports. See the class
        # docstring for the meaning of the two lists.
        self._cpu_dma_ports = []
        self._gpu_dma_ports = []

        # VIPER GPU protocol implements release consistency at GPU side. So,
        # we make their writes visible to the global memory and should read
        # from global memory during kernel boundary. The pipeline initiates
        # (or does not initiate) the acquire/release operation depending on
        # these impl_kern_launch_rel and impl_kern_end_rel flags. flag=true
        # means the pipeline initiates an acquire/release operation at kernel
        # launch/end. VIPER protocol is write-through based, and thus only
        # impl_kern_launch_acq needs to be set.
        self.impl_kern_launch_acq = True
        self.impl_kern_end_rel = False

        # Attach compute units to GPU
        self.CUs = [ViperCU(idx, device) for idx in range(num_cus)]

        self._create_tlbs(device)

        # This arbitrary address is something in the X86 I/O hole
        hsapp_gpu_map_paddr = 0xE00000000
        self.dispatcher = GPUDispatcher()
        self.gpu_cmd_proc = GPUCommandProcessor(
            hsapp=HSAPacketProcessor(
                pioAddr=hsapp_gpu_map_paddr,
                numHWQueues=10,
                walker=VegaPagetableWalker(),
            ),
            dispatcher=self.dispatcher,
            walker=VegaPagetableWalker(),
        )

        # The HSA packet processor and command processor DMA host memory;
        # their page table walker ports go to the GPU-side list.
        self._cpu_dma_ports.append(self.gpu_cmd_proc.hsapp.dma)
        self._cpu_dma_ports.append(self.gpu_cmd_proc.dma)
        self._gpu_dma_ports.append(self.gpu_cmd_proc.hsapp.walker.port)
        self._gpu_dma_ports.append(self.gpu_cmd_proc.walker.port)

        self.system_hub = AMDGPUSystemHub()
        self._cpu_dma_ports.append(self.system_hub.dma)

        self._setup_device(device)

    def get_compute_units(self):
        """Return the list of compute units in this shader."""
        return self.CUs

    def _setup_device(self, device: AMDGPUDevice):
        """Set the device type info on the device connected via PCI."""
        device.cp = self.gpu_cmd_proc
        device.device_ih = AMDGPUInterruptHandler()
        self._cpu_dma_ports.append(device.device_ih.dma)

        # GPU data path
        device.memory_manager = AMDGPUMemoryManager(
            cache_line_size=self._cache_line_size,
        )
        self._gpu_dma_ports.append(device.memory_manager.port)
        self._cpu_dma_ports.append(device.dma)

        # Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux
        # there is a PCI capabilities list to traverse.
        device.Status = 0x0290

        # The PCI capabilities are like a linked list. The list has a memory
        # offset and a capability type ID read by the OS. Make the first
        # capability at 0x80 and set the PXCAP (PCI express) capability to
        # that address. Mark the type ID as PCI express.
        # We leave the next ID of PXCAP blank to end the list.
        device.PXCAPBaseOffset = 0x80
        device.CapabilityPtr = 0x80
        device.PXCAPCapId = 0x10

        # Set bits 7 and 8 in the second PCIe device capabilities register
        # which reports support for PCIe atomics for 32 and 64 bits
        # respectively. Bit 9 for 128-bit compare and swap is not set because
        # the amdgpu driver does not check this.
        device.PXCAPDevCap2 = 0x00000180

        # Set bit 6 to enable atomic requestor, meaning this device can
        # request atomics from other PCI devices.
        device.PXCAPDevCtrl2 = 0x00000040

        # If there are multiple GPUs in the system, make sure the VBIOS
        # region and the legacy IO bar do not overlap with the ranges from
        # other GPUs.
        if self._shader_id != 0:
            device.ExpansionROM = 0xD0000000 + (0x20000 * self._shader_id)
            bar4_addr = 0xF000 + (0x100 * self._shader_id)
            device.BAR4 = PciLegacyIoBar(addr=bar4_addr, size="256B")

    def _create_pm4s(self, pm4_starts: List[int], pm4_ends: List[int]):
        """Create PM4 packet processors.

        :param pm4_starts: MMIO range start address of each processor.
        :param pm4_ends: MMIO range end address of each processor.
        """
        num_pm4s = len(pm4_starts)
        pm4_procs = [
            PM4PacketProcessor(
                ip_id=i,
                mmio_range=AddrRange(start=pm4_starts[i], end=pm4_ends[i]),
            )
            for i in range(num_pm4s)
        ]

        # PM4 processors DMA their packets from queues in host memory.
        for pm4_proc in pm4_procs:
            self._cpu_dma_ports.append(pm4_proc.dma)

        return pm4_procs

    def _create_sdmas(self, sdma_bases: List[int], sdma_sizes: List[int]):
        """Create the SDMA engines.

        :param sdma_bases: MMIO base address of each SDMA engine.
        :param sdma_sizes: MMIO aperture size of each SDMA engine.
        """
        num_sdmas = len(sdma_bases)
        sdmas = [
            SDMAEngine(
                walker=VegaPagetableWalker(),
                mmio_base=sdma_bases[i],
                mmio_size=sdma_sizes[i],
            )
            for i in range(num_sdmas)
        ]

        # SDMA engines DMA host memory; their walker ports go to the
        # GPU-side list.
        for sdma in sdmas:
            self._cpu_dma_ports.append(sdma.dma)
            self._gpu_dma_ports.append(sdma.walker.port)

        return sdmas

    def get_cpu_dma_ports(self):
        """Return DMA ports that must be connected to host memory."""
        return self._cpu_dma_ports

    def get_gpu_dma_ports(self):
        """Return DMA ports that must be connected to GPU memory."""
        return self._gpu_dma_ports

    def _create_tlbs(self, device: AMDGPUDevice):
        """Connect per-CU TLBs to the L2/L3 TLBs"""
        self.l2_tlb = VegaGPUTLB(
            gpu_device=device,
            size=4096,
            assoc=64,
            hitLatency=69,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )
        self.l2_coalescer = VegaTLBCoalescer(tlb_level=2)

        self.l3_tlb = VegaGPUTLB(
            gpu_device=device,
            size=8192,
            assoc=64,
            hitLatency=150,
            missLatency1=750,
            missLatency2=750,
            maxOutstandingReqs=64,
        )
        self.l3_coalescer = VegaTLBCoalescer(tlb_level=3)

        # Port flow: [L1s] -> L2 coalescer -> L2 tlb -> L3 coalescer -> L3 tlb
        for cu in self.CUs:
            for port in cu.get_tlb_ports():
                self.l2_coalescer.cpu_side_ports = port
        self.l2_coalescer.mem_side_ports = self.l2_tlb.cpu_side_ports
        self.l2_tlb.mem_side_ports = self.l3_coalescer.cpu_side_ports
        self.l3_coalescer.mem_side_ports = self.l3_tlb.cpu_side_ports

        # The last-level TLB's walker port goes to the GPU-side DMA list.
        self._gpu_dma_ports.append(self.l3_tlb.walker.port)

    def connect_iobus(self, iobus: BaseXBar):
        """Connect the GPU objects to the IO bus."""
        self.gpu_cmd_proc.pio = iobus.mem_side_ports
        self.gpu_cmd_proc.hsapp.pio = iobus.mem_side_ports
        self.system_hub.pio = iobus.mem_side_ports
        self._device.pio = iobus.mem_side_ports
        self._device.device_ih.pio = iobus.mem_side_ports

        for sdma in self._device.sdmas:
            sdma.pio = iobus.mem_side_ports

        for pm4_proc in self._device.pm4_pkt_procs:
            pm4_proc.pio = iobus.mem_side_ports

    def set_cpu_pointer(self, cpu: BaseCPU):
        """Set the CPU pointer for the Shader."""
        self.cpu_pointer = cpu

View File

@@ -0,0 +1,123 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import base64
import os
from typing import (
List,
Optional,
)
from ...components.boards.abstract_board import AbstractBoard
from ...components.boards.kernel_disk_workload import KernelDiskWorkload
from ...components.boards.x86_board import X86Board
from ...components.cachehierarchies.abstract_cache_hierarchy import (
AbstractCacheHierarchy,
)
from ...components.devices.gpus.amdgpu import BaseViperGPU
from ...components.memory.abstract_memory_system import AbstractMemorySystem
from ...components.processors.abstract_processor import AbstractProcessor
from ...utils.override import overrides
class ViperBoard(X86Board):
    """
    A derivative of X86Board capable of full system simulation for X86 with a
    GPU device. Provides all the functionality of the X86Board with helper
    methods specific to booting a disk with GPU libraries installed.
    """

    def __init__(
        self,
        clk_freq: str,
        processor: AbstractProcessor,
        memory: AbstractMemorySystem,
        cache_hierarchy: AbstractCacheHierarchy,
        gpus: Optional[List[BaseViperGPU]] = None,
    ) -> None:
        """
        :param gpus: Optional list of GPU devices that will be connected to
            this board during _connect_things().
        """
        super().__init__(
            clk_freq=clk_freq,
            processor=processor,
            memory=memory,
            cache_hierarchy=cache_hierarchy,
        )
        # Note: leftover debug print statements were removed from __init__
        # and _connect_things.
        self._gpus = gpus

    def get_devices(self):
        """Return the list of GPUs attached to this board (or None)."""
        return self._gpus

    @overrides(AbstractBoard)
    def _connect_things(self) -> None:
        super()._connect_things()

        # Connect GPUs only after the base board has created the IO bus
        # and memory system they attach to.
        if self._gpus is not None:
            for gpu in self._gpus:
                gpu.connectGPU(self)

    @overrides(KernelDiskWorkload)
    def get_disk_device(self):
        """Return the root disk device node used by the kernel."""
        return "/dev/sda"

    @overrides(KernelDiskWorkload)
    def get_default_kernel_args(self) -> List[str]:
        # The regular parameters used with gem5 plus (1) fbdev_emulation=0
        # to disable having to implement this functionality, (2) blacklist
        # amdgpu because we need to copy the VBIOS into memory first, and (3)
        # blacklist psmouse as amdgpu driver adds new mouse commands which
        # gem5 does not implement and they do not seem to be documented.
        return [
            "earlyprintk=ttyS0",
            "console=ttyS0",
            "lpj=7999923",
            "root={root_value}",
            "drm_kms_helper.fbdev_emulation=0",
            "modprobe.blacklist=amdgpu",
            "modprobe.blacklist=psmouse",
        ]

    # Replicate the capability of the old GPUFS config, which embeds a binary
    # application or script into a bash script setting up the environment and
    # loading the GPU driver.
    def make_gpu_app(self, gpu: BaseViperGPU, app: str, debug: bool = False):
        """Return a shell script that loads the GPU driver, then runs *app*.

        The application binary is base64-encoded directly into the script
        so no disk-image rebuild is needed to change the workload.

        :param gpu: GPU whose driver-load command prefixes the script.
        :param app: Path to the application binary to embed.
        :param debug: Forwarded to the GPU's get_driver_command().
        """
        driver_load_command = gpu.get_driver_command(debug=debug)

        with open(os.path.abspath(app), "rb") as binfile:
            encoded_bin = base64.b64encode(binfile.read()).decode()

        # NOTE(review): the literal "{}" below is emitted verbatim into the
        # generated script (this string is not an f-string). It looks like a
        # leftover placeholder for application arguments -- confirm intent.
        application_command = (
            f'echo "{encoded_bin}" | base64 -d > myapp\n'
            "chmod +x myapp\n"
            "./myapp {}\n"
            "/sbin/m5 exit\n"
        )
        return driver_load_command + application_command

View File

@@ -0,0 +1,273 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import math
from m5.objects import (
DMASequencer,
RubyCache,
RubyPortProxy,
RubySequencer,
RubySystem,
SimpleMemory,
TreePLRURP,
)
from ...coherence_protocol import CoherenceProtocol
from ...components.cachehierarchies.abstract_cache_hierarchy import (
AbstractCacheHierarchy,
)
from ...components.cachehierarchies.ruby.abstract_ruby_cache_hierarchy import (
AbstractRubyCacheHierarchy,
)
from ...components.cachehierarchies.ruby.caches.viper.corepair_cache import (
CorePairCache,
)
from ...components.cachehierarchies.ruby.caches.viper.directory import (
ViperCPUDirectory,
)
from ...components.cachehierarchies.ruby.caches.viper.dma_controller import (
ViperCPUDMAController,
)
from ...prebuilt.viper.board import ViperBoard
from ...utils.override import overrides
from ...utils.requires import requires
from .viper_network import SimplePt2Pt
class ViperCPUCacheHierarchy(AbstractRubyCacheHierarchy):
    """
    The VIPER CPU cache hierarchy creates CPU-side Ruby caches and connects
    the nodes using a simple point-to-point topology.
    """

    def __init__(
        self,
        l1d_size: str,
        l1d_assoc: int,
        l1i_size: str,
        l1i_assoc: int,
        l2_size: str,
        l2_assoc: int,
        l3_size: str,
        l3_assoc: int,
    ):
        """
        :param l1d_size: Size of each L1 data cache.
        :param l1d_assoc: Associativity of each L1 data cache.
        :param l1i_size: Size of each L1 instruction cache.
        :param l1i_assoc: Associativity of each L1 instruction cache.
        :param l2_size: Size of each per-core-pair L2 cache.
        :param l2_assoc: Associativity of each L2 cache.
        :param l3_size: Size of the L3 cache held at each directory.
        :param l3_assoc: Associativity of each L3 cache.
        """
        super().__init__()

        self._l1d_size = l1d_size
        self._l1d_assoc = l1d_assoc
        self._l1i_size = l1i_size
        self._l1i_assoc = l1i_assoc
        self._l2_size = l2_size
        self._l2_assoc = l2_assoc
        self._l3_size = l3_size
        self._l3_assoc = l3_assoc

        self.ruby_system = RubySystem()

    @overrides(AbstractCacheHierarchy)
    def incorporate_cache(self, board: ViperBoard) -> None:
        """Build the CPU-side Ruby caches and attach them to *board*.

        Creates one CorePair controller per two cores, one directory per
        memory port, and one DMA controller per DMA port (board DMAs and
        GPU-device CPU-side DMAs alike), then wires everything with a
        point-to-point network.
        """
        requires(coherence_protocol_required=CoherenceProtocol.GPU_VIPER)

        # Ruby networks for CPU
        self.ruby_system.network = SimplePt2Pt(self.ruby_system)

        # MOESI_AMD_Base uses 5 virtual networks.
        self.ruby_system.number_of_virtual_networks = 5
        self.ruby_system.network.number_of_virtual_networks = 5

        # There is a single local list of all of the controllers to make it
        # easier to connect everything to the CPU network. This can be
        # customized depending on the topology/network requirements.
        # Create one controller for each L1 cache (and the cache mem obj.)
        # Create a single directory controller (Really the memory cntrl).
        self._controllers = []
        cores = board.get_processor().get_cores()
        num_cores = len(cores)

        # Cores are grouped two to a CorePair cache.
        for i in range(0, num_cores, 2):
            cache = CorePairCache(
                l1d_size=self._l1d_size,
                l1d_assoc=self._l1d_assoc,
                l1i_size=self._l1i_size,
                l1i_assoc=self._l1i_assoc,
                l2_size=self._l2_size,
                l2_assoc=self._l2_assoc,
                network=self.ruby_system.network,
                cache_line_size=board.get_cache_line_size(),
                core=cores[i],
            )
            cache.version = i // 2
            cache.ruby_system = self.ruby_system
            cache.clk_domain = board.get_clock_domain()

            # One sequencer per core in the pair.
            cache.sequencer = RubySequencer(
                version=i,
                dcache=cache.L1D0cache,
                ruby_system=self.ruby_system,
                coreid=0,
                is_cpu_sequencer=True,
                clk_domain=board.get_clock_domain(),
            )
            cache.sequencer1 = RubySequencer(
                version=i + 1,
                dcache=cache.L1D1cache,
                ruby_system=self.ruby_system,
                coreid=1,
                is_cpu_sequencer=True,
                clk_domain=board.get_clock_domain(),
            )

            cache.sequencer.connectIOPorts(board.get_io_bus())
            cache.sequencer1.connectIOPorts(board.get_io_bus())

            cores[i].connect_icache(cache.sequencer.in_ports)
            cores[i].connect_dcache(cache.sequencer.in_ports)
            cores[i].connect_walker_ports(
                cache.sequencer.in_ports, cache.sequencer.in_ports
            )

            # Connect the interrupt ports
            int_req_port = cache.sequencer.interrupt_out_port
            int_resp_port = cache.sequencer.in_ports
            cores[i].connect_interrupt(int_req_port, int_resp_port)

            # With an odd core count, the final pair is half-populated.
            if i + 1 < num_cores:
                cores[i + 1].connect_icache(cache.sequencer1.in_ports)
                cores[i + 1].connect_dcache(cache.sequencer1.in_ports)
                cores[i + 1].connect_walker_ports(
                    cache.sequencer.in_ports, cache.sequencer1.in_ports
                )

                # Connect the interrupt ports
                cores[i + 1].connect_interrupt(int_req_port, int_resp_port)

            self._controllers.append(cache)

        # Create the CPU directory controllers
        self._directory_controllers = []

        # Automatically determine the numa bit. This can be changed to
        # increase the number of bytes to each memory channel before
        # going to the next channels.
        dir_bits = int(math.log(len(board.get_mem_ports()), 2))
        # Bug fix: math.log() defaults to base e; the block size bit count
        # must be log base 2 of the cache line size.
        block_size_bits = int(math.log(board.get_cache_line_size(), 2))
        # NOTE(review): dir_bits and block_size_bits are computed but never
        # used below -- presumably intended for directory address-range
        # interleaving. Confirm and either wire them up or remove them.

        for addr_range, port in board.get_mem_ports():
            dir = ViperCPUDirectory(
                self.ruby_system.network,
                board.get_cache_line_size(),
                addr_range,
                port,
            )
            dir.ruby_system = self.ruby_system
            dir.version = len(self._directory_controllers)
            self._directory_controllers.append(dir)
            dir.L3CacheMemory = RubyCache(
                size=self._l3_size,
                assoc=self._l3_assoc,
                replacement_policy=TreePLRURP(),
                resourceStalls=False,
                dataArrayBanks=16,
                tagArrayBanks=16,
                dataAccessLatency=20,
                tagAccessLatency=15,
            )

        # Create the DMA Controllers, if required.
        self._dma_controllers = []
        if board.has_dma_ports():
            dma_ports = board.get_dma_ports()
            for i, port in enumerate(dma_ports):
                ctrl = ViperCPUDMAController(
                    self.ruby_system.network, board.get_cache_line_size()
                )
                ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)
                ctrl.ruby_system = self.ruby_system
                ctrl.dma_sequencer.ruby_system = self.ruby_system
                self._dma_controllers.append(ctrl)

        # Create DMA Controllers required for any devices in the system.
        device_dmas = []
        if board.get_devices() is not None:
            for device in board.get_devices():
                device_dmas += device.get_cpu_dma_ports()

        # DMA sequencer version numbering continues after the board DMAs.
        for port in device_dmas:
            ctrl = ViperCPUDMAController(
                self.ruby_system.network, board.get_cache_line_size()
            )
            ctrl.dma_sequencer = DMASequencer(
                version=len(self._dma_controllers), in_ports=port
            )
            ctrl.ruby_system = self.ruby_system
            ctrl.dma_sequencer.ruby_system = self.ruby_system
            self._dma_controllers.append(ctrl)

        # Number of sequencers = two per core pair (one per core) plus one
        # per DMA controller.
        self.ruby_system.num_of_sequencers = len(self._controllers) * 2 + len(
            self._dma_controllers
        )

        # Assign the controllers to their parent objects.
        self.ruby_system.controllers = self._controllers
        self.ruby_system.directory_controllers = self._directory_controllers
        if len(self._dma_controllers) != 0:
            self.ruby_system.dma_controllers = self._dma_controllers

        # Connect the controllers using the network topology
        self.ruby_system.network.connect(
            self._controllers
            + self._directory_controllers
            + self._dma_controllers
        )
        self.ruby_system.network.setup_buffers()

        # Set up a proxy port for the system_port. Used for load binaries and
        # other functional-only things.
        self.ruby_system.sys_port_proxy = RubyPortProxy(
            ruby_system=self.ruby_system
        )
        board.connect_system_port(self.ruby_system.sys_port_proxy.in_ports)

View File

@@ -0,0 +1,351 @@
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import math
from m5.objects import (
AddrRange,
DMASequencer,
HBM_2000_4H_1x64,
MemCtrl,
RubyCache,
RubySequencer,
RubySystem,
SrcClockDomain,
TreePLRURP,
VIPERCoalescer,
VoltageDomain,
)
from ...coherence_protocol import CoherenceProtocol
from ...components.cachehierarchies.ruby.abstract_ruby_cache_hierarchy import (
AbstractRubyCacheHierarchy,
)
from ...components.cachehierarchies.ruby.caches.viper.directory import (
ViperGPUDirectory,
)
from ...components.cachehierarchies.ruby.caches.viper.dma_controller import (
ViperGPUDMAController,
)
from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache
from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache
from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache
from ...components.devices.gpus.viper_shader import ViperShader
from ...utils.requires import requires
from .viper_network import (
SimpleDoubleCrossbar,
SimplePt2Pt,
)
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
    # Global sequencer-version counter shared by every instance of this
    # hierarchy in the process (class attribute, not per-instance).
    _seqs = 0

    @classmethod
    def seqCount(cls):
        # Use SeqCount not class since we need global count: every
        # RubySequencer across all GPU hierarchies must get a unique,
        # monotonically increasing version number.
        cls._seqs += 1
        return cls._seqs - 1
def __init__(
self,
tcp_size: str,
tcp_assoc: int,
sqc_size: str,
sqc_assoc: int,
scalar_size: str,
scalar_assoc: int,
tcc_size: str,
tcc_assoc: int,
tcc_count: int,
cu_per_sqc: int,
num_memory_channels: int,
cache_line_size: int,
shader: ViperShader,
):
"""
:param size: The size of each cache in the heirarchy.
:param assoc: The associativity of each cache.
"""
super().__init__()
self._tcp_size = tcp_size
self._tcp_assoc = tcp_assoc
self._sqc_size = sqc_size
self._sqc_assoc = sqc_assoc
self._scalar_size = scalar_size
self._scalar_assoc = scalar_assoc
self._tcc_size = tcc_size
self._tcc_assoc = tcc_assoc
self._cache_line_size = cache_line_size
# We have everything we need to know to create the GPU cache hierarchy
# immediately. Therefore, an incorporate_cache method is not part of
# this cache hierarchy. Go ahead and incorporate everything now.
requires(coherence_protocol_required=CoherenceProtocol.GPU_VIPER)
self.ruby_gpu = RubySystem()
self.ruby_gpu.block_size_bytes = cache_line_size
# Ruby network for this GPU
self.ruby_gpu.network = SimpleDoubleCrossbar(self.ruby_gpu)
# VIPER uses 6 virtual networks.
self.ruby_gpu.number_of_virtual_networks = 6
self.ruby_gpu.network.number_of_virtual_networks = 6
# There is a single local list of all of the controllers to make it
# easier to connect everything to the GPU network. This can be
# customized depending on the topology/network requirements.
self._controllers = []
self._directory_controllers = []
self._dma_controllers = []
self._mem_ctrls = []
self.clk_domain = SrcClockDomain(
clock="1801MHz",
voltage_domain=VoltageDomain(),
)
# Variables used by multiple objects are defined once here
tcc_bits = int(math.log(tcc_count, 2))
deadlock_threshold = 500000
# Create one TCP per CU
compute_units = shader.get_compute_units()
for idx, cu in enumerate(compute_units):
tcp = TCPCache(
tcp_size=self._tcp_size,
tcp_assoc=self._tcp_assoc,
network=self.ruby_gpu.network,
cache_line_size=self._cache_line_size,
)
tcp.version = idx
tcp.sequencer = RubySequencer(
version=self.seqCount(),
dcache=tcp.L1cache,
ruby_system=self.ruby_gpu,
is_cpu_sequencer=True,
)
tcp.coalescer = VIPERCoalescer(
version=self.seqCount(),
icache=tcp.L1cache,
dcache=tcp.L1cache,
ruby_system=self.ruby_gpu,
support_inst_reqs=False,
is_cpu_sequencer=False,
deadlock_threshold=deadlock_threshold,
max_coalesces_per_cycle=1,
gmTokenPort=cu.gmTokenPort,
)
for port_idx in range(cu.wf_size):
cu.memory_port[port_idx] = tcp.coalescer.in_ports
tcp.ruby_system = self.ruby_gpu
tcp.TCC_select_num_bits = tcc_bits
tcp.use_seq_not_coal = False
tcp.issue_latency = 1
tcp.clk_domain = self.clk_domain
tcp.recycle_latency = 10
tcp.WB = False
tcp.disableL1 = False
self._controllers.append(tcp)
# This check ensures there are a same number of CUs with shared SQC
# and Scalar caches.
num_cus = len(shader.get_compute_units())
assert (num_cus % cu_per_sqc) == 0
num_sqcs = num_cus // cu_per_sqc
for idx in range(num_sqcs):
sqc = SQCCache(
sqc_size=self._sqc_size,
sqc_assoc=self._sqc_assoc,
network=self.ruby_gpu.network,
cache_line_size=self._cache_line_size,
)
sqc.version = idx
sqc.sequencer = RubySequencer(
version=self.seqCount(),
dcache=sqc.L1cache,
ruby_system=self.ruby_gpu,
support_data_reqs=False,
is_cpu_sequencer=False,
deadlock_threshold=deadlock_threshold,
)
# SQC is shared across {cu_per_sqc} CUs.
cu_base = cu_per_sqc * idx
for cu_num in range(cu_per_sqc):
cu_id = cu_base + cu_num
compute_units[cu_id].sqc_port = sqc.sequencer.in_ports
sqc.ruby_system = self.ruby_gpu
sqc.TCC_select_num_bits = tcc_bits
sqc.clk_domain = self.clk_domain
sqc.recycle_latency = 10
self._controllers.append(sqc)
num_scalars = num_sqcs
for idx in range(num_scalars):
scalar = SQCCache(
sqc_size=self._scalar_size,
sqc_assoc=self._scalar_assoc,
network=self.ruby_gpu.network,
cache_line_size=self._cache_line_size,
)
# Scalar uses same controller as SQC, so add SQC count
scalar.version = idx + num_sqcs
scalar.sequencer = RubySequencer(
version=self.seqCount(),
dcache=scalar.L1cache,
ruby_system=self.ruby_gpu,
support_data_reqs=False,
is_cpu_sequencer=False,
deadlock_threshold=deadlock_threshold,
)
# Scalar cache is shared across {cu_per_sqc} CUs.
cu_base = cu_per_sqc * idx
for cu_num in range(cu_per_sqc):
cu_id = cu_base + cu_num
compute_units[cu_id].scalar_port = scalar.sequencer.in_ports
scalar.ruby_system = self.ruby_gpu
scalar.TCC_select_num_bits = tcc_bits
scalar.clk_domain = self.clk_domain
scalar.recycle_latency = 10
self._controllers.append(scalar)
# Create TCCs (GPU L2 cache)
for idx in range(tcc_count):
tcc = TCCCache(
tcc_size=self._tcc_size,
tcc_assoc=self._tcc_assoc,
network=self.ruby_gpu.network,
cache_line_size=self._cache_line_size,
)
tcc.version = idx
tcc.ruby_system = self.ruby_gpu
tcc.WB = False
tcc.clk_domain = self.clk_domain
tcc.recycle_latency = 10
self._controllers.append(tcc)
# Create DMA controllers
for i, port in enumerate(shader.get_gpu_dma_ports()):
ctrl = ViperGPUDMAController(
self.ruby_gpu.network, self._cache_line_size
)
ctrl.dma_sequencer = DMASequencer(version=i, in_ports=port)
ctrl.ruby_system = self.ruby_gpu
ctrl.dma_sequencer.ruby_system = self.ruby_gpu
self._dma_controllers.append(ctrl)
# Create GPU memories. Currently fixed to HBM2.
mem_type_cls = HBM_2000_4H_1x64
# AMDGPUDevice currently tells the driver there is 16GiB for memory.
# Until that is a parameter, this need to be fixed to 16GiB.
gpu_mem_range = AddrRange(0, size="16GiB")
intlv_low_bit = int(math.log(self._cache_line_size, 2))
intlv_bits = int(math.log(num_memory_channels, 2))
for idx in range(num_memory_channels):
addr_range = AddrRange(
gpu_mem_range.start,
size=gpu_mem_range.size(),
intlvHighBit=intlv_low_bit + intlv_bits - 1,
intlvBits=intlv_bits,
intlvMatch=idx,
xorHighBit=0,
)
mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range))
self._mem_ctrls.append(mem_ctrl)
dir = ViperGPUDirectory(
self.ruby_gpu.network,
self._cache_line_size,
addr_range,
self._mem_ctrls[idx].port,
)
dir.ruby_system = self.ruby_gpu
dir.TCC_select_num_bits = tcc_bits
dir.version = len(self._directory_controllers)
self._directory_controllers.append(dir)
dir.L3CacheMemory = RubyCache(
size="16MiB",
assoc=16,
atomicALUs=64,
replacement_policy=TreePLRURP(),
resourceStalls=False,
dataArrayBanks=16,
tagArrayBanks=16,
dataAccessLatency=20,
tagAccessLatency=15,
)
# Number of sequencers = one per TCP, SQC, and Scalar + one per DMA.
self.ruby_gpu.num_of_sequencers = len(self._controllers) + len(
self._dma_controllers
)
# Assign the controllers to their parent objects.
self.ruby_gpu.controllers = self._controllers
self.ruby_gpu.directory_controllers = self._directory_controllers
# Connect the controllers using the network topology
self.ruby_gpu.network.connect(
self._controllers
+ self._directory_controllers
+ self._dma_controllers
)
self.ruby_gpu.network.setup_buffers()
def get_mem_ctrls(self):
return self._mem_ctrls

View File

@@ -0,0 +1,165 @@
# Copyright (c) 2021 The Regents of the University of California.
# All Rights Reserved
#
# Copyright (c) 2024 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects import (
SimpleExtLink,
SimpleIntLink,
SimpleNetwork,
Switch,
)
class SimplePt2Pt(SimpleNetwork):
    """A fully-connected point-to-point network. This does not use garnet."""

    def __init__(self, ruby_system):
        super().__init__()
        self.netifs = []

        # TODO: These should be in a base class
        # https://gem5.atlassian.net/browse/GEM5-1039
        self.ruby_system = ruby_system

    def connect(self, controllers):
        """Attach every controller to its own router, then wire every
        ordered pair of distinct routers together directly.
        """
        # One router/switch per controller in the system.
        self.routers = [
            Switch(router_id=idx) for idx in range(len(controllers))
        ]

        # External link from each controller into its dedicated router.
        ext_links = []
        for idx, ctrl in enumerate(controllers):
            ext_links.append(
                SimpleExtLink(
                    link_id=idx, ext_node=ctrl, int_node=self.routers[idx]
                )
            )
        self.ext_links = ext_links

        # Internal (router-to-router) links between every pair of
        # distinct routers. Link IDs start at 1 by construction.
        next_link_id = 0
        links = []
        for src in self.routers:
            for dst in self.routers:
                if src == dst:
                    # Never connect a router to itself.
                    continue
                next_link_id += 1
                links.append(
                    SimpleIntLink(
                        link_id=next_link_id, src_node=src, dst_node=dst
                    )
                )
        self.int_links = links
class SimpleDoubleCrossbar(SimpleNetwork):
    """
    GPU network with crossbars between CU caches and L2 caches and between L2
    caches and directories/memory controllers/DMAs using SimpleNetwork.
    """

    def __init__(self, ruby_system):
        super().__init__()
        self.netifs = []
        self.ruby_system = ruby_system

    def connect(self, controllers):
        """Attach each controller to its own router, then hang those
        routers off one of the two crossbar routers by controller type.
        """
        # Controller types served by each of the two crossbars.
        l2_xbar_types = ("TCP_Controller", "SQC_Controller", "TCC_Controller")
        soc_xbar_types = ("DMA_Controller", "Directory_Controller")

        # One router per controller plus two extra routers acting as the
        # L2 and SoC crossbars.
        num_ctrls = len(controllers)
        self.routers = [Switch(router_id=i) for i in range(num_ctrls + 2)]
        l2_xbar = self.routers[num_ctrls]
        soc_xbar = self.routers[num_ctrls + 1]

        # Routers 0 ... num_ctrls-1 connect to the individual controllers.
        self.ext_links = [
            SimpleExtLink(link_id=i, ext_node=ctrl, int_node=self.routers[i])
            for i, ctrl in enumerate(controllers)
        ]

        links = []

        def link_both_ways(node_a, node_b):
            # Add a pair of internal links, one in each direction, with
            # sequential link IDs.
            links.append(
                SimpleIntLink(
                    link_id=len(links), src_node=node_a, dst_node=node_b
                )
            )
            links.append(
                SimpleIntLink(
                    link_id=len(links), src_node=node_b, dst_node=node_a
                )
            )

        # Connect each controller's router to the crossbar matching its
        # controller type, in both directions.
        for ext_link in self.ext_links:
            ctrl_type = ext_link.ext_node.type
            if ctrl_type in l2_xbar_types:
                link_both_ways(ext_link.int_node, l2_xbar)
            elif ctrl_type in soc_xbar_types:
                link_both_ways(ext_link.int_node, soc_xbar)

        # Bridge the L2 crossbar to the SoC crossbar.
        link_both_ways(l2_xbar, soc_xbar)

        # Finalize network int_links for unproxy
        self.int_links = links