From 2fca39cec7738b18d6d9db2c8112133ed706755a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 30 Oct 2024 12:45:44 -0700 Subject: [PATCH] dev-amdgpu: Separating gpu_memory from gpu_cache. This change separates the instantiation of gpu memory from instantiating the gpu cache. Prior to this change, the gpu cache instantiated the memories for the gpu by receiving number of channels as a parameter. With this change, the gpu memory should be constructed outside the gpu, without being added as a child to any other object, and passed to the constructor of the gpu. --- .../example/gem5_library/x86-mi300x-gpu.py | 3 +- .../gem5/components/devices/gpus/amdgpu.py | 48 +++++++++---------- src/python/gem5/prebuilt/viper/board.py | 7 +++ .../prebuilt/viper/cpu_cache_hierarchy.py | 1 - .../prebuilt/viper/gpu_cache_hierarchy.py | 35 +++----------- 5 files changed, 39 insertions(+), 55 deletions(-) diff --git a/configs/example/gem5_library/x86-mi300x-gpu.py b/configs/example/gem5_library/x86-mi300x-gpu.py index 20fa99b9d8..712c185277 100644 --- a/configs/example/gem5_library/x86-mi300x-gpu.py +++ b/configs/example/gem5_library/x86-mi300x-gpu.py @@ -56,6 +56,7 @@ import argparse from gem5.coherence_protocol import CoherenceProtocol from gem5.components.devices.gpus.amdgpu import MI300X +from gem5.components.memory import HBM2Stack from gem5.components.memory.single_channel import SingleChannelDDR4_2400 from gem5.components.processors.cpu_types import CPUTypes from gem5.components.processors.simple_processor import SimpleProcessor @@ -120,7 +121,7 @@ for core in processor.cores: # The GPU must be created first so we can assign CPU-side DMA ports to the # CPU cache hierarchy. 
-gpu0 = MI300X() +gpu0 = MI300X(gpu_memory=HBM2Stack(size="16GiB")) cache_hierarchy = ViperCPUCacheHierarchy( l1d_size="32KiB", diff --git a/src/python/gem5/components/devices/gpus/amdgpu.py b/src/python/gem5/components/devices/gpus/amdgpu.py index 67ecc6c2c6..0865aa8b17 100644 --- a/src/python/gem5/components/devices/gpus/amdgpu.py +++ b/src/python/gem5/components/devices/gpus/amdgpu.py @@ -33,6 +33,7 @@ from m5.objects import ( ) from ....components.boards.abstract_board import AbstractBoard +from ....components.memory.abstract_memory_system import AbstractMemorySystem from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy from .viper_shader import ViperShader @@ -51,7 +52,16 @@ class BaseViperGPU(SubSystem): def get_gpu_count(cls): return cls._gpu_count - def __init__(self): + def __init__(self, gpu_memory: AbstractMemorySystem): + super().__init__() + if gpu_memory.has_parent(): + raise ValueError( + "`gpu_memory` should not have a parent, i.e. you should " + "instantiate the gpu memory like gpu_memory = HBM2Stack() " + "and **not** like board.gpu_memory = HBM2Stack()" + ) + self._memory = gpu_memory + # Setup various PCI related parameters self._my_id = self.get_gpu_count() pci_dev = self.next_pci_dev() @@ -74,19 +84,11 @@ class BaseViperGPU(SubSystem): # Connect all PIO buses self._shader.connect_iobus(board.get_io_bus()) - # The System() object in gem5 has a memories parameter which defaults - # to Self.all. This will collect *all* AbstractMemories and connect to - # the CPU side. To avoid this we manually assign the memories param to - # the CPU side memories. We need the MemInterface which is called dram - # in the MemCtrl class even though it might not be modelling dram. - memory = board.get_memory() - cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()] - board.memories = cpu_abs_mems - # Make the cache hierarchy. 
This will create an independent RubySystem # class containing only the GPU caches with no network connection to # the CPU cache hierarchy. self._device.gpu_caches = ViperGPUCacheHierarchy( + gpu_memory=self._memory, tcp_size=self._tcp_size, tcp_assoc=self._tcp_assoc, sqc_size=self._sqc_size, @@ -97,19 +99,10 @@ class BaseViperGPU(SubSystem): tcc_assoc=self._tcc_assoc, tcc_count=self._tcc_count, cu_per_sqc=self._cu_per_sqc, - num_memory_channels=self._num_memory_channels, cache_line_size=self._cache_line_size, shader=self._shader, ) - # Collect GPU memory controllers created in the GPU cache hierarchy. - # First assign them as a child to the device so the SimObject unproxy. - # The device requires the memories parameter to be set as the system - # pointer required by the AbstractMemory class is set by AMDGPUDevice. - self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls() - gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls] - self._device.memories = gpu_abs_mems - # Finally attach to the board. PciDevices default to Parent.any for the # PciHost parameter. To make sure this is found we need to connect to # board.pc or a child of board.pc. Historically we place this in the @@ -120,12 +113,20 @@ class BaseViperGPU(SubSystem): # instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device. gpu_name = f"gpu{self._my_id}" self._device.set_parent(board.pc.south_bridge, gpu_name) + self._device.memory = self._memory + + # Collect GPU memory controllers created in the GPU cache hierarchy. + # First assign them as a child to the device so the SimObject unproxy. + # The device requires the memories parameter to be set as the system + # pointer required by the AbstractMemory class is set by AMDGPUDevice. + self._device.memories = self._memory.get_mem_interfaces() # A scaled down MI210-like device. Defaults to ~1/4th of an MI210. 
class MI210(BaseViperGPU): def __init__( self, + gpu_memory: AbstractMemorySystem, num_cus: int = 32, cu_per_sqc: int = 4, tcp_size: str = "16KiB", @@ -137,10 +138,9 @@ class MI210(BaseViperGPU): tcc_size: str = "256KiB", tcc_assoc: int = 16, tcc_count: int = 8, - num_memory_channels: int = 8, cache_line_size: int = 64, ): - super().__init__() + super().__init__(gpu_memory=gpu_memory) self._cu_per_sqc = cu_per_sqc self._tcp_size = tcp_size @@ -152,7 +152,6 @@ class MI210(BaseViperGPU): self._tcc_size = tcc_size self._tcc_assoc = tcc_assoc self._tcc_count = tcc_count - self._num_memory_channels = num_memory_channels self._cache_line_size = cache_line_size self._device.device_name = "MI200" @@ -205,6 +204,7 @@ class MI210(BaseViperGPU): class MI300X(BaseViperGPU): def __init__( self, + gpu_memory: AbstractMemorySystem, num_cus: int = 40, cu_per_sqc: int = 4, tcp_size: str = "16KiB", @@ -216,10 +216,9 @@ class MI300X(BaseViperGPU): tcc_size: str = "256KiB", tcc_assoc: int = 16, tcc_count: int = 16, - num_memory_channels: int = 16, cache_line_size: int = 64, ): - super().__init__() + super().__init__(gpu_memory=gpu_memory) self._cu_per_sqc = cu_per_sqc self._tcp_size = tcp_size @@ -231,7 +230,6 @@ class MI300X(BaseViperGPU): self._tcc_size = tcc_size self._tcc_assoc = tcc_assoc self._tcc_count = tcc_count - self._num_memory_channels = num_memory_channels self._cache_line_size = cache_line_size self._device.device_name = "MI300X" diff --git a/src/python/gem5/prebuilt/viper/board.py b/src/python/gem5/prebuilt/viper/board.py index 2bf8869b92..a7958016ac 100644 --- a/src/python/gem5/prebuilt/viper/board.py +++ b/src/python/gem5/prebuilt/viper/board.py @@ -121,6 +121,13 @@ class ViperBoard(X86Board): isa.ExtendedState = avx_extended_state isa.FamilyModelStepping = avx_cpu_features + # The System() object in gem5 has a memories parameter which defaults + # to Self.all. This will collect *all* AbstractMemories and connect to + # the CPU side. 
To avoid this we manually assign the memories param to + # the CPU side memories. We need the MemInterface which is called dram + # in the MemCtrl class even though it might not be modelling dram. + self.memories = self.memory.get_mem_interfaces() + @overrides(KernelDiskWorkload) def get_disk_device(self): return "/dev/sda" diff --git a/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py b/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py index 6d7d5d90b8..960feff55a 100644 --- a/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py +++ b/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py @@ -35,7 +35,6 @@ from m5.objects import ( RubyPortProxy, RubySequencer, RubySystem, - SimpleMemory, TreePLRURP, ) diff --git a/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py b/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py index 1012d679db..93102b6a32 100644 --- a/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py +++ b/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py @@ -57,11 +57,9 @@ from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache from ...components.devices.gpus.viper_shader import ViperShader +from ...components.memory.abstract_memory_system import AbstractMemorySystem from ...utils.requires import requires -from .viper_network import ( - SimpleDoubleCrossbar, - SimplePt2Pt, -) +from .viper_network import SimpleDoubleCrossbar class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): @@ -75,6 +73,7 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): def __init__( self, + gpu_memory: AbstractMemorySystem, tcp_size: str, tcp_assoc: int, sqc_size: str, @@ -85,7 +84,6 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): tcc_assoc: int, tcc_count: int, cu_per_sqc: int, - num_memory_channels: int, cache_line_size: int, shader: ViperShader, ): @@ -284,33 +282,14 @@ 
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): self._dma_controllers.append(ctrl) - # Create GPU memories. Currently fixed to HBM2. - mem_type_cls = HBM_2000_4H_1x64 - - # AMDGPUDevice currently tells the driver there is 16GiB for memory. - # Until that is a parameter, this need to be fixed to 16GiB. - gpu_mem_range = AddrRange(0, size="16GiB") - intlv_low_bit = int(math.log(self._cache_line_size, 2)) - intlv_bits = int(math.log(num_memory_channels, 2)) - - for idx in range(num_memory_channels): - addr_range = AddrRange( - gpu_mem_range.start, - size=gpu_mem_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - intlvBits=intlv_bits, - intlvMatch=idx, - xorHighBit=0, - ) - - mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range)) - self._mem_ctrls.append(mem_ctrl) - + gpu_memory.set_memory_range([AddrRange(0, size=gpu_memory.get_size())]) + self._mem_ctrls = gpu_memory.get_memory_controllers() + for addr_range, port in gpu_memory.get_mem_ports(): dir = ViperGPUDirectory( self.ruby_gpu.network, self._cache_line_size, addr_range, - self._mem_ctrls[idx].port, + port, ) dir.ruby_system = self.ruby_gpu