dev-amdgpu: Separating gpu_memory from gpu_cache.
This change separates the instantiation of gpu memory from instantiating the gpu cache. Prior to this change, the gpu cache instantiated the memories for the gpu by receiving the number of channels as a parameter. With this change, the gpu memory should be constructed outside the gpu, without being added as a child to any other object, and passed to the constructor of the gpu.
This commit is contained in:
committed by
Bobby R. Bruce
parent
1948155fb2
commit
2fca39cec7
@@ -56,6 +56,7 @@ import argparse
|
||||
|
||||
from gem5.coherence_protocol import CoherenceProtocol
|
||||
from gem5.components.devices.gpus.amdgpu import MI300X
|
||||
from gem5.components.memory import HBM2Stack
|
||||
from gem5.components.memory.single_channel import SingleChannelDDR4_2400
|
||||
from gem5.components.processors.cpu_types import CPUTypes
|
||||
from gem5.components.processors.simple_processor import SimpleProcessor
|
||||
@@ -120,7 +121,7 @@ for core in processor.cores:
|
||||
|
||||
# The GPU must be created first so we can assign CPU-side DMA ports to the
|
||||
# CPU cache hierarchy.
|
||||
gpu0 = MI300X()
|
||||
gpu0 = MI300X(gpu_memory=HBM2Stack(size="16GiB"))
|
||||
|
||||
cache_hierarchy = ViperCPUCacheHierarchy(
|
||||
l1d_size="32KiB",
|
||||
|
||||
@@ -33,6 +33,7 @@ from m5.objects import (
|
||||
)
|
||||
|
||||
from ....components.boards.abstract_board import AbstractBoard
|
||||
from ....components.memory.abstract_memory_system import AbstractMemorySystem
|
||||
from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy
|
||||
from .viper_shader import ViperShader
|
||||
|
||||
@@ -51,7 +52,16 @@ class BaseViperGPU(SubSystem):
|
||||
def get_gpu_count(cls):
|
||||
return cls._gpu_count
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, gpu_memory: AbstractMemorySystem):
|
||||
super().__init__()
|
||||
if gpu_memory.has_parent():
|
||||
raise ValueError(
|
||||
"`memory` should not have a parent, i.e. you should "
|
||||
"instantiate the gpu memory like gpu_memory = HBM2Stack() "
|
||||
"and **not** like board.gpu_memory = HBM2Stack()"
|
||||
)
|
||||
self._memory = gpu_memory
|
||||
|
||||
# Setup various PCI related parameters
|
||||
self._my_id = self.get_gpu_count()
|
||||
pci_dev = self.next_pci_dev()
|
||||
@@ -74,19 +84,11 @@ class BaseViperGPU(SubSystem):
|
||||
# Connect all PIO buses
|
||||
self._shader.connect_iobus(board.get_io_bus())
|
||||
|
||||
# The System() object in gem5 has a memories parameter which defaults
|
||||
# to Self.all. This will collect *all* AbstractMemories and connect to
|
||||
# the CPU side. To avoid this we manually assign the memories param to
|
||||
# the CPU side memories. We need the MemInterface which is called dram
|
||||
# in the MemCtrl class even though it might not be modelling dram.
|
||||
memory = board.get_memory()
|
||||
cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()]
|
||||
board.memories = cpu_abs_mems
|
||||
|
||||
# Make the cache hierarchy. This will create an independent RubySystem
|
||||
# class containing only the GPU caches with no network connection to
|
||||
# the CPU cache hierarchy.
|
||||
self._device.gpu_caches = ViperGPUCacheHierarchy(
|
||||
gpu_memory=self._memory,
|
||||
tcp_size=self._tcp_size,
|
||||
tcp_assoc=self._tcp_assoc,
|
||||
sqc_size=self._sqc_size,
|
||||
@@ -97,19 +99,10 @@ class BaseViperGPU(SubSystem):
|
||||
tcc_assoc=self._tcc_assoc,
|
||||
tcc_count=self._tcc_count,
|
||||
cu_per_sqc=self._cu_per_sqc,
|
||||
num_memory_channels=self._num_memory_channels,
|
||||
cache_line_size=self._cache_line_size,
|
||||
shader=self._shader,
|
||||
)
|
||||
|
||||
# Collect GPU memory controllers created in the GPU cache hierarchy.
|
||||
# First assign them as a child to the device so the SimObject unproxy.
|
||||
# The device requires the memories parameter to be set as the system
|
||||
# pointer required by the AbstractMemory class is set by AMDGPUDevice.
|
||||
self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls()
|
||||
gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls]
|
||||
self._device.memories = gpu_abs_mems
|
||||
|
||||
# Finally attach to the board. PciDevices default to Parent.any for the
|
||||
# PciHost parameter. To make sure this is found we need to connect to
|
||||
# board.pc or a child of board.pc. Historically we place this in the
|
||||
@@ -120,12 +113,20 @@ class BaseViperGPU(SubSystem):
|
||||
# instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device.
|
||||
gpu_name = f"gpu{self._my_id}"
|
||||
self._device.set_parent(board.pc.south_bridge, gpu_name)
|
||||
self._device.memory = self._memory
|
||||
|
||||
# Collect GPU memory controllers created in the GPU cache hierarchy.
|
||||
# First assign them as a child to the device so the SimObject unproxy.
|
||||
# The device requires the memories parameter to be set as the system
|
||||
# pointer required by the AbstractMemory class is set by AMDGPUDevice.
|
||||
self._device.memories = self._memory.get_mem_interfaces()
|
||||
|
||||
|
||||
# A scaled down MI210-like device. Defaults to ~1/4th of an MI210.
|
||||
class MI210(BaseViperGPU):
|
||||
def __init__(
|
||||
self,
|
||||
gpu_memory: AbstractMemorySystem,
|
||||
num_cus: int = 32,
|
||||
cu_per_sqc: int = 4,
|
||||
tcp_size: str = "16KiB",
|
||||
@@ -137,10 +138,9 @@ class MI210(BaseViperGPU):
|
||||
tcc_size: str = "256KiB",
|
||||
tcc_assoc: int = 16,
|
||||
tcc_count: int = 8,
|
||||
num_memory_channels: int = 8,
|
||||
cache_line_size: int = 64,
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(gpu_memory=gpu_memory)
|
||||
|
||||
self._cu_per_sqc = cu_per_sqc
|
||||
self._tcp_size = tcp_size
|
||||
@@ -152,7 +152,6 @@ class MI210(BaseViperGPU):
|
||||
self._tcc_size = tcc_size
|
||||
self._tcc_assoc = tcc_assoc
|
||||
self._tcc_count = tcc_count
|
||||
self._num_memory_channels = num_memory_channels
|
||||
self._cache_line_size = cache_line_size
|
||||
|
||||
self._device.device_name = "MI200"
|
||||
@@ -205,6 +204,7 @@ class MI210(BaseViperGPU):
|
||||
class MI300X(BaseViperGPU):
|
||||
def __init__(
|
||||
self,
|
||||
gpu_memory: AbstractMemorySystem,
|
||||
num_cus: int = 40,
|
||||
cu_per_sqc: int = 4,
|
||||
tcp_size: str = "16KiB",
|
||||
@@ -216,10 +216,9 @@ class MI300X(BaseViperGPU):
|
||||
tcc_size: str = "256KiB",
|
||||
tcc_assoc: int = 16,
|
||||
tcc_count: int = 16,
|
||||
num_memory_channels: int = 16,
|
||||
cache_line_size: int = 64,
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(gpu_memory=gpu_memory)
|
||||
|
||||
self._cu_per_sqc = cu_per_sqc
|
||||
self._tcp_size = tcp_size
|
||||
@@ -231,7 +230,6 @@ class MI300X(BaseViperGPU):
|
||||
self._tcc_size = tcc_size
|
||||
self._tcc_assoc = tcc_assoc
|
||||
self._tcc_count = tcc_count
|
||||
self._num_memory_channels = num_memory_channels
|
||||
self._cache_line_size = cache_line_size
|
||||
|
||||
self._device.device_name = "MI300X"
|
||||
|
||||
@@ -121,6 +121,13 @@ class ViperBoard(X86Board):
|
||||
isa.ExtendedState = avx_extended_state
|
||||
isa.FamilyModelStepping = avx_cpu_features
|
||||
|
||||
# The System() object in gem5 has a memories parameter which defaults
|
||||
# to Self.all. This will collect *all* AbstractMemories and connect to
|
||||
# the CPU side. To avoid this we manually assign the memories param to
|
||||
# the CPU side memories. We need the MemInterface which is called dram
|
||||
# in the MemCtrl class even though it might not be modelling dram.
|
||||
self.memories = self.memory.get_mem_interfaces()
|
||||
|
||||
@overrides(KernelDiskWorkload)
|
||||
def get_disk_device(self):
|
||||
return "/dev/sda"
|
||||
|
||||
@@ -35,7 +35,6 @@ from m5.objects import (
|
||||
RubyPortProxy,
|
||||
RubySequencer,
|
||||
RubySystem,
|
||||
SimpleMemory,
|
||||
TreePLRURP,
|
||||
)
|
||||
|
||||
|
||||
@@ -57,11 +57,9 @@ from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache
|
||||
from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache
|
||||
from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache
|
||||
from ...components.devices.gpus.viper_shader import ViperShader
|
||||
from ...components.memory.abstract_memory_system import AbstractMemorySystem
|
||||
from ...utils.requires import requires
|
||||
from .viper_network import (
|
||||
SimpleDoubleCrossbar,
|
||||
SimplePt2Pt,
|
||||
)
|
||||
from .viper_network import SimpleDoubleCrossbar
|
||||
|
||||
|
||||
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
@@ -75,6 +73,7 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
gpu_memory: AbstractMemorySystem,
|
||||
tcp_size: str,
|
||||
tcp_assoc: int,
|
||||
sqc_size: str,
|
||||
@@ -85,7 +84,6 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
tcc_assoc: int,
|
||||
tcc_count: int,
|
||||
cu_per_sqc: int,
|
||||
num_memory_channels: int,
|
||||
cache_line_size: int,
|
||||
shader: ViperShader,
|
||||
):
|
||||
@@ -284,33 +282,14 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
|
||||
self._dma_controllers.append(ctrl)
|
||||
|
||||
# Create GPU memories. Currently fixed to HBM2.
|
||||
mem_type_cls = HBM_2000_4H_1x64
|
||||
|
||||
# AMDGPUDevice currently tells the driver there is 16GiB for memory.
|
||||
# Until that is a parameter, this need to be fixed to 16GiB.
|
||||
gpu_mem_range = AddrRange(0, size="16GiB")
|
||||
intlv_low_bit = int(math.log(self._cache_line_size, 2))
|
||||
intlv_bits = int(math.log(num_memory_channels, 2))
|
||||
|
||||
for idx in range(num_memory_channels):
|
||||
addr_range = AddrRange(
|
||||
gpu_mem_range.start,
|
||||
size=gpu_mem_range.size(),
|
||||
intlvHighBit=intlv_low_bit + intlv_bits - 1,
|
||||
intlvBits=intlv_bits,
|
||||
intlvMatch=idx,
|
||||
xorHighBit=0,
|
||||
)
|
||||
|
||||
mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range))
|
||||
self._mem_ctrls.append(mem_ctrl)
|
||||
|
||||
gpu_memory.set_memory_range([AddrRange(0, size=gpu_memory.get_size())])
|
||||
self._mem_ctrls = gpu_memory.get_memory_controllers()
|
||||
for addr_range, port in gpu_memory.get_mem_ports():
|
||||
dir = ViperGPUDirectory(
|
||||
self.ruby_gpu.network,
|
||||
self._cache_line_size,
|
||||
addr_range,
|
||||
self._mem_ctrls[idx].port,
|
||||
port,
|
||||
)
|
||||
|
||||
dir.ruby_system = self.ruby_gpu
|
||||
|
||||
Reference in New Issue
Block a user