From 2fca39cec7738b18d6d9db2c8112133ed706755a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 30 Oct 2024 12:45:44 -0700 Subject: [PATCH] dev-amdgpu: Separating gpu_memory from gpu_cache. This change separates the instantiation of gpu memory from instantiating the gpu cache. Prior to this change, the gpu cache instantiated the memories for the gpu by receiving number of channels as a parameter. With this change, the gpu memory should be constructed outside the gpu, without being added as a child to any other object, and passed to the constructor of the gpu. --- .../example/gem5_library/x86-mi300x-gpu.py | 3 +- .../gem5/components/devices/gpus/amdgpu.py | 48 +++++++++---------- src/python/gem5/prebuilt/viper/board.py | 7 +++ .../prebuilt/viper/cpu_cache_hierarchy.py | 1 - .../prebuilt/viper/gpu_cache_hierarchy.py | 35 +++----------- 5 files changed, 39 insertions(+), 55 deletions(-) diff --git a/configs/example/gem5_library/x86-mi300x-gpu.py b/configs/example/gem5_library/x86-mi300x-gpu.py index 20fa99b9d8..712c185277 100644 --- a/configs/example/gem5_library/x86-mi300x-gpu.py +++ b/configs/example/gem5_library/x86-mi300x-gpu.py @@ -56,6 +56,7 @@ import argparse from gem5.coherence_protocol import CoherenceProtocol from gem5.components.devices.gpus.amdgpu import MI300X +from gem5.components.memory import HBM2Stack from gem5.components.memory.single_channel import SingleChannelDDR4_2400 from gem5.components.processors.cpu_types import CPUTypes from gem5.components.processors.simple_processor import SimpleProcessor @@ -120,7 +121,7 @@ for core in processor.cores: # The GPU must be created first so we can assign CPU-side DMA ports to the # CPU cache hierarchy. 
-gpu0 = MI300X() +gpu0 = MI300X(gpu_memory=HBM2Stack(size="16GiB")) cache_hierarchy = ViperCPUCacheHierarchy( l1d_size="32KiB", diff --git a/src/python/gem5/components/devices/gpus/amdgpu.py b/src/python/gem5/components/devices/gpus/amdgpu.py index 67ecc6c2c6..0865aa8b17 100644 --- a/src/python/gem5/components/devices/gpus/amdgpu.py +++ b/src/python/gem5/components/devices/gpus/amdgpu.py @@ -33,6 +33,7 @@ from m5.objects import ( ) from ....components.boards.abstract_board import AbstractBoard +from ....components.memory.abstract_memory_system import AbstractMemorySystem from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy from .viper_shader import ViperShader @@ -51,7 +52,16 @@ class BaseViperGPU(SubSystem): def get_gpu_count(cls): return cls._gpu_count - def __init__(self): + def __init__(self, gpu_memory: AbstractMemorySystem): + super().__init__() + if gpu_memory.has_parent(): + raise ValueError( + "`gpu_memory` should not have a parent, i.e. you should " + "instantiate the gpu memory like gpu_memory = HBM2Stack() " + "and **not** like board.gpu_memory = HBM2Stack()" + ) + self._memory = gpu_memory + # Setup various PCI related parameters self._my_id = self.get_gpu_count() pci_dev = self.next_pci_dev() @@ -74,19 +84,11 @@ class BaseViperGPU(SubSystem): # Connect all PIO buses self._shader.connect_iobus(board.get_io_bus()) - # The System() object in gem5 has a memories parameter which defaults - # to Self.all. This will collect *all* AbstractMemories and connect to - # the CPU side. To avoid this we manually assign the memories param to - # the CPU side memories. We need the MemInterface which is called dram - # in the MemCtrl class even though it might not be modelling dram. - memory = board.get_memory() - cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()] - board.memories = cpu_abs_mems - # Make the cache hierarchy. 
This will create an independent RubySystem # class containing only the GPU caches with no network connection to # the CPU cache hierarchy. self._device.gpu_caches = ViperGPUCacheHierarchy( + gpu_memory=self._memory, tcp_size=self._tcp_size, tcp_assoc=self._tcp_assoc, sqc_size=self._sqc_size, @@ -97,19 +99,10 @@ class BaseViperGPU(SubSystem): tcc_assoc=self._tcc_assoc, tcc_count=self._tcc_count, cu_per_sqc=self._cu_per_sqc, - num_memory_channels=self._num_memory_channels, cache_line_size=self._cache_line_size, shader=self._shader, ) - # Collect GPU memory controllers created in the GPU cache hierarchy. - # First assign them as a child to the device so the SimObject unproxy. - # The device requires the memories parameter to be set as the system - # pointer required by the AbstractMemory class is set by AMDGPUDevice. - self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls() - gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls] - self._device.memories = gpu_abs_mems - # Finally attach to the board. PciDevices default to Parent.any for the # PciHost parameter. To make sure this is found we need to connect to # board.pc or a child of board.pc. Historically we place this in the @@ -120,12 +113,20 @@ class BaseViperGPU(SubSystem): # instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device. gpu_name = f"gpu{self._my_id}" self._device.set_parent(board.pc.south_bridge, gpu_name) + self._device.memory = self._memory + + # Collect GPU memory controllers created in the GPU cache hierarchy. + # First assign them as a child to the device so the SimObject unproxy. + # The device requires the memories parameter to be set as the system + # pointer required by the AbstractMemory class is set by AMDGPUDevice. + self._device.memories = self._memory.get_mem_interfaces() # A scaled down MI210-like device. Defaults to ~1/4th of an MI210. 
class MI210(BaseViperGPU): def __init__( self, + gpu_memory: AbstractMemorySystem, num_cus: int = 32, cu_per_sqc: int = 4, tcp_size: str = "16KiB", @@ -137,10 +138,9 @@ class MI210(BaseViperGPU): tcc_size: str = "256KiB", tcc_assoc: int = 16, tcc_count: int = 8, - num_memory_channels: int = 8, cache_line_size: int = 64, ): - super().__init__() + super().__init__(gpu_memory=gpu_memory) self._cu_per_sqc = cu_per_sqc self._tcp_size = tcp_size @@ -152,7 +152,6 @@ class MI210(BaseViperGPU): self._tcc_size = tcc_size self._tcc_assoc = tcc_assoc self._tcc_count = tcc_count - self._num_memory_channels = num_memory_channels self._cache_line_size = cache_line_size self._device.device_name = "MI200" @@ -205,6 +204,7 @@ class MI210(BaseViperGPU): class MI300X(BaseViperGPU): def __init__( self, + gpu_memory: AbstractMemorySystem, num_cus: int = 40, cu_per_sqc: int = 4, tcp_size: str = "16KiB", @@ -216,10 +216,9 @@ class MI300X(BaseViperGPU): tcc_size: str = "256KiB", tcc_assoc: int = 16, tcc_count: int = 16, - num_memory_channels: int = 16, cache_line_size: int = 64, ): - super().__init__() + super().__init__(gpu_memory=gpu_memory) self._cu_per_sqc = cu_per_sqc self._tcp_size = tcp_size @@ -231,7 +230,6 @@ class MI300X(BaseViperGPU): self._tcc_size = tcc_size self._tcc_assoc = tcc_assoc self._tcc_count = tcc_count - self._num_memory_channels = num_memory_channels self._cache_line_size = cache_line_size self._device.device_name = "MI300X" diff --git a/src/python/gem5/prebuilt/viper/board.py b/src/python/gem5/prebuilt/viper/board.py index 2bf8869b92..a7958016ac 100644 --- a/src/python/gem5/prebuilt/viper/board.py +++ b/src/python/gem5/prebuilt/viper/board.py @@ -121,6 +121,13 @@ class ViperBoard(X86Board): isa.ExtendedState = avx_extended_state isa.FamilyModelStepping = avx_cpu_features + # The System() object in gem5 has a memories parameter which defaults + # to Self.all. This will collect *all* AbstractMemories and connect to + # the CPU side. 
To avoid this we manually assign the memories param to + # the CPU side memories. We need the MemInterface which is called dram + # in the MemCtrl class even though it might not be modelling dram. + self.memories = self.memory.get_mem_interfaces() + @overrides(KernelDiskWorkload) def get_disk_device(self): return "/dev/sda" diff --git a/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py b/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py index 6d7d5d90b8..960feff55a 100644 --- a/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py +++ b/src/python/gem5/prebuilt/viper/cpu_cache_hierarchy.py @@ -35,7 +35,6 @@ from m5.objects import ( RubyPortProxy, RubySequencer, RubySystem, - SimpleMemory, TreePLRURP, ) diff --git a/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py b/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py index 1012d679db..93102b6a32 100644 --- a/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py +++ b/src/python/gem5/prebuilt/viper/gpu_cache_hierarchy.py @@ -57,11 +57,9 @@ from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache from ...components.devices.gpus.viper_shader import ViperShader +from ...components.memory.abstract_memory_system import AbstractMemorySystem from ...utils.requires import requires -from .viper_network import ( - SimpleDoubleCrossbar, - SimplePt2Pt, -) +from .viper_network import SimpleDoubleCrossbar class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): @@ -75,6 +73,7 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): def __init__( self, + gpu_memory: AbstractMemorySystem, tcp_size: str, tcp_assoc: int, sqc_size: str, @@ -85,7 +84,6 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): tcc_assoc: int, tcc_count: int, cu_per_sqc: int, - num_memory_channels: int, cache_line_size: int, shader: ViperShader, ): @@ -284,33 +282,14 @@ 
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy): self._dma_controllers.append(ctrl) - # Create GPU memories. Currently fixed to HBM2. - mem_type_cls = HBM_2000_4H_1x64 - - # AMDGPUDevice currently tells the driver there is 16GiB for memory. - # Until that is a parameter, this need to be fixed to 16GiB. - gpu_mem_range = AddrRange(0, size="16GiB") - intlv_low_bit = int(math.log(self._cache_line_size, 2)) - intlv_bits = int(math.log(num_memory_channels, 2)) - - for idx in range(num_memory_channels): - addr_range = AddrRange( - gpu_mem_range.start, - size=gpu_mem_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - intlvBits=intlv_bits, - intlvMatch=idx, - xorHighBit=0, - ) - - mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range)) - self._mem_ctrls.append(mem_ctrl) - + gpu_memory.set_memory_range([AddrRange(0, size=gpu_memory.get_size())]) + self._mem_ctrls = gpu_memory.get_memory_controllers() + for addr_range, port in gpu_memory.get_mem_ports(): dir = ViperGPUDirectory( self.ruby_gpu.network, self._cache_line_size, addr_range, - self._mem_ctrls[idx].port, + port, ) dir.ruby_system = self.ruby_gpu