dev-amdgpu: Separating gpu_memory from gpu_cache.

This change separates the instantiation of gpu memory from
instantiating the gpu cache. Prior to this change, the gpu
cache instantiated the memories for the gpu by receiving the number
of channels as a parameter. With this change, the gpu memory
should be constructed outside the gpu, without being added as a
child to any other object, and passed to the constructor of
the gpu.
This commit is contained in:
Mahyar Samani
2024-10-30 12:45:44 -07:00
committed by Bobby R. Bruce
parent 1948155fb2
commit 2fca39cec7
5 changed files with 39 additions and 55 deletions

View File

@@ -56,6 +56,7 @@ import argparse
from gem5.coherence_protocol import CoherenceProtocol
from gem5.components.devices.gpus.amdgpu import MI300X
from gem5.components.memory import HBM2Stack
from gem5.components.memory.single_channel import SingleChannelDDR4_2400
from gem5.components.processors.cpu_types import CPUTypes
from gem5.components.processors.simple_processor import SimpleProcessor
@@ -120,7 +121,7 @@ for core in processor.cores:
# The GPU must be created first so we can assign CPU-side DMA ports to the
# CPU cache hierarchy.
gpu0 = MI300X()
gpu0 = MI300X(gpu_memory=HBM2Stack(size="16GiB"))
cache_hierarchy = ViperCPUCacheHierarchy(
l1d_size="32KiB",

View File

@@ -33,6 +33,7 @@ from m5.objects import (
)
from ....components.boards.abstract_board import AbstractBoard
from ....components.memory.abstract_memory_system import AbstractMemorySystem
from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy
from .viper_shader import ViperShader
@@ -51,7 +52,16 @@ class BaseViperGPU(SubSystem):
def get_gpu_count(cls):
return cls._gpu_count
def __init__(self):
def __init__(self, gpu_memory: AbstractMemorySystem):
super().__init__()
if gpu_memory.has_parent():
raise ValueError(
"`memory` should not have a parent, i.e. you should "
"instantiate the gpu memory like gpu_memory = HBM2Stack() "
"and **not** like board.gpu_memory = HBM2Stack()"
)
self._memory = gpu_memory
# Setup various PCI related parameters
self._my_id = self.get_gpu_count()
pci_dev = self.next_pci_dev()
@@ -74,19 +84,11 @@ class BaseViperGPU(SubSystem):
# Connect all PIO buses
self._shader.connect_iobus(board.get_io_bus())
# The System() object in gem5 has a memories parameter which defaults
# to Self.all. This will collect *all* AbstractMemories and connect to
# the CPU side. To avoid this we manually assign the memories param to
# the CPU side memories. We need the MemInterface which is called dram
# in the MemCtrl class even though it might not be modelling dram.
memory = board.get_memory()
cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()]
board.memories = cpu_abs_mems
# Make the cache hierarchy. This will create an independent RubySystem
# class containing only the GPU caches with no network connection to
# the CPU cache hierarchy.
self._device.gpu_caches = ViperGPUCacheHierarchy(
gpu_memory=self._memory,
tcp_size=self._tcp_size,
tcp_assoc=self._tcp_assoc,
sqc_size=self._sqc_size,
@@ -97,19 +99,10 @@ class BaseViperGPU(SubSystem):
tcc_assoc=self._tcc_assoc,
tcc_count=self._tcc_count,
cu_per_sqc=self._cu_per_sqc,
num_memory_channels=self._num_memory_channels,
cache_line_size=self._cache_line_size,
shader=self._shader,
)
# Collect GPU memory controllers created in the GPU cache hierarchy.
# First assign them as a child to the device so the SimObject unproxy.
# The device requires the memories parameter to be set as the system
# pointer required by the AbstractMemory class is set by AMDGPUDevice.
self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls()
gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls]
self._device.memories = gpu_abs_mems
# Finally attach to the board. PciDevices default to Parent.any for the
# PciHost parameter. To make sure this is found we need to connect to
# board.pc or a child of board.pc. Historically we place this in the
@@ -120,12 +113,20 @@ class BaseViperGPU(SubSystem):
# instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device.
gpu_name = f"gpu{self._my_id}"
self._device.set_parent(board.pc.south_bridge, gpu_name)
self._device.memory = self._memory
# Collect the memory interfaces from the externally constructed GPU
# memory system and assign them to the device so the SimObjects can
# unproxy. The device requires the memories parameter to be set, as the
# system pointer required by the AbstractMemory class is set by
# AMDGPUDevice.
self._device.memories = self._memory.get_mem_interfaces()
# A scaled down MI210-like device. Defaults to ~1/4th of an MI210.
class MI210(BaseViperGPU):
def __init__(
self,
gpu_memory: AbstractMemorySystem,
num_cus: int = 32,
cu_per_sqc: int = 4,
tcp_size: str = "16KiB",
@@ -137,10 +138,9 @@ class MI210(BaseViperGPU):
tcc_size: str = "256KiB",
tcc_assoc: int = 16,
tcc_count: int = 8,
num_memory_channels: int = 8,
cache_line_size: int = 64,
):
super().__init__()
super().__init__(gpu_memory=gpu_memory)
self._cu_per_sqc = cu_per_sqc
self._tcp_size = tcp_size
@@ -152,7 +152,6 @@ class MI210(BaseViperGPU):
self._tcc_size = tcc_size
self._tcc_assoc = tcc_assoc
self._tcc_count = tcc_count
self._num_memory_channels = num_memory_channels
self._cache_line_size = cache_line_size
self._device.device_name = "MI200"
@@ -205,6 +204,7 @@ class MI210(BaseViperGPU):
class MI300X(BaseViperGPU):
def __init__(
self,
gpu_memory: AbstractMemorySystem,
num_cus: int = 40,
cu_per_sqc: int = 4,
tcp_size: str = "16KiB",
@@ -216,10 +216,9 @@ class MI300X(BaseViperGPU):
tcc_size: str = "256KiB",
tcc_assoc: int = 16,
tcc_count: int = 16,
num_memory_channels: int = 16,
cache_line_size: int = 64,
):
super().__init__()
super().__init__(gpu_memory=gpu_memory)
self._cu_per_sqc = cu_per_sqc
self._tcp_size = tcp_size
@@ -231,7 +230,6 @@ class MI300X(BaseViperGPU):
self._tcc_size = tcc_size
self._tcc_assoc = tcc_assoc
self._tcc_count = tcc_count
self._num_memory_channels = num_memory_channels
self._cache_line_size = cache_line_size
self._device.device_name = "MI300X"

View File

@@ -121,6 +121,13 @@ class ViperBoard(X86Board):
isa.ExtendedState = avx_extended_state
isa.FamilyModelStepping = avx_cpu_features
# The System() object in gem5 has a memories parameter which defaults
# to Self.all. This will collect *all* AbstractMemories and connect to
# the CPU side. To avoid this we manually assign the memories param to
# the CPU side memories. We need the MemInterface which is called dram
# in the MemCtrl class even though it might not be modelling dram.
self.memories = self.memory.get_mem_interfaces()
@overrides(KernelDiskWorkload)
def get_disk_device(self):
return "/dev/sda"

View File

@@ -35,7 +35,6 @@ from m5.objects import (
RubyPortProxy,
RubySequencer,
RubySystem,
SimpleMemory,
TreePLRURP,
)

View File

@@ -57,11 +57,9 @@ from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache
from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache
from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache
from ...components.devices.gpus.viper_shader import ViperShader
from ...components.memory.abstract_memory_system import AbstractMemorySystem
from ...utils.requires import requires
from .viper_network import (
SimpleDoubleCrossbar,
SimplePt2Pt,
)
from .viper_network import SimpleDoubleCrossbar
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
@@ -75,6 +73,7 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
def __init__(
self,
gpu_memory: AbstractMemorySystem,
tcp_size: str,
tcp_assoc: int,
sqc_size: str,
@@ -85,7 +84,6 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
tcc_assoc: int,
tcc_count: int,
cu_per_sqc: int,
num_memory_channels: int,
cache_line_size: int,
shader: ViperShader,
):
@@ -284,33 +282,14 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
self._dma_controllers.append(ctrl)
# Create GPU memories. Currently fixed to HBM2.
mem_type_cls = HBM_2000_4H_1x64
# AMDGPUDevice currently tells the driver there is 16GiB for memory.
# Until that is a parameter, this need to be fixed to 16GiB.
gpu_mem_range = AddrRange(0, size="16GiB")
intlv_low_bit = int(math.log(self._cache_line_size, 2))
intlv_bits = int(math.log(num_memory_channels, 2))
for idx in range(num_memory_channels):
addr_range = AddrRange(
gpu_mem_range.start,
size=gpu_mem_range.size(),
intlvHighBit=intlv_low_bit + intlv_bits - 1,
intlvBits=intlv_bits,
intlvMatch=idx,
xorHighBit=0,
)
mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range))
self._mem_ctrls.append(mem_ctrl)
gpu_memory.set_memory_range([AddrRange(0, size=gpu_memory.get_size())])
self._mem_ctrls = gpu_memory.get_memory_controllers()
for addr_range, port in gpu_memory.get_mem_ports():
dir = ViperGPUDirectory(
self.ruby_gpu.network,
self._cache_line_size,
addr_range,
self._mem_ctrls[idx].port,
port,
)
dir.ruby_system = self.ruby_gpu