dev-amdgpu: Separating gpu_memory from gpu_cache.
This change separates the instantiation of gpu memory from instantiating the gpu cache. Prior to this change, the gpu cache instantiated the memories for the gpu by receiving the number of channels as a parameter. With this change, the gpu memory should be constructed outside the gpu, without being added as a child to any other object, and passed to the constructor of the gpu.
This commit is contained in:
committed by
Bobby R. Bruce
parent
1948155fb2
commit
2fca39cec7
@@ -56,6 +56,7 @@ import argparse
|
||||
|
||||
from gem5.coherence_protocol import CoherenceProtocol
|
||||
from gem5.components.devices.gpus.amdgpu import MI300X
|
||||
from gem5.components.memory import HBM2Stack
|
||||
from gem5.components.memory.single_channel import SingleChannelDDR4_2400
|
||||
from gem5.components.processors.cpu_types import CPUTypes
|
||||
from gem5.components.processors.simple_processor import SimpleProcessor
|
||||
@@ -120,7 +121,7 @@ for core in processor.cores:
|
||||
|
||||
# The GPU must be created first so we can assign CPU-side DMA ports to the
|
||||
# CPU cache hierarchy.
|
||||
gpu0 = MI300X()
|
||||
gpu0 = MI300X(gpu_memory=HBM2Stack(size="16GiB"))
|
||||
|
||||
cache_hierarchy = ViperCPUCacheHierarchy(
|
||||
l1d_size="32KiB",
|
||||
|
||||
@@ -33,6 +33,7 @@ from m5.objects import (
|
||||
)
|
||||
|
||||
from ....components.boards.abstract_board import AbstractBoard
|
||||
from ....components.memory.abstract_memory_system import AbstractMemorySystem
|
||||
from ....prebuilt.viper.gpu_cache_hierarchy import ViperGPUCacheHierarchy
|
||||
from .viper_shader import ViperShader
|
||||
|
||||
@@ -51,7 +52,16 @@ class BaseViperGPU(SubSystem):
|
||||
def get_gpu_count(cls):
|
||||
return cls._gpu_count
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, gpu_memory: AbstractMemorySystem):
|
||||
super().__init__()
|
||||
if gpu_memory.has_parent():
|
||||
raise ValueError(
|
||||
"`memory` should not have a parent, i.e. you should "
|
||||
"instantiate the gpu memory like gpu_memory = HBM2Stack() "
|
||||
"and **not** like board.gpu_memory = HBM2Stack()"
|
||||
)
|
||||
self._memory = gpu_memory
|
||||
|
||||
# Setup various PCI related parameters
|
||||
self._my_id = self.get_gpu_count()
|
||||
pci_dev = self.next_pci_dev()
|
||||
@@ -74,19 +84,11 @@ class BaseViperGPU(SubSystem):
|
||||
# Connect all PIO buses
|
||||
self._shader.connect_iobus(board.get_io_bus())
|
||||
|
||||
# The System() object in gem5 has a memories parameter which defaults
|
||||
# to Self.all. This will collect *all* AbstractMemories and connect to
|
||||
# the CPU side. To avoid this we manually assign the memories param to
|
||||
# the CPU side memories. We need the MemInterface which is called dram
|
||||
# in the MemCtrl class even though it might not be modelling dram.
|
||||
memory = board.get_memory()
|
||||
cpu_abs_mems = [mem.dram for mem in memory.get_memory_controllers()]
|
||||
board.memories = cpu_abs_mems
|
||||
|
||||
# Make the cache hierarchy. This will create an independent RubySystem
|
||||
# class containing only the GPU caches with no network connection to
|
||||
# the CPU cache hierarchy.
|
||||
self._device.gpu_caches = ViperGPUCacheHierarchy(
|
||||
gpu_memory=self._memory,
|
||||
tcp_size=self._tcp_size,
|
||||
tcp_assoc=self._tcp_assoc,
|
||||
sqc_size=self._sqc_size,
|
||||
@@ -97,19 +99,10 @@ class BaseViperGPU(SubSystem):
|
||||
tcc_assoc=self._tcc_assoc,
|
||||
tcc_count=self._tcc_count,
|
||||
cu_per_sqc=self._cu_per_sqc,
|
||||
num_memory_channels=self._num_memory_channels,
|
||||
cache_line_size=self._cache_line_size,
|
||||
shader=self._shader,
|
||||
)
|
||||
|
||||
# Collect GPU memory controllers created in the GPU cache hierarchy.
|
||||
# First assign them as a child to the device so the SimObject unproxy.
|
||||
# The device requires the memories parameter to be set as the system
|
||||
# pointer required by the AbstractMemory class is set by AMDGPUDevice.
|
||||
self._device.mem_ctrls = self._device.gpu_caches.get_mem_ctrls()
|
||||
gpu_abs_mems = [mem.dram for mem in self._device.mem_ctrls]
|
||||
self._device.memories = gpu_abs_mems
|
||||
|
||||
# Finally attach to the board. PciDevices default to Parent.any for the
|
||||
# PciHost parameter. To make sure this is found we need to connect to
|
||||
# board.pc or a child of board.pc. Historically we place this in the
|
||||
@@ -120,12 +113,20 @@ class BaseViperGPU(SubSystem):
|
||||
# instead of board.pc.south_bridge.gpu_shader.CUs.l1_tlb.gpu_device.
|
||||
gpu_name = f"gpu{self._my_id}"
|
||||
self._device.set_parent(board.pc.south_bridge, gpu_name)
|
||||
self._device.memory = self._memory
|
||||
|
||||
# Collect GPU memory controllers created in the GPU cache hierarchy.
|
||||
# First assign them as a child to the device so the SimObject unproxy.
|
||||
# The device requires the memories parameter to be set as the system
|
||||
# pointer required by the AbstractMemory class is set by AMDGPUDevice.
|
||||
self._device.memories = self._memory.get_mem_interfaces()
|
||||
|
||||
|
||||
# A scaled down MI210-like device. Defaults to ~1/4th of an MI210.
|
||||
class MI210(BaseViperGPU):
|
||||
def __init__(
|
||||
self,
|
||||
gpu_memory: AbstractMemorySystem,
|
||||
num_cus: int = 32,
|
||||
cu_per_sqc: int = 4,
|
||||
tcp_size: str = "16KiB",
|
||||
@@ -137,10 +138,9 @@ class MI210(BaseViperGPU):
|
||||
tcc_size: str = "256KiB",
|
||||
tcc_assoc: int = 16,
|
||||
tcc_count: int = 8,
|
||||
num_memory_channels: int = 8,
|
||||
cache_line_size: int = 64,
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(gpu_memory=gpu_memory)
|
||||
|
||||
self._cu_per_sqc = cu_per_sqc
|
||||
self._tcp_size = tcp_size
|
||||
@@ -152,7 +152,6 @@ class MI210(BaseViperGPU):
|
||||
self._tcc_size = tcc_size
|
||||
self._tcc_assoc = tcc_assoc
|
||||
self._tcc_count = tcc_count
|
||||
self._num_memory_channels = num_memory_channels
|
||||
self._cache_line_size = cache_line_size
|
||||
|
||||
self._device.device_name = "MI200"
|
||||
@@ -205,6 +204,7 @@ class MI210(BaseViperGPU):
|
||||
class MI300X(BaseViperGPU):
|
||||
def __init__(
|
||||
self,
|
||||
gpu_memory: AbstractMemorySystem,
|
||||
num_cus: int = 40,
|
||||
cu_per_sqc: int = 4,
|
||||
tcp_size: str = "16KiB",
|
||||
@@ -216,10 +216,9 @@ class MI300X(BaseViperGPU):
|
||||
tcc_size: str = "256KiB",
|
||||
tcc_assoc: int = 16,
|
||||
tcc_count: int = 16,
|
||||
num_memory_channels: int = 16,
|
||||
cache_line_size: int = 64,
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(gpu_memory=gpu_memory)
|
||||
|
||||
self._cu_per_sqc = cu_per_sqc
|
||||
self._tcp_size = tcp_size
|
||||
@@ -231,7 +230,6 @@ class MI300X(BaseViperGPU):
|
||||
self._tcc_size = tcc_size
|
||||
self._tcc_assoc = tcc_assoc
|
||||
self._tcc_count = tcc_count
|
||||
self._num_memory_channels = num_memory_channels
|
||||
self._cache_line_size = cache_line_size
|
||||
|
||||
self._device.device_name = "MI300X"
|
||||
|
||||
@@ -121,6 +121,13 @@ class ViperBoard(X86Board):
|
||||
isa.ExtendedState = avx_extended_state
|
||||
isa.FamilyModelStepping = avx_cpu_features
|
||||
|
||||
# The System() object in gem5 has a memories parameter which defaults
|
||||
# to Self.all. This will collect *all* AbstractMemories and connect to
|
||||
# the CPU side. To avoid this we manually assign the memories param to
|
||||
# the CPU side memories. We need the MemInterface which is called dram
|
||||
# in the MemCtrl class even though it might not be modelling dram.
|
||||
self.memories = self.memory.get_mem_interfaces()
|
||||
|
||||
@overrides(KernelDiskWorkload)
|
||||
def get_disk_device(self):
|
||||
return "/dev/sda"
|
||||
|
||||
@@ -35,7 +35,6 @@ from m5.objects import (
|
||||
RubyPortProxy,
|
||||
RubySequencer,
|
||||
RubySystem,
|
||||
SimpleMemory,
|
||||
TreePLRURP,
|
||||
)
|
||||
|
||||
|
||||
@@ -57,11 +57,9 @@ from ...components.cachehierarchies.ruby.caches.viper.sqc import SQCCache
|
||||
from ...components.cachehierarchies.ruby.caches.viper.tcc import TCCCache
|
||||
from ...components.cachehierarchies.ruby.caches.viper.tcp import TCPCache
|
||||
from ...components.devices.gpus.viper_shader import ViperShader
|
||||
from ...components.memory.abstract_memory_system import AbstractMemorySystem
|
||||
from ...utils.requires import requires
|
||||
from .viper_network import (
|
||||
SimpleDoubleCrossbar,
|
||||
SimplePt2Pt,
|
||||
)
|
||||
from .viper_network import SimpleDoubleCrossbar
|
||||
|
||||
|
||||
class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
@@ -75,6 +73,7 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
gpu_memory: AbstractMemorySystem,
|
||||
tcp_size: str,
|
||||
tcp_assoc: int,
|
||||
sqc_size: str,
|
||||
@@ -85,7 +84,6 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
tcc_assoc: int,
|
||||
tcc_count: int,
|
||||
cu_per_sqc: int,
|
||||
num_memory_channels: int,
|
||||
cache_line_size: int,
|
||||
shader: ViperShader,
|
||||
):
|
||||
@@ -284,33 +282,14 @@ class ViperGPUCacheHierarchy(AbstractRubyCacheHierarchy):
|
||||
|
||||
self._dma_controllers.append(ctrl)
|
||||
|
||||
# Create GPU memories. Currently fixed to HBM2.
|
||||
mem_type_cls = HBM_2000_4H_1x64
|
||||
|
||||
# AMDGPUDevice currently tells the driver there is 16GiB for memory.
|
||||
# Until that is a parameter, this need to be fixed to 16GiB.
|
||||
gpu_mem_range = AddrRange(0, size="16GiB")
|
||||
intlv_low_bit = int(math.log(self._cache_line_size, 2))
|
||||
intlv_bits = int(math.log(num_memory_channels, 2))
|
||||
|
||||
for idx in range(num_memory_channels):
|
||||
addr_range = AddrRange(
|
||||
gpu_mem_range.start,
|
||||
size=gpu_mem_range.size(),
|
||||
intlvHighBit=intlv_low_bit + intlv_bits - 1,
|
||||
intlvBits=intlv_bits,
|
||||
intlvMatch=idx,
|
||||
xorHighBit=0,
|
||||
)
|
||||
|
||||
mem_ctrl = MemCtrl(dram=mem_type_cls(range=addr_range))
|
||||
self._mem_ctrls.append(mem_ctrl)
|
||||
|
||||
gpu_memory.set_memory_range([AddrRange(0, size=gpu_memory.get_size())])
|
||||
self._mem_ctrls = gpu_memory.get_memory_controllers()
|
||||
for addr_range, port in gpu_memory.get_mem_ports():
|
||||
dir = ViperGPUDirectory(
|
||||
self.ruby_gpu.network,
|
||||
self._cache_line_size,
|
||||
addr_range,
|
||||
self._mem_ctrls[idx].port,
|
||||
port,
|
||||
)
|
||||
|
||||
dir.ruby_system = self.ruby_gpu
|
||||
|
||||
Reference in New Issue
Block a user