diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py index ae483cc401..9ca95839f8 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py @@ -109,7 +109,7 @@ class RISCVMatchedBoard( def __init__( self, clk_freq: str = "1.2GHz", - l2_size: str = "2MB", + l2_size: str = "2MiB", is_fs: bool = False, ) -> None: """ diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py index dc66af354b..25e55ef310 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 The Regents of the University of California +# Copyright (c) 2023 The Regents of the University of California # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -42,6 +42,7 @@ from gem5.isas import ISA from m5.objects import Cache, L2XBar, BaseXBar, SystemXBar, BadAddr, Port from gem5.utils.override import * +from typing import Type class RISCVMatchedCacheHierarchy( @@ -50,7 +51,7 @@ class RISCVMatchedCacheHierarchy( """ A cache setup where each core has a private L1 Data and Instruction Cache, - and a private L2 cache. + and a shared L2 cache. The HiFive board has a partially inclusive cache hierarchy, hence this hierarchy is chosen. The details of the cache hierarchy are in Table 7, page 36 of the datasheet. 
@@ -74,9 +75,9 @@ class RISCVMatchedCacheHierarchy( AbstractClassicCacheHierarchy.__init__(self=self) AbstractTwoLevelCacheHierarchy.__init__( self, - l1i_size="32kB", + l1i_size="32KiB", l1i_assoc=4, - l1d_size="32kB", + l1d_size="32KiB", l1d_assoc=8, l2_size=l2_size, l2_assoc=16, @@ -108,16 +109,17 @@ class RISCVMatchedCacheHierarchy( for i in range(board.get_processor().get_num_cores()) ] self.l1dcaches = [ - L1DCache(size=self._l1d_size, assoc=self._l1d_assoc) - for i in range(board.get_processor().get_num_cores()) - ] - self.l2buses = [ - L2XBar() for i in range(board.get_processor().get_num_cores()) - ] - self.l2caches = [ - L2Cache(size=self._l2_size, assoc=self._l2_assoc) + L1DCache( + size=self._l1d_size, assoc=self._l1d_assoc, response_latency=10 + ) for i in range(board.get_processor().get_num_cores()) ] + self.l2bus = L2XBar() + + self.l2cache = L2Cache( + size=self._l2_size, assoc=self._l2_assoc, data_latency=20 + ) + # ITLB Page walk caches self.iptw_caches = [ MMUCache(size="4KiB") @@ -137,14 +139,10 @@ class RISCVMatchedCacheHierarchy( cpu.connect_icache(self.l1icaches[i].cpu_side) cpu.connect_dcache(self.l1dcaches[i].cpu_side) - self.l1icaches[i].mem_side = self.l2buses[i].cpu_side_ports - self.l1dcaches[i].mem_side = self.l2buses[i].cpu_side_ports - self.iptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports - self.dptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports - - self.l2buses[i].mem_side_ports = self.l2caches[i].cpu_side - - self.membus.cpu_side_ports = self.l2caches[i].mem_side + self.l1icaches[i].mem_side = self.l2bus.cpu_side_ports + self.l1dcaches[i].mem_side = self.l2bus.cpu_side_ports + self.iptw_caches[i].mem_side = self.l2bus.cpu_side_ports + self.dptw_caches[i].mem_side = self.l2bus.cpu_side_ports cpu.connect_walker_ports( self.iptw_caches[i].cpu_side, self.dptw_caches[i].cpu_side @@ -157,6 +155,9 @@ class RISCVMatchedCacheHierarchy( else: cpu.connect_interrupt() + self.l2bus.mem_side_ports = self.l2cache.cpu_side + 
self.membus.cpu_side_ports = self.l2cache.mem_side + def _setup_io_cache(self, board: AbstractBoard) -> None: """Create a cache for coherent I/O connections""" self.iocache = Cache( diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py index 0b4375ce8d..48291bf670 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py @@ -61,8 +61,14 @@ class U74PredFU(MinorDefaultPredFU): pass -class U74MemFU(MinorDefaultMemFU): - opLat = 3 +class U74MemReadFU(MinorDefaultMemFU): + opClasses = minorMakeOpClassSet(["MemRead", "FloatMemRead"]) + opLat = 2 + + +class U74MemWriteFU(MinorDefaultMemFU): + opClasses = minorMakeOpClassSet(["MemWrite", "FloatMemWrite"]) + opLat = 2 class U74MiscFU(MinorDefaultMiscFU): @@ -77,18 +83,24 @@ class U74FUPool(MinorFUPool): U74IntDivFU(), U74FloatSimdFU(), U74PredFU(), - U74MemFU(), + U74MemReadFU(), + U74MemWriteFU(), U74MiscFU(), ] class U74BP(TournamentBP): - BTBEntries = 16 - RASSize = 6 + BTBEntries = 32 + RASSize = 12 localHistoryTableSize = 4096 # is 3.6 KiB but gem5 requires power of 2 - + localPredictorSize = 16384 + globalPredictorSize = 16384 + choicePredictorSize = 16384 + localCtrBits = 4 + globalCtrBits = 4 + choiceCtrBits = 4 indirectBranchPred = SimpleIndirectPredictor() - indirectBranchPred.indirectSets = 8 + indirectBranchPred.indirectSets = 16 class U74CPU(RiscvMinorCPU): @@ -97,26 +109,49 @@ class U74CPU(RiscvMinorCPU): This information about the CPU can be found on page 15 of gem5_rsk_gem5-21.2.pdf at https://github.com/arm-university/arm-gem5-rsk - The only parameter that is changed is the decodeToExecuteForwardDelay. - This is changed from 1 to 2 to avoid a PMC address fault. + The parameters that are changed are: + - threadPolicy: + This is initialized to "SingleThreaded". + - decodeToExecuteForwardDelay: + This is changed from 1 to 2 to avoid a PMC address fault. 
+ - fetch1ToFetch2BackwardDelay: + This is changed from 1 to 0 to better match hardware performance. + - fetch2InputBufferSize: + This is changed from 2 to 1 to better match hardware performance. + - decodeInputBufferSize: + This is changed from 3 to 2 to better match hardware performance. + - decodeToExecuteForwardDelay: + This is reverted from 2 to 1 to better match hardware performance. + - executeInputBufferSize: + This is changed from 7 to 4 to better match hardware performance. + - executeMaxAccessesInMemory: + This is changed from 2 to 1 to better match hardware performance. + - executeLSQStoreBufferSize: + This is changed from 5 to 3 to better match hardware performance. + - executeBranchDelay: + This is changed from 1 to 2 to better match hardware performance. + - enableIdling: + This is changed to False to better match hardware performance. """ + threadPolicy = "SingleThreaded" + # Fetch1 stage fetch1LineSnapWidth = 0 fetch1LineWidth = 0 fetch1FetchLimit = 1 fetch1ToFetch2ForwardDelay = 1 - fetch1ToFetch2BackwardDelay = 1 + fetch1ToFetch2BackwardDelay = 0 # Fetch2 stage - fetch2InputBufferSize = 2 + fetch2InputBufferSize = 1 fetch2ToDecodeForwardDelay = 1 fetch2CycleInput = True # Decode stage - decodeInputBufferSize = 3 - decodeToExecuteForwardDelay = 2 + decodeInputBufferSize = 2 + decodeToExecuteForwardDelay = 1 decodeInputWidth = 2 decodeCycleInput = True @@ -127,17 +162,17 @@ class U74CPU(RiscvMinorCPU): executeMemoryIssueLimit = 1 executeCommitLimit = 2 executeMemoryCommitLimit = 1 - executeInputBufferSize = 7 - executeMaxAccessesInMemory = 2 + executeInputBufferSize = 4 + executeMaxAccessesInMemory = 1 executeLSQMaxStoreBufferStoresPerCycle = 2 executeLSQRequestsQueueSize = 1 executeLSQTransfersQueueSize = 2 - executeLSQStoreBufferSize = 5 - executeBranchDelay = 1 + executeLSQStoreBufferSize = 3 + executeBranchDelay = 2 executeSetTraceTimeOnCommit = True executeSetTraceTimeOnIssue = False executeAllowEarlyMemoryIssue = True - enableIdling = True + 
enableIdling = False # Functional Units and Branch Prediction executeFuncUnits = U74FUPool() @@ -152,13 +187,21 @@ class U74Core(BaseCPUCore): - IntFU: 1 cycle - IntMulFU: 3 cycles - IntDivFU: 6 cycles (NOTE: latency is variable, but is set to 6 cycles) - - MemFU: 3 cycles + - MemReadFU: 2 cycles + - MemWriteFU: 2 cycles The branch predictor is a TournamentBP, based on Section 4.2.5 on page 38. - - BTBEntries: 16 entries - - RASSize: 6 entries - - IndirectSets: 8 sets + - BTBEntries: 32 entries + - RASSize: 12 entries + - IndirectSets: 16 sets + - localPredictorSize: 16384 + - globalPredictorSize: 16384 + - choicePredictorSize: 16384 + - localCtrBits: 4 + - globalCtrBits: 4 + - choiceCtrBits: 4 - localHistoryTableSize: 4096 B - NOTE: The BHT of the HiFive Board is 3.6KiB but gem5 requires a power of 2, so the BHT is 4096B. + NOTE: The TournamentBP deviates from the actual BP. + This configuration performs the best in relation to the hardware. """ def __init__(