From e0a28b1a27a5cd79eca7649ad7cb5ec8cce85b08 Mon Sep 17 00:00:00 2001 From: KUNAL PAI Date: Fri, 19 May 2023 14:38:48 -0700 Subject: [PATCH] stdlib: Edit RISCVMatched Configuration This patch changes the RISCVMatched Cache Hierarchy to private L1 shared L2. It also changes the RISCVMatched Core's parameters to better match hardware performance. Also, sizes are changed to MiB or KiB instead of MB or KB, to match the datasheet. All the changes that deviate from the datasheet and the ARM HPI CPU (reference for pipeline parameters) are documented. The core parameters that are changed are: - threadPolicy: This is initialized to "SingleThreaded". - decodeToExecuteForwardDelay: This was previously changed from 1 to 2 to avoid a PMC address fault (this patch sets it back to 1, see below). - fetch1ToFetch2BackwardDelay: This is changed from 1 to 0 to better match hardware performance. - fetch2InputBufferSize: This is changed from 2 to 1 to better match hardware performance. - decodeInputBufferSize: This is changed from 3 to 2 to better match hardware performance. - decodeToExecuteForwardDelay: This is changed from 2 back to 1 to better match hardware performance. - executeInputBufferSize: This is changed from 7 to 4 to better match hardware performance. - executeMaxAccessesInMemory: This is changed from 2 to 1 to better match hardware performance. - executeLSQStoreBufferSize: This is changed from 5 to 3 to better match hardware performance. - executeBranchDelay: This is changed from 1 to 2 to better match hardware performance. - enableIdling: This is changed to False to better match hardware performance. - MemFU: split into MemReadFU and MemWriteFU, each changed to 2 cycles from 3 cycles. The changes in the branch predictor are: - BTBEntries: This is changed from 16 entries to 32 entries. - RASSize: This is changed from 6 entries to 12 entries. - IndirectSets: This is changed from 8 sets to 16 sets. - localPredictorSize: This is changed from 8192 to 16384. - globalPredictorSize: This is changed from 8192 to 16384. 
- choicePredictorSize: This is changed from 8192 to 16384. - localCtrBits: This is changed from 2 to 4. - globalCtrBits: This is changed from 2 to 4. - choiceCtrBits: This is changed from 2 to 4. Change-Id: I4235140f33be6a3b529a819ae6a7223cb88bb7ab Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70798 Maintainer: Bobby Bruce Tested-by: kokoro Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power --- .../riscvmatched/riscvmatched_board.py | 2 +- .../riscvmatched/riscvmatched_cache.py | 41 ++++----- .../riscvmatched/riscvmatched_core.py | 89 ++++++++++++++----- 3 files changed, 88 insertions(+), 44 deletions(-) diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py index ae483cc401..9ca95839f8 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py @@ -109,7 +109,7 @@ class RISCVMatchedBoard( def __init__( self, clk_freq: str = "1.2GHz", - l2_size: str = "2MB", + l2_size: str = "2MiB", is_fs: bool = False, ) -> None: """ diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py index dc66af354b..25e55ef310 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 The Regents of the University of California +# Copyright (c) 2023 The Regents of the University of California # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -42,6 +42,7 @@ from gem5.isas import ISA from m5.objects import Cache, L2XBar, BaseXBar, SystemXBar, BadAddr, Port from gem5.utils.override import * +from typing import Type class RISCVMatchedCacheHierarchy( @@ -50,7 +51,7 @@ class RISCVMatchedCacheHierarchy( """ A cache setup where each core has a private L1 Data and Instruction Cache, - and a private L2 cache. + and a shared L2 cache. The HiFive board has a partially inclusive cache hierarchy, hence this hierarchy is chosen. The details of the cache hierarchy are in Table 7, page 36 of the datasheet. @@ -74,9 +75,9 @@ class RISCVMatchedCacheHierarchy( AbstractClassicCacheHierarchy.__init__(self=self) AbstractTwoLevelCacheHierarchy.__init__( self, - l1i_size="32kB", + l1i_size="32KiB", l1i_assoc=4, - l1d_size="32kB", + l1d_size="32KiB", l1d_assoc=8, l2_size=l2_size, l2_assoc=16, @@ -108,16 +109,17 @@ class RISCVMatchedCacheHierarchy( for i in range(board.get_processor().get_num_cores()) ] self.l1dcaches = [ - L1DCache(size=self._l1d_size, assoc=self._l1d_assoc) - for i in range(board.get_processor().get_num_cores()) - ] - self.l2buses = [ - L2XBar() for i in range(board.get_processor().get_num_cores()) - ] - self.l2caches = [ - L2Cache(size=self._l2_size, assoc=self._l2_assoc) + L1DCache( + size=self._l1d_size, assoc=self._l1d_assoc, response_latency=10 + ) for i in range(board.get_processor().get_num_cores()) ] + self.l2bus = L2XBar() + + self.l2cache = L2Cache( + size=self._l2_size, assoc=self._l2_assoc, data_latency=20 + ) + # ITLB Page walk caches self.iptw_caches = [ MMUCache(size="4KiB") @@ -137,14 +139,10 @@ class RISCVMatchedCacheHierarchy( cpu.connect_icache(self.l1icaches[i].cpu_side) cpu.connect_dcache(self.l1dcaches[i].cpu_side) - self.l1icaches[i].mem_side = self.l2buses[i].cpu_side_ports - self.l1dcaches[i].mem_side = self.l2buses[i].cpu_side_ports - self.iptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports - 
self.dptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports - - self.l2buses[i].mem_side_ports = self.l2caches[i].cpu_side - - self.membus.cpu_side_ports = self.l2caches[i].mem_side + self.l1icaches[i].mem_side = self.l2bus.cpu_side_ports + self.l1dcaches[i].mem_side = self.l2bus.cpu_side_ports + self.iptw_caches[i].mem_side = self.l2bus.cpu_side_ports + self.dptw_caches[i].mem_side = self.l2bus.cpu_side_ports cpu.connect_walker_ports( self.iptw_caches[i].cpu_side, self.dptw_caches[i].cpu_side @@ -157,6 +155,9 @@ class RISCVMatchedCacheHierarchy( else: cpu.connect_interrupt() + self.l2bus.mem_side_ports = self.l2cache.cpu_side + self.membus.cpu_side_ports = self.l2cache.mem_side + def _setup_io_cache(self, board: AbstractBoard) -> None: """Create a cache for coherent I/O connections""" self.iocache = Cache( diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py index 0b4375ce8d..48291bf670 100644 --- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py +++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py @@ -61,8 +61,14 @@ class U74PredFU(MinorDefaultPredFU): pass -class U74MemFU(MinorDefaultMemFU): - opLat = 3 +class U74MemReadFU(MinorDefaultMemFU): + opClasses = minorMakeOpClassSet(["MemRead", "FloatMemRead"]) + opLat = 2 + + +class U74MemWriteFU(MinorDefaultMemFU): + opClasses = minorMakeOpClassSet(["MemWrite", "FloatMemWrite"]) + opLat = 2 class U74MiscFU(MinorDefaultMiscFU): @@ -77,18 +83,24 @@ class U74FUPool(MinorFUPool): U74IntDivFU(), U74FloatSimdFU(), U74PredFU(), - U74MemFU(), + U74MemReadFU(), + U74MemWriteFU(), U74MiscFU(), ] class U74BP(TournamentBP): - BTBEntries = 16 - RASSize = 6 + BTBEntries = 32 + RASSize = 12 localHistoryTableSize = 4096 # is 3.6 KiB but gem5 requires power of 2 - + localPredictorSize = 16384 + globalPredictorSize = 16384 + choicePredictorSize = 16384 + localCtrBits = 4 + globalCtrBits = 4 + choiceCtrBits = 4 indirectBranchPred = 
SimpleIndirectPredictor() - indirectBranchPred.indirectSets = 8 + indirectBranchPred.indirectSets = 16 class U74CPU(RiscvMinorCPU): @@ -97,26 +109,49 @@ class U74CPU(RiscvMinorCPU): This information about the CPU can be found on page 15 of gem5_rsk_gem5-21.2.pdf at https://github.com/arm-university/arm-gem5-rsk - The only parameter that is changed is the decodeToExecuteForwardDelay. - This is changed from 1 to 2 to avoid a PMC address fault. + The parameters that are changed are: + - threadPolicy: + This is initialized to "SingleThreaded". + - decodeToExecuteForwardDelay: + This is changed from 1 to 2 to avoid a PMC address fault. + - fetch1ToFetch2BackwardDelay: + This is changed from 1 to 0 to better match hardware performance. + - fetch2InputBufferSize: + This is changed from 2 to 1 to better match hardware performance. + - decodeInputBufferSize: + This is changed from 3 to 2 to better match hardware performance. + - decodeToExecuteForwardDelay: + This is changed from 2 to 1 to better match hardware performance. + - executeInputBufferSize: + This is changed from 7 to 4 to better match hardware performance. + - executeMaxAccessesInMemory: + This is changed from 2 to 1 to better match hardware performance. + - executeLSQStoreBufferSize: + This is changed from 5 to 3 to better match hardware performance. + - executeBranchDelay: + This is changed from 1 to 2 to better match hardware performance. + - enableIdling: + This is changed to False to better match hardware performance. 
""" + threadPolicy = "SingleThreaded" + # Fetch1 stage fetch1LineSnapWidth = 0 fetch1LineWidth = 0 fetch1FetchLimit = 1 fetch1ToFetch2ForwardDelay = 1 - fetch1ToFetch2BackwardDelay = 1 + fetch1ToFetch2BackwardDelay = 0 # Fetch2 stage - fetch2InputBufferSize = 2 + fetch2InputBufferSize = 1 fetch2ToDecodeForwardDelay = 1 fetch2CycleInput = True # Decode stage - decodeInputBufferSize = 3 - decodeToExecuteForwardDelay = 2 + decodeInputBufferSize = 2 + decodeToExecuteForwardDelay = 1 decodeInputWidth = 2 decodeCycleInput = True @@ -127,17 +162,17 @@ class U74CPU(RiscvMinorCPU): executeMemoryIssueLimit = 1 executeCommitLimit = 2 executeMemoryCommitLimit = 1 - executeInputBufferSize = 7 - executeMaxAccessesInMemory = 2 + executeInputBufferSize = 4 + executeMaxAccessesInMemory = 1 executeLSQMaxStoreBufferStoresPerCycle = 2 executeLSQRequestsQueueSize = 1 executeLSQTransfersQueueSize = 2 - executeLSQStoreBufferSize = 5 - executeBranchDelay = 1 + executeLSQStoreBufferSize = 3 + executeBranchDelay = 2 executeSetTraceTimeOnCommit = True executeSetTraceTimeOnIssue = False executeAllowEarlyMemoryIssue = True - enableIdling = True + enableIdling = False # Functional Units and Branch Prediction executeFuncUnits = U74FUPool() @@ -152,13 +187,21 @@ class U74Core(BaseCPUCore): - IntFU: 1 cycle - IntMulFU: 3 cycles - IntDivFU: 6 cycles (NOTE: latency is variable, but is set to 6 cycles) - - MemFU: 3 cycles + - MemReadFU: 2 cycles + - MemWriteFU: 2 cycles The branch predictor is a TournamentBP, based on Section 4.2.5 on page 38. - - BTBEntries: 16 entries - - RASSize: 6 entries - - IndirectSets: 8 sets + - BTBEntries: 32 entries + - RASSize: 12 entries + - IndirectSets: 16 sets + - localPredictorSize: 16384 + - globalPredictorSize: 16384 + - choicePredictorSize: 16384 + - localCtrBits: 4 + - globalCtrBits: 4 + - choiceCtrBits: 4 - localHistoryTableSize: 4096 B - NOTE: The BHT of the HiFive Board is 3.6KiB but gem5 requires a power of 2, so the BHT is 4096B. 
+ NOTE: The TournamentBP deviates from the actual BP. + This configuration performs the best in relation to the hardware. """ def __init__(