stdlib: Edit RISCVMatched Configuration

This patch changes the RISCVMatched Cache Hierarchy to
private L1 caches and a shared L2 cache.
It also changes the RISCVMatched Core's parameters to
better match hardware performance.
Also, sizes are changed to MiB or KiB instead of MB
or KB, to match the datasheet.
All the changes that deviate from the datasheet and the
ARM HPI CPU (reference for pipeline parameters)
are documented.

The core parameters that are changed are:
    - threadPolicy:
        This is initialized to "SingleThreaded".
    - decodeToExecuteForwardDelay:
        This was previously changed from 1 to 2 to avoid a PMC address fault;
        this patch sets it back to 1 (see the second entry for it below).
    - fetch1ToFetch2BackwardDelay:
        This is changed from 1 to 0 to better match hardware performance.
    - fetch2InputBufferSize:
        This is changed from 2 to 1 to better match hardware performance.
    - decodeInputBufferSize:
        This is changed from 3 to 2 to better match hardware performance.
    - decodeToExecuteForwardDelay:
        This is changed from 2 to 1 to better match hardware performance.
    - executeInputBufferSize:
        This is changed from 7 to 4 to better match hardware performance.
    - executeMaxAccessesInMemory:
        This is changed from 2 to 1 to better match hardware performance.
    - executeLSQStoreBufferSize:
        This is changed from 5 to 3 to better match hardware performance.
    - executeBranchDelay:
        This is changed from 1 to 2 to better match hardware performance.
    - enableIdling:
        This is changed to False to better match hardware performance.
    - MemFU: split into MemReadFU and MemWriteFU, each with a latency of
        2 cycles (MemFU was 3 cycles).

The changes in the branch predictor are:

      - BTBEntries:
        This is changed from 16 entries to 32 entries.
      - RASSize:
        This is changed from 6 entries to 12 entries.
      - IndirectSets:
        This is changed from 8 sets to 16 sets.
      - localPredictorSize:
        This is changed from 8192 to 16384.
      - globalPredictorSize:
        This is changed from 8192 to 16384.
      - choicePredictorSize:
        This is changed from 8192 to 16384.
      - localCtrBits:
        This is changed from 2 to 4.
      - globalCtrBits:
        This is changed from 2 to 4.
      - choiceCtrBits:
        This is changed from 2 to 4.

Change-Id: I4235140f33be6a3b529a819ae6a7223cb88bb7ab
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70798
Maintainer: Bobby Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
This commit is contained in:
KUNAL PAI
2023-05-19 14:38:48 -07:00
committed by Bobby Bruce
parent 5095e29c8e
commit e0a28b1a27
3 changed files with 88 additions and 44 deletions

View File

@@ -109,7 +109,7 @@ class RISCVMatchedBoard(
def __init__(
self,
clk_freq: str = "1.2GHz",
l2_size: str = "2MB",
l2_size: str = "2MiB",
is_fs: bool = False,
) -> None:
"""

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2022 The Regents of the University of California
# Copyright (c) 2023 The Regents of the University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,7 @@ from gem5.isas import ISA
from m5.objects import Cache, L2XBar, BaseXBar, SystemXBar, BadAddr, Port
from gem5.utils.override import *
from typing import Type
class RISCVMatchedCacheHierarchy(
@@ -50,7 +51,7 @@ class RISCVMatchedCacheHierarchy(
"""
A cache setup where each core has a private L1 Data and Instruction Cache,
and a private L2 cache.
and a shared L2 cache.
The HiFive board has a partially inclusive cache hierarchy, hence this hierarchy is chosen.
The details of the cache hierarchy are in Table 7, page 36 of the datasheet.
@@ -74,9 +75,9 @@ class RISCVMatchedCacheHierarchy(
AbstractClassicCacheHierarchy.__init__(self=self)
AbstractTwoLevelCacheHierarchy.__init__(
self,
l1i_size="32kB",
l1i_size="32KiB",
l1i_assoc=4,
l1d_size="32kB",
l1d_size="32KiB",
l1d_assoc=8,
l2_size=l2_size,
l2_assoc=16,
@@ -108,16 +109,17 @@ class RISCVMatchedCacheHierarchy(
for i in range(board.get_processor().get_num_cores())
]
self.l1dcaches = [
L1DCache(size=self._l1d_size, assoc=self._l1d_assoc)
for i in range(board.get_processor().get_num_cores())
]
self.l2buses = [
L2XBar() for i in range(board.get_processor().get_num_cores())
]
self.l2caches = [
L2Cache(size=self._l2_size, assoc=self._l2_assoc)
L1DCache(
size=self._l1d_size, assoc=self._l1d_assoc, response_latency=10
)
for i in range(board.get_processor().get_num_cores())
]
self.l2bus = L2XBar()
self.l2cache = L2Cache(
size=self._l2_size, assoc=self._l2_assoc, data_latency=20
)
# ITLB Page walk caches
self.iptw_caches = [
MMUCache(size="4KiB")
@@ -137,14 +139,10 @@ class RISCVMatchedCacheHierarchy(
cpu.connect_icache(self.l1icaches[i].cpu_side)
cpu.connect_dcache(self.l1dcaches[i].cpu_side)
self.l1icaches[i].mem_side = self.l2buses[i].cpu_side_ports
self.l1dcaches[i].mem_side = self.l2buses[i].cpu_side_ports
self.iptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports
self.dptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports
self.l2buses[i].mem_side_ports = self.l2caches[i].cpu_side
self.membus.cpu_side_ports = self.l2caches[i].mem_side
self.l1icaches[i].mem_side = self.l2bus.cpu_side_ports
self.l1dcaches[i].mem_side = self.l2bus.cpu_side_ports
self.iptw_caches[i].mem_side = self.l2bus.cpu_side_ports
self.dptw_caches[i].mem_side = self.l2bus.cpu_side_ports
cpu.connect_walker_ports(
self.iptw_caches[i].cpu_side, self.dptw_caches[i].cpu_side
@@ -157,6 +155,9 @@ class RISCVMatchedCacheHierarchy(
else:
cpu.connect_interrupt()
self.l2bus.mem_side_ports = self.l2cache.cpu_side
self.membus.cpu_side_ports = self.l2cache.mem_side
def _setup_io_cache(self, board: AbstractBoard) -> None:
"""Create a cache for coherent I/O connections"""
self.iocache = Cache(

View File

@@ -61,8 +61,14 @@ class U74PredFU(MinorDefaultPredFU):
pass
class U74MemFU(MinorDefaultMemFU):
opLat = 3
class U74MemReadFU(MinorDefaultMemFU):
opClasses = minorMakeOpClassSet(["MemRead", "FloatMemRead"])
opLat = 2
class U74MemWriteFU(MinorDefaultMemFU):
opClasses = minorMakeOpClassSet(["MemWrite", "FloatMemWrite"])
opLat = 2
class U74MiscFU(MinorDefaultMiscFU):
@@ -77,18 +83,24 @@ class U74FUPool(MinorFUPool):
U74IntDivFU(),
U74FloatSimdFU(),
U74PredFU(),
U74MemFU(),
U74MemReadFU(),
U74MemWriteFU(),
U74MiscFU(),
]
class U74BP(TournamentBP):
BTBEntries = 16
RASSize = 6
BTBEntries = 32
RASSize = 12
localHistoryTableSize = 4096 # is 3.6 KiB but gem5 requires power of 2
localPredictorSize = 16384
globalPredictorSize = 16384
choicePredictorSize = 16384
localCtrBits = 4
globalCtrBits = 4
choiceCtrBits = 4
indirectBranchPred = SimpleIndirectPredictor()
indirectBranchPred.indirectSets = 8
indirectBranchPred.indirectSets = 16
class U74CPU(RiscvMinorCPU):
@@ -97,26 +109,49 @@ class U74CPU(RiscvMinorCPU):
This information about the CPU can be found on page 15 of
gem5_rsk_gem5-21.2.pdf at https://github.com/arm-university/arm-gem5-rsk
The only parameter that is changed is the decodeToExecuteForwardDelay.
This is changed from 1 to 2 to avoid a PMC address fault.
The parameters that are changed are:
- threadPolicy:
This is initialized to "SingleThreaded".
- decodeToExecuteForwardDelay:
This is changed from 1 to 2 to avoid a PMC address fault.
- fetch1ToFetch2BackwardDelay:
This is changed from 1 to 0 to better match hardware performance.
- fetch2InputBufferSize:
This is changed from 2 to 1 to better match hardware performance.
- decodeInputBufferSize:
This is changed from 3 to 2 to better match hardware performance.
- decodeToExecuteForwardDelay:
This is changed from 2 to 1 to better match hardware performance.
- executeInputBufferSize:
This is changed from 7 to 4 to better match hardware performance.
- executeMaxAccessesInMemory:
This is changed from 2 to 1 to better match hardware performance.
- executeLSQStoreBufferSize:
This is changed from 5 to 3 to better match hardware performance.
- executeBranchDelay:
This is changed from 1 to 2 to better match hardware performance.
- enableIdling:
This is changed to False to better match hardware performance.
"""
threadPolicy = "SingleThreaded"
# Fetch1 stage
fetch1LineSnapWidth = 0
fetch1LineWidth = 0
fetch1FetchLimit = 1
fetch1ToFetch2ForwardDelay = 1
fetch1ToFetch2BackwardDelay = 1
fetch1ToFetch2BackwardDelay = 0
# Fetch2 stage
fetch2InputBufferSize = 2
fetch2InputBufferSize = 1
fetch2ToDecodeForwardDelay = 1
fetch2CycleInput = True
# Decode stage
decodeInputBufferSize = 3
decodeToExecuteForwardDelay = 2
decodeInputBufferSize = 2
decodeToExecuteForwardDelay = 1
decodeInputWidth = 2
decodeCycleInput = True
@@ -127,17 +162,17 @@ class U74CPU(RiscvMinorCPU):
executeMemoryIssueLimit = 1
executeCommitLimit = 2
executeMemoryCommitLimit = 1
executeInputBufferSize = 7
executeMaxAccessesInMemory = 2
executeInputBufferSize = 4
executeMaxAccessesInMemory = 1
executeLSQMaxStoreBufferStoresPerCycle = 2
executeLSQRequestsQueueSize = 1
executeLSQTransfersQueueSize = 2
executeLSQStoreBufferSize = 5
executeBranchDelay = 1
executeLSQStoreBufferSize = 3
executeBranchDelay = 2
executeSetTraceTimeOnCommit = True
executeSetTraceTimeOnIssue = False
executeAllowEarlyMemoryIssue = True
enableIdling = True
enableIdling = False
# Functional Units and Branch Prediction
executeFuncUnits = U74FUPool()
@@ -152,13 +187,21 @@ class U74Core(BaseCPUCore):
- IntFU: 1 cycle
- IntMulFU: 3 cycles
- IntDivFU: 6 cycles (NOTE: latency is variable, but is set to 6 cycles)
- MemFU: 3 cycles
- MemReadFU: 2 cycles
- MemWriteFU: 2 cycles
The branch predictor is a TournamentBP, based on Section 4.2.5 on page 38.
- BTBEntries: 16 entries
- RASSize: 6 entries
- IndirectSets: 8 sets
- BTBEntries: 32 entries
- RASSize: 12 entries
- IndirectSets: 16 sets
- localPredictorSize: 16384
- globalPredictorSize: 16384
- choicePredictorSize: 16384
- localCtrBits: 4
- globalCtrBits: 4
- choiceCtrBits: 4
- localHistoryTableSize: 4096 B
NOTE: The BHT of the HiFive Board is 3.6KiB but gem5 requires a power of 2, so the BHT is 4096B.
NOTE: The TournamentBP deviates from the actual BP.
This configuration performs the best in relation to the hardware.
"""
def __init__(