stdlib: Edit RISCVMatched Configuration

This patch changes the RISCVMatched cache hierarchy to private L1 caches with a shared L2 cache. It also changes the RISCVMatched core's parameters to better match hardware performance. In addition, sizes are now written in MiB or KiB instead of MB or KB, to match the datasheet. All the changes that deviate from the datasheet and from the ARM HPI CPU (the reference for the pipeline parameters) are documented. The core parameters that are changed are: - threadPolicy: This is initialized to "SingleThreaded". - decodeToExecuteForwardDelay: This was previously changed from 1 to 2 to avoid a PMC address fault. - fetch1ToFetch2BackwardDelay: This is changed from 1 to 0 to better match hardware performance. - fetch2InputBufferSize: This is changed from 2 to 1 to better match hardware performance. - decodeInputBufferSize: This is changed from 3 to 2 to better match hardware performance. - decodeToExecuteForwardDelay: This patch changes it back from 2 to 1 to better match hardware performance. - executeInputBufferSize: This is changed from 7 to 4 to better match hardware performance. - executeMaxAccessesInMemory: This is changed from 2 to 1 to better match hardware performance. - executeLSQStoreBufferSize: This is changed from 5 to 3 to better match hardware performance. - executeBranchDelay: This is changed from 1 to 2 to better match hardware performance. - enableIdling: This is changed to False to better match hardware performance. - MemReadFU: This is changed from 3 cycles to 2 cycles. The changes in the branch predictor are: - BTBEntries: This is changed from 16 entries to 32 entries. - RASSize: This is changed from 6 entries to 12 entries. - IndirectSets: This is changed from 8 sets to 16 sets. - localPredictorSize: This is changed from 8192 to 16384. - globalPredictorSize: This is changed from 8192 to 16384. - choicePredictorSize: This is changed from 8192 to 16384. - localCtrBits: This is changed from 2 to 4. - globalCtrBits: This is changed from 2 to 4. - choiceCtrBits: This is changed from 2 to 4. 
Change-Id: I4235140f33be6a3b529a819ae6a7223cb88bb7ab Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70798 Maintainer: Bobby Bruce <bbruce@ucdavis.edu> Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Jason Lowe-Power <power.jg@gmail.com>
2023-05-19 14:38:48 -07:00
parent 5095e29c8e
commit e0a28b1a27
3 changed files with 88 additions and 44 deletions
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
@@ -109,7 +109,7 @@ class RISCVMatchedBoard(
    def __init__(
        self,
        clk_freq: str = "1.2GHz",
-        l2_size: str = "2MB",
+        l2_size: str = "2MiB",
        is_fs: bool = False,
    ) -> None:
        """
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 The Regents of the University of California
+# Copyright (c) 2023 The Regents of the University of California
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,7 @@ from gem5.isas import ISA
 from m5.objects import Cache, L2XBar, BaseXBar, SystemXBar, BadAddr, Port

 from gem5.utils.override import *
+from typing import Type


 class RISCVMatchedCacheHierarchy(
@@ -50,7 +51,7 @@ class RISCVMatchedCacheHierarchy(
    """

    A cache setup where each core has a private L1 Data and Instruction Cache,
-    and a private L2 cache.
+    and a shared L2 cache.
    The HiFive board has a partially inclusive cache hierarchy, hence this hierarchy is chosen.
    The details of the cache hierarchy are in Table 7, page 36 of the datasheet.

@@ -74,9 +75,9 @@ class RISCVMatchedCacheHierarchy(
        AbstractClassicCacheHierarchy.__init__(self=self)
        AbstractTwoLevelCacheHierarchy.__init__(
            self,
-            l1i_size="32kB",
+            l1i_size="32KiB",
            l1i_assoc=4,
-            l1d_size="32kB",
+            l1d_size="32KiB",
            l1d_assoc=8,
            l2_size=l2_size,
            l2_assoc=16,
@@ -108,16 +109,17 @@ class RISCVMatchedCacheHierarchy(
            for i in range(board.get_processor().get_num_cores())
        ]
        self.l1dcaches = [
-            L1DCache(size=self._l1d_size, assoc=self._l1d_assoc)
-            for i in range(board.get_processor().get_num_cores())
-        ]
-        self.l2buses = [
-            L2XBar() for i in range(board.get_processor().get_num_cores())
-        ]
-        self.l2caches = [
-            L2Cache(size=self._l2_size, assoc=self._l2_assoc)
+            L1DCache(
+                size=self._l1d_size, assoc=self._l1d_assoc, response_latency=10
+            )
            for i in range(board.get_processor().get_num_cores())
        ]
+        self.l2bus = L2XBar()
+
+        self.l2cache = L2Cache(
+            size=self._l2_size, assoc=self._l2_assoc, data_latency=20
+        )
+
        # ITLB Page walk caches
        self.iptw_caches = [
            MMUCache(size="4KiB")
@@ -137,14 +139,10 @@ class RISCVMatchedCacheHierarchy(
            cpu.connect_icache(self.l1icaches[i].cpu_side)
            cpu.connect_dcache(self.l1dcaches[i].cpu_side)

-            self.l1icaches[i].mem_side = self.l2buses[i].cpu_side_ports
-            self.l1dcaches[i].mem_side = self.l2buses[i].cpu_side_ports
-            self.iptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports
-            self.dptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports
-
-            self.l2buses[i].mem_side_ports = self.l2caches[i].cpu_side
-
-            self.membus.cpu_side_ports = self.l2caches[i].mem_side
+            self.l1icaches[i].mem_side = self.l2bus.cpu_side_ports
+            self.l1dcaches[i].mem_side = self.l2bus.cpu_side_ports
+            self.iptw_caches[i].mem_side = self.l2bus.cpu_side_ports
+            self.dptw_caches[i].mem_side = self.l2bus.cpu_side_ports

            cpu.connect_walker_ports(
                self.iptw_caches[i].cpu_side, self.dptw_caches[i].cpu_side
@@ -157,6 +155,9 @@ class RISCVMatchedCacheHierarchy(
            else:
                cpu.connect_interrupt()

+        self.l2bus.mem_side_ports = self.l2cache.cpu_side
+        self.membus.cpu_side_ports = self.l2cache.mem_side
+
    def _setup_io_cache(self, board: AbstractBoard) -> None:
        """Create a cache for coherent I/O connections"""
        self.iocache = Cache(
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py
@@ -61,8 +61,14 @@ class U74PredFU(MinorDefaultPredFU):
    pass


-class U74MemFU(MinorDefaultMemFU):
-    opLat = 3
+class U74MemReadFU(MinorDefaultMemFU):
+    opClasses = minorMakeOpClassSet(["MemRead", "FloatMemRead"])
+    opLat = 2
+
+
+class U74MemWriteFU(MinorDefaultMemFU):
+    opClasses = minorMakeOpClassSet(["MemWrite", "FloatMemWrite"])
+    opLat = 2


 class U74MiscFU(MinorDefaultMiscFU):
@@ -77,18 +83,24 @@ class U74FUPool(MinorFUPool):
        U74IntDivFU(),
        U74FloatSimdFU(),
        U74PredFU(),
-        U74MemFU(),
+        U74MemReadFU(),
+        U74MemWriteFU(),
        U74MiscFU(),
    ]


 class U74BP(TournamentBP):
-    BTBEntries = 16
-    RASSize = 6
+    BTBEntries = 32
+    RASSize = 12
    localHistoryTableSize = 4096  # is 3.6 KiB but gem5 requires power of 2
-
+    localPredictorSize = 16384
+    globalPredictorSize = 16384
+    choicePredictorSize = 16384
+    localCtrBits = 4
+    globalCtrBits = 4
+    choiceCtrBits = 4
    indirectBranchPred = SimpleIndirectPredictor()
-    indirectBranchPred.indirectSets = 8
+    indirectBranchPred.indirectSets = 16


 class U74CPU(RiscvMinorCPU):
@@ -97,26 +109,49 @@ class U74CPU(RiscvMinorCPU):
    This information about the CPU can be found on page 15 of
    gem5_rsk_gem5-21.2.pdf at https://github.com/arm-university/arm-gem5-rsk

-    The only parameter that is changed is the decodeToExecuteForwardDelay.
-    This is changed from 1 to 2 to avoid a PMC address fault.
+    The parameters that are changed are:
+    - threadPolicy:
+        This is initialized to "SingleThreaded".
+    - decodeToExecuteForwardDelay:
+        This is changed from 1 to 2 to avoid a PMC address fault.
+    - fetch1ToFetch2BackwardDelay:
+        This is changed from 1 to 0 to better match hardware performance.
+    - fetch2InputBufferSize:
+        This is changed from 2 to 1 to better match hardware performance.
+    - decodeInputBufferSize:
+        This is changed from 3 to 2 to better match hardware performance.
+    - decodeToExecuteForwardDelay:
+        This is changed from 2 to 1 to better match hardware performance.
+    - executeInputBufferSize:
+        This is changed from 7 to 4 to better match hardware performance.
+    - executeMaxAccessesInMemory:
+        This is changed from 2 to 1 to better match hardware performance.
+    - executeLSQStoreBufferSize:
+        This is changed from 5 to 3 to better match hardware performance.
+    - executeBranchDelay:
+        This is changed from 1 to 2 to better match hardware performance.
+    - enableIdling:
+        This is changed to False to better match hardware performance.

    """

+    threadPolicy = "SingleThreaded"
+
    # Fetch1 stage
    fetch1LineSnapWidth = 0
    fetch1LineWidth = 0
    fetch1FetchLimit = 1
    fetch1ToFetch2ForwardDelay = 1
-    fetch1ToFetch2BackwardDelay = 1
+    fetch1ToFetch2BackwardDelay = 0

    # Fetch2 stage
-    fetch2InputBufferSize = 2
+    fetch2InputBufferSize = 1
    fetch2ToDecodeForwardDelay = 1
    fetch2CycleInput = True

    # Decode stage
-    decodeInputBufferSize = 3
-    decodeToExecuteForwardDelay = 2
+    decodeInputBufferSize = 2
+    decodeToExecuteForwardDelay = 1
    decodeInputWidth = 2
    decodeCycleInput = True

@@ -127,17 +162,17 @@ class U74CPU(RiscvMinorCPU):
    executeMemoryIssueLimit = 1
    executeCommitLimit = 2
    executeMemoryCommitLimit = 1
-    executeInputBufferSize = 7
-    executeMaxAccessesInMemory = 2
+    executeInputBufferSize = 4
+    executeMaxAccessesInMemory = 1
    executeLSQMaxStoreBufferStoresPerCycle = 2
    executeLSQRequestsQueueSize = 1
    executeLSQTransfersQueueSize = 2
-    executeLSQStoreBufferSize = 5
-    executeBranchDelay = 1
+    executeLSQStoreBufferSize = 3
+    executeBranchDelay = 2
    executeSetTraceTimeOnCommit = True
    executeSetTraceTimeOnIssue = False
    executeAllowEarlyMemoryIssue = True
-    enableIdling = True
+    enableIdling = False

    # Functional Units and Branch Prediction
    executeFuncUnits = U74FUPool()
@@ -152,13 +187,21 @@ class U74Core(BaseCPUCore):
      - IntFU: 1 cycle
      - IntMulFU: 3 cycles
      - IntDivFU: 6 cycles (NOTE: latency is variable, but is set to 6 cycles)
-      - MemFU: 3 cycles
+      - MemReadFU: 2 cycles
+      - MemWriteFU: 2 cycles
    The branch predictor is a TournamentBP, based on Section 4.2.5 on page 38.
-      - BTBEntries: 16 entries
-      - RASSize: 6 entries
-      - IndirectSets: 8 sets
+      - BTBEntries: 32 entries
+      - RASSize: 12 entries
+      - IndirectSets: 16 sets
+      - localPredictorSize: 16384
+      - globalPredictorSize: 16384
+      - choicePredictorSize: 16384
+      - localCtrBits: 4
+      - globalCtrBits: 4
+      - choiceCtrBits: 4
      - localHistoryTableSize: 4096 B
-    NOTE: The BHT of the HiFive Board is 3.6KiB but gem5 requires a power of 2, so the BHT is 4096B.
+    NOTE: The TournamentBP deviates from the actual BP.
+    This configuration performs the best in relation to the hardware.
    """

    def __init__(