misc: Merge branch 'release-staging-v21-0' into develop

Change-Id: I0ad043ded56fb848e045057a1e7a56ea39797906
This commit is contained in:
Bobby R. Bruce
2021-03-18 11:13:14 -07:00
49 changed files with 9819 additions and 226 deletions

View File

@@ -1,3 +1,16 @@
# Version 20.1.0.5
**[HOTFIX]** This hotfix release fixes three known bugs:
* `src/python/m5/util/convert.py` incorrectly stated kibibytes as 'kiB' instead of 'KiB'. This has been fixed.
* Atomic accesses were not checking the access permission bits in the page table descriptors. They were incorrectly using the nature of the request itself. This is now fixed.
* `num_l2caches_per_cluster` and `num_cpus_per_cluster` were cast to floats in `configs/ruby/MESI_Three_Level_HTM.py`, which caused errors. This has been fixed so they are correctly cast to integers.
# Version 20.1.0.4
**[HOTFIX]** [gem5 was failing to build with SCons 4.0.1 and 4.1.0](https://gem5.atlassian.net/browse/GEM5-916).
This hotfix makes the necessary changes to `site_scons/site_tools/default.py` for gem5 to compile successfully on these versions of SCons.
# Version 20.1.0.3
**[HOTFIX]** A patch was applied to fix an [error where booting Linux stalled when using the ARM ISA](https://gem5.atlassian.net/browse/GEM5-901).

View File

@@ -760,6 +760,9 @@ protocol_dirs = []
Export('protocol_dirs')
slicc_includes = []
Export('slicc_includes')
# list of protocols that require the partial functional read interface
need_partial_func_reads = []
Export('need_partial_func_reads')
# Walk the tree and execute all SConsopts scripts that will add to the
# above variables

View File

@@ -0,0 +1,70 @@
# Copyright (c) 2021 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# 2x4 mesh definition
#
# 0 --- 1 --- 2 --- 3
# | | | |
# 4 --- 5 --- 6 --- 7
#
mesh:
num_rows : 2
num_cols : 4
router_latency : 1
link_latency : 1
# Bindings for each CHI node type.
CHI_RNF:
# Uncomment to map num_nodes_per_router RNFs in each provided router,
# assuming num. created CHI_RNFs == len(router_list)*num_nodes_per_router
# num_nodes_per_router: 1
router_list: [1, 2, 5, 6]
CHI_HNF:
# num_nodes_per_router: 1
router_list: [1, 2, 5, 6]
CHI_SNF_MainMem:
# num_nodes_per_router: 1
router_list: [0, 4]
# Applies to CHI_SNF_BootMem and possibly other non-main memories
CHI_SNF_IO:
router_list: [3]
# Applies to CHI_RNI_DMA and CHI_RNI_IO
CHI_RNI_IO:
router_list: [7]

840
configs/ruby/CHI.py Normal file
View File

@@ -0,0 +1,840 @@
# Copyright (c) 2021 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import yaml
import m5
from m5.objects import *
from m5.defines import buildEnv
from .Ruby import create_topology, setup_memory_controllers
def define_options(parser):
    """Register the CHI-specific command-line options on 'parser'."""
    parser.add_option("--noc-config", action="store", type="string",
                      default=None,
                      help="YAML NoC config. parameters and bindings. "
                           "required for CustomMesh topology")
class Versions:
    '''
    Helper class to obtain unique ids for a given controller class.
    These are passed as the 'version' parameter when creating the controller.
    '''

    # Global counter for sequencer ids
    _seqs = 0

    @classmethod
    def getSeqId(cls):
        # Hand out the next sequencer id and advance the global counter
        seq_id = cls._seqs
        cls._seqs = seq_id + 1
        return seq_id

    # Per-controller-type counters, keyed by the controller class
    _version = {}

    @classmethod
    def getVersion(cls, tp):
        # Hand out the next version number for controller type 'tp',
        # starting from 0 the first time the type is seen
        ver = cls._version.setdefault(tp, 0)
        cls._version[tp] = ver + 1
        return ver
class CHI_Node(SubSystem):
    '''
    Base class with common functions for setting up Cache or Memory
    controllers that are part of a CHI RNF, RNFI, HNF, or SNF nodes.
    Notice getNetworkSideControllers and getAllControllers must be implemented
    in the derived classes.
    '''

    def __init__(self, ruby_system):
        super(CHI_Node, self).__init__()
        self._ruby_system = ruby_system
        self._network = ruby_system.network

    def getNetworkSideControllers(self):
        '''
        Returns all ruby controllers that need to be connected to the
        network
        '''
        raise NotImplementedError()

    def getAllControllers(self):
        '''
        Returns all ruby controllers associated with this node
        '''
        raise NotImplementedError()

    def setDownstream(self, cntrls):
        '''
        Sets cntrls as the downstream list of all controllers in this node
        '''
        for ctrl in self.getNetworkSideControllers():
            ctrl.downstream_destinations = cntrls

    def connectController(self, cntrl):
        '''
        Creates and configures the messages buffers for the CHI input/output
        ports that connect to the network
        '''
        # One message buffer per CHI channel, in each direction
        for chn in ('req', 'rsp', 'snp', 'dat'):
            setattr(cntrl, chn + 'Out', MessageBuffer())
            setattr(cntrl, chn + 'In', MessageBuffer())

        # All CHI ports are always connected to the network.
        # Controllers that are not part of the getNetworkSideControllers list
        # still communicate using internal routers, thus we need to wire-up
        # the ports
        for chn in ('req', 'rsp', 'snp', 'dat'):
            getattr(cntrl, chn + 'Out').out_port = self._network.in_port
            getattr(cntrl, chn + 'In').in_port = self._network.out_port
class TriggerMessageBuffer(MessageBuffer):
    '''
    MessageBuffer for triggering internal controller events.
    These buffers should not be affected by the Ruby tester randomization
    and allow popping messages enqueued in the same cycle.
    '''
    randomization = 'disabled'
    allow_zero_latency = True
class OrderedTriggerMessageBuffer(TriggerMessageBuffer):
    '''Trigger buffer variant that keeps messages in FIFO order.'''
    ordered = True
class CHI_Cache_Controller(Cache_Controller):
    '''
    Default parameters for a Cache controller
    The Cache_Controller can also be used as a DMA requester or as
    a pure directory if all cache allocation policies are disabled.
    '''

    def __init__(self, ruby_system):
        super(CHI_Cache_Controller, self).__init__(
            version = Versions.getVersion(Cache_Controller),
            ruby_system = ruby_system,
            mandatoryQueue = MessageBuffer(),
            prefetchQueue = MessageBuffer(),
            triggerQueue = TriggerMessageBuffer(),
            retryTriggerQueue = OrderedTriggerMessageBuffer(),
            replTriggerQueue = OrderedTriggerMessageBuffer(),
            reqRdy = TriggerMessageBuffer(),
            snpRdy = TriggerMessageBuffer())
        # Set a somewhat large number since we rely a lot on internal
        # triggers. To limit the controller performance, tweak other
        # params such as: input port buffer size, cache banks, and output
        # port latency
        self.transitions_per_cycle = 128
        # This should be set to true in the data cache controller to enable
        # timeouts on unique lines when a store conditional fails
        self.sc_lock_enabled = False
class CHI_L1Controller(CHI_Cache_Controller):
    '''
    Default parameters for a L1 Cache controller
    '''

    def __init__(self, ruby_system, sequencer, cache, prefetcher):
        super(CHI_L1Controller, self).__init__(ruby_system)
        self.sequencer = sequencer
        self.cache = cache
        # Prefetcher support is a placeholder for now (see CHI_RNF)
        self.use_prefetcher = False
        self.send_evictions = True
        self.is_HN = False
        self.enable_DMT = False
        self.enable_DCT = False
        # Strict inclusive MOESI
        self.allow_SD = True
        self.alloc_on_seq_acc = True
        self.alloc_on_seq_line_write = False
        self.alloc_on_readshared = True
        self.alloc_on_readunique = True
        self.alloc_on_readonce = True
        self.alloc_on_writeback = True
        self.dealloc_on_unique = False
        self.dealloc_on_shared = False
        self.dealloc_backinv_unique = True
        self.dealloc_backinv_shared = True
        # Some reasonable default TBE params
        self.number_of_TBEs = 16
        self.number_of_repl_TBEs = 16
        self.number_of_snoop_TBEs = 4
        self.unify_repl_TBEs = False
class CHI_L2Controller(CHI_Cache_Controller):
    '''
    Default parameters for a L2 Cache controller
    '''

    def __init__(self, ruby_system, cache, prefetcher):
        super(CHI_L2Controller, self).__init__(ruby_system)
        # L2 has no sequencer of its own; requests come from the L1s
        self.sequencer = NULL
        self.cache = cache
        self.use_prefetcher = False
        self.allow_SD = True
        self.is_HN = False
        self.enable_DMT = False
        self.enable_DCT = False
        self.send_evictions = False
        # Strict inclusive MOESI
        self.alloc_on_seq_acc = False
        self.alloc_on_seq_line_write = False
        self.alloc_on_readshared = True
        self.alloc_on_readunique = True
        self.alloc_on_readonce = True
        self.alloc_on_writeback = True
        self.dealloc_on_unique = False
        self.dealloc_on_shared = False
        self.dealloc_backinv_unique = True
        self.dealloc_backinv_shared = True
        # Some reasonable default TBE params
        self.number_of_TBEs = 32
        self.number_of_repl_TBEs = 32
        self.number_of_snoop_TBEs = 16
        self.unify_repl_TBEs = False
class CHI_HNFController(CHI_Cache_Controller):
    '''
    Default parameters for a coherent home node (HNF) cache controller
    '''

    def __init__(self, ruby_system, cache, prefetcher, addr_ranges):
        super(CHI_HNFController, self).__init__(ruby_system)
        self.sequencer = NULL
        self.cache = cache
        self.use_prefetcher = False
        self.addr_ranges = addr_ranges
        self.allow_SD = True
        self.is_HN = True
        self.enable_DMT = True
        self.enable_DCT = True
        self.send_evictions = False
        # MOESI / Mostly inclusive for shared / Exclusive for unique
        self.alloc_on_seq_acc = False
        self.alloc_on_seq_line_write = False
        self.alloc_on_readshared = True
        self.alloc_on_readunique = False
        self.alloc_on_readonce = True
        self.alloc_on_writeback = True
        self.dealloc_on_unique = True
        self.dealloc_on_shared = False
        self.dealloc_backinv_unique = False
        self.dealloc_backinv_shared = False
        # Some reasonable default TBE params
        self.number_of_TBEs = 32
        self.number_of_repl_TBEs = 32
        self.number_of_snoop_TBEs = 1 # should not receive any snoop
        self.unify_repl_TBEs = False
class CHI_DMAController(CHI_Cache_Controller):
    '''
    Default parameters for a DMA controller
    '''

    def __init__(self, ruby_system, sequencer):
        super(CHI_DMAController, self).__init__(ruby_system)
        self.sequencer = sequencer

        # Minimal cache object required by Cache_Controller; with all
        # allocation policies disabled below it should never be filled
        class DummyCache(RubyCache):
            dataAccessLatency = 0
            tagAccessLatency = 1
            size = "128"
            assoc = 1

        self.use_prefetcher = False
        self.cache = DummyCache()
        self.sequencer.dcache = NULL
        # All allocations are false
        # Deallocations are true (don't really matter)
        self.allow_SD = False
        self.is_HN = False
        self.enable_DMT = False
        self.enable_DCT = False
        self.alloc_on_seq_acc = False
        self.alloc_on_seq_line_write = False
        self.alloc_on_readshared = False
        self.alloc_on_readunique = False
        self.alloc_on_readonce = False
        self.alloc_on_writeback = False
        self.dealloc_on_unique = False
        self.dealloc_on_shared = False
        self.dealloc_backinv_unique = False
        self.dealloc_backinv_shared = False
        self.send_evictions = False
        self.number_of_TBEs = 16
        self.number_of_repl_TBEs = 1
        self.number_of_snoop_TBEs = 1 # should not receive any snoop
        self.unify_repl_TBEs = False
class CPUSequencerWrapper:
    '''
    Other generic configuration scripts assume a matching number of sequencers
    and cpus. This wraps the instruction and data sequencer so they are
    compatible with the other scripts. This assumes all scripts are using
    connectCpuPorts/connectIOPorts to bind ports

    Attribute writes on the wrapper are forwarded to BOTH wrapped sequencers
    (see __setattr__); __init__ therefore stores its own attributes through
    __dict__ to bypass that forwarding.
    '''

    def __init__(self, iseq, dseq):
        # use this style due to __setattr__ override below
        self.__dict__['inst_seq'] = iseq
        self.__dict__['data_seq'] = dseq
        self.__dict__['support_data_reqs'] = True
        self.__dict__['support_inst_reqs'] = True
        # Compatibility with certain scripts that wire up ports
        # without connectCpuPorts
        self.__dict__['slave'] = dseq.in_ports
        self.__dict__['in_ports'] = dseq.in_ports

    def connectCpuPorts(self, cpu):
        '''Bind the cpu's instruction and cached data ports to the wrapped
        sequencers, and its uncached ports to the data sequencer.'''
        assert(isinstance(cpu, BaseCPU))
        cpu.icache_port = self.inst_seq.in_ports
        for p in cpu._cached_ports:
            if str(p) != 'icache_port':
                # setattr performs the same dynamic attribute assignment as
                # the previous exec('cpu.%s = ...' % p) without compiling a
                # code string (faster, and safe against odd port names)
                setattr(cpu, str(p), self.data_seq.in_ports)
        cpu.connectUncachedPorts(self.data_seq)

    def connectIOPorts(self, piobus):
        '''Connect the data sequencer to the IO bus.'''
        self.data_seq.connectIOPorts(piobus)

    def __setattr__(self, name, value):
        # Forward all attribute writes to both sequencers
        setattr(self.inst_seq, name, value)
        setattr(self.data_seq, name, value)
class CHI_RNF(CHI_Node):
    '''
    Defines a CHI request node.
    Notice all controllers and sequencers are set as children of the cpus, so
    this object acts more like a proxy for setting things up and has no
    topology significance unless the cpus are set as its children at the top
    level
    '''

    def __init__(self, cpus, ruby_system,
                 l1Icache_type, l1Dcache_type,
                 cache_line_size,
                 l1Iprefetcher_type=None, l1Dprefetcher_type=None):
        super(CHI_RNF, self).__init__(ruby_system)

        self._block_size_bits = int(math.log(cache_line_size, 2))

        # All sequencers and controllers
        self._seqs = []
        self._cntrls = []

        # Last level controllers in this node, i.e., the ones that will send
        # requests to the home nodes
        self._ll_cntrls = []

        self._cpus = cpus

        # First creates L1 caches and sequencers
        for cpu in self._cpus:
            cpu.inst_sequencer = RubySequencer(version = Versions.getSeqId(),
                                               ruby_system = ruby_system)
            cpu.data_sequencer = RubySequencer(version = Versions.getSeqId(),
                                               ruby_system = ruby_system)

            self._seqs.append(CPUSequencerWrapper(cpu.inst_sequencer,
                                                  cpu.data_sequencer))

            # caches
            l1i_cache = l1Icache_type(start_index_bit = self._block_size_bits,
                                      is_icache = True)
            l1d_cache = l1Dcache_type(start_index_bit = self._block_size_bits,
                                      is_icache = False)

            # Placeholders for future prefetcher support
            if l1Iprefetcher_type != None or l1Dprefetcher_type != None:
                m5.fatal('Prefetching not supported yet')
            l1i_pf = NULL
            l1d_pf = NULL

            # cache controllers
            cpu.l1i = CHI_L1Controller(ruby_system, cpu.inst_sequencer,
                                       l1i_cache, l1i_pf)
            cpu.l1d = CHI_L1Controller(ruby_system, cpu.data_sequencer,
                                       l1d_cache, l1d_pf)

            cpu.inst_sequencer.dcache = NULL
            cpu.data_sequencer.dcache = cpu.l1d.cache

            # Enable store-conditional timeouts only on the data cache
            cpu.l1d.sc_lock_enabled = True

            cpu._ll_cntrls = [cpu.l1i, cpu.l1d]
            for ctrl in cpu._ll_cntrls:
                self._cntrls.append(ctrl)
                self.connectController(ctrl)
                self._ll_cntrls.append(ctrl)

    def getSequencers(self):
        return self._seqs

    def getAllControllers(self):
        return self._cntrls

    def getNetworkSideControllers(self):
        return self._cntrls

    def setDownstream(self, cntrls):
        # Only the last-level controllers talk to the home nodes
        for ctrl in self._ll_cntrls:
            ctrl.downstream_destinations = cntrls

    def getCpus(self):
        return self._cpus

    # Adds a private L2 for each cpu
    def addPrivL2Cache(self, cache_type, pf_type=None):
        # The L2s become the new last-level controllers of this node
        self._ll_cntrls = []
        for cpu in self._cpus:
            l2_cache = cache_type(start_index_bit = self._block_size_bits,
                                  is_icache = False)
            if pf_type != None:
                m5.fatal('Prefetching not supported yet')
            l2_pf = NULL

            cpu.l2 = CHI_L2Controller(self._ruby_system, l2_cache, l2_pf)

            self._cntrls.append(cpu.l2)
            self.connectController(cpu.l2)

            self._ll_cntrls.append(cpu.l2)

            # The L1s now forward their requests to the private L2
            for ctrl in cpu._ll_cntrls:
                ctrl.downstream_destinations = [cpu.l2]
            cpu._ll_cntrls = [cpu.l2]
class CHI_HNF(CHI_Node):
    '''
    Encapsulates an HNF cache/directory controller.
    Before the first controller is created, the class method
    CHI_HNF.createAddrRanges must be called before creating any CHI_HNF object
    to set-up the interleaved address ranges used by the HNFs
    '''

    # Interleaved address ranges; populated by createAddrRanges
    _addr_ranges = []

    @classmethod
    def createAddrRanges(cls, sys_mem_ranges, cache_line_size, num_hnfs):
        # Create the HNFs interleaved addr ranges
        block_size_bits = int(math.log(cache_line_size, 2))
        llc_bits = int(math.log(num_hnfs, 2))
        numa_bit = block_size_bits + llc_bits - 1
        cls._addr_ranges = []
        for idx in range(num_hnfs):
            ranges = [AddrRange(r.start, size = r.size(),
                                intlvHighBit = numa_bit,
                                intlvBits = llc_bits,
                                intlvMatch = idx)
                      for r in sys_mem_ranges]
            cls._addr_ranges.append((ranges, numa_bit, idx))

    @classmethod
    def getAddrRanges(cls, hnf_idx):
        assert(len(cls._addr_ranges) != 0)
        return cls._addr_ranges[hnf_idx]

    # The CHI controller can be a child of this object or another if
    # 'parent' is specified
    def __init__(self, hnf_idx, ruby_system, llcache_type, parent):
        super(CHI_HNF, self).__init__(ruby_system)

        addr_ranges, intlvHighBit, intlvMatch = CHI_HNF.getAddrRanges(hnf_idx)
        # All ranges should have the same interleaving
        assert(len(addr_ranges) >= 1)
        assert(intlvMatch == hnf_idx)

        ll_cache = llcache_type(start_index_bit = intlvHighBit + 1)
        self._cntrl = CHI_HNFController(ruby_system, ll_cache, NULL,
                                        addr_ranges)

        if parent == None:
            self.cntrl = self._cntrl
        else:
            parent.cntrl = self._cntrl

        self.connectController(self._cntrl)

    def getAllControllers(self):
        return [self._cntrl]

    def getNetworkSideControllers(self):
        return [self._cntrl]
class CHI_SNF_Base(CHI_Node):
    '''
    Creates CHI node controllers for the memory controllers
    '''

    # The CHI controller can be a child of this object or another if
    # 'parent' is specified
    def __init__(self, ruby_system, parent):
        super(CHI_SNF_Base, self).__init__(ruby_system)
        self._cntrl = Memory_Controller(
            version = Versions.getVersion(Memory_Controller),
            ruby_system = ruby_system,
            triggerQueue = TriggerMessageBuffer(),
            responseFromMemory = MessageBuffer(),
            requestToMemory = MessageBuffer(ordered = True),
            reqRdy = TriggerMessageBuffer())

        self.connectController(self._cntrl)

        # Attach the controller either to 'parent' or to this node
        if parent:
            parent.cntrl = self._cntrl
        else:
            self.cntrl = self._cntrl

    def getAllControllers(self):
        return [self._cntrl]

    def getNetworkSideControllers(self):
        return [self._cntrl]

    def getMemRange(self, mem_ctrl):
        # TODO need some kind of transparent API for
        # MemCtrl+DRAM vs SimpleMemory
        if hasattr(mem_ctrl, 'range'):
            return mem_ctrl.range
        else:
            return mem_ctrl.dram.range
class CHI_SNF_BootMem(CHI_SNF_Base):
    '''
    Create the SNF for the boot memory
    '''

    def __init__(self, ruby_system, parent, bootmem):
        super(CHI_SNF_BootMem, self).__init__(ruby_system, parent)
        self._cntrl.memory_out_port = bootmem.port
        self._cntrl.addr_ranges = self.getMemRange(bootmem)
class CHI_SNF_MainMem(CHI_SNF_Base):
    '''
    Create the SNF for a list main memory controllers
    '''

    def __init__(self, ruby_system, parent, mem_ctrl = None):
        super(CHI_SNF_MainMem, self).__init__(ruby_system, parent)
        if mem_ctrl:
            self._cntrl.memory_out_port = mem_ctrl.port
            self._cntrl.addr_ranges = self.getMemRange(mem_ctrl)
        # else bind ports and range later
class CHI_RNI_Base(CHI_Node):
    '''
    Request node without cache / DMA
    '''

    # The CHI controller can be a child of this object or another if
    # 'parent' is specified
    def __init__(self, ruby_system, parent):
        super(CHI_RNI_Base, self).__init__(ruby_system)

        self._sequencer = RubySequencer(version = Versions.getSeqId(),
                                        ruby_system = ruby_system,
                                        clk_domain = ruby_system.clk_domain)
        self._cntrl = CHI_DMAController(ruby_system, self._sequencer)

        # Attach the controller either to 'parent' or to this node
        if parent:
            parent.cntrl = self._cntrl
        else:
            self.cntrl = self._cntrl

        self.connectController(self._cntrl)

    def getAllControllers(self):
        return [self._cntrl]

    def getNetworkSideControllers(self):
        return [self._cntrl]
class CHI_RNI_DMA(CHI_RNI_Base):
    '''
    DMA controller wired up to a given dma port
    '''

    def __init__(self, ruby_system, dma_port, parent):
        super(CHI_RNI_DMA, self).__init__(ruby_system, parent)
        assert(dma_port != None)
        self._sequencer.in_ports = dma_port
class CHI_RNI_IO(CHI_RNI_Base):
    '''
    DMA controller wired up to the ruby_system IO port
    '''

    def __init__(self, ruby_system, parent):
        super(CHI_RNI_IO, self).__init__(ruby_system, parent)
        ruby_system._io_port = self._sequencer
def noc_params_from_config(config, noc_params):
    '''
    Copy the relevant settings from a parsed YAML 'config' dict into the
    'noc_params' object consumed by the topology creation code.
    '''
    mesh_cfg = config['mesh']

    # mesh options
    noc_params.num_rows = mesh_cfg['num_rows']
    noc_params.num_cols = mesh_cfg['num_cols']
    if 'router_latency' in mesh_cfg:
        noc_params.router_latency = mesh_cfg['router_latency']
    if 'link_latency' in mesh_cfg:
        # 'link_latency' sets both link types; the more specific keys
        # below may override either one
        noc_params.router_link_latency = mesh_cfg['link_latency']
        noc_params.node_link_latency = mesh_cfg['link_latency']
    if 'router_link_latency' in mesh_cfg:
        noc_params.router_link_latency = mesh_cfg['router_link_latency']
    if 'node_link_latency' in mesh_cfg:
        noc_params.node_link_latency = mesh_cfg['node_link_latency']
    if 'cross_links' in mesh_cfg:
        noc_params.cross_link_latency = mesh_cfg['cross_link_latency']
        # cross links are bidirectional: register both directions
        noc_params.cross_links = []
        for src, dst in mesh_cfg['cross_links']:
            noc_params.cross_links.append((src, dst))
            noc_params.cross_links.append((dst, src))
    else:
        noc_params.cross_links = []
        noc_params.cross_link_latency = 0

    # CHI_RNF options
    noc_params.CHI_RNF = config['CHI_RNF']

    # CHI_RNI_IO
    noc_params.CHI_RNI_IO = config['CHI_RNI_IO']

    # CHI_HNF options
    noc_params.CHI_HNF = config['CHI_HNF']
    if 'pairing' in config['CHI_HNF']:
        noc_params.pairing = config['CHI_HNF']['pairing']

    # CHI_SNF_MainMem
    noc_params.CHI_SNF_MainMem = config['CHI_SNF_MainMem']

    # CHI_SNF_IO (applies to CHI_SNF_Bootmem)
    noc_params.CHI_SNF_IO = config['CHI_SNF_IO']
def create_system(options, full_system, system, dma_ports, bootmem,
                  ruby_system):
    '''
    Instantiate a CHI protocol system: one RNF (with private L2) per cpu,
    options.num_l3caches HNFs, options.num_dirs main-memory SNFs, plus SNFs
    for any boot/other memories and RNI nodes for DMA/IO.
    Returns (cpu_sequencers, mem_cntrls, topology).
    '''

    if buildEnv['PROTOCOL'] != 'CHI':
        m5.panic("This script requires the CHI build")

    if options.num_dirs < 1:
        m5.fatal('--num-dirs must be at least 1')

    if options.num_l3caches < 1:
        m5.fatal('--num-l3caches must be at least 1')

    # Default parameters for the network
    class NoC_Params(object):
        def __init__(self):
            self.topology = options.topology
            self.network = options.network
            self.router_link_latency = 1
            self.node_link_latency = 1
            self.router_latency = 1
            self.router_buffer_size = 4
            self.cntrl_msg_size = 8
            self.data_width = 32
    params = NoC_Params()

    # read additional configurations from yaml file if provided
    if options.noc_config:
        with open(options.noc_config, 'r') as file:
            # safe_load restricts the YAML to plain data, avoiding the
            # arbitrary object construction (and PyYAML >= 5.1 warning) of
            # a loader-less yaml.load
            noc_params_from_config(yaml.safe_load(file), params)
    elif params.topology == 'CustomMesh':
        m5.fatal('--noc-config must be provided if topology is CustomMesh')

    # Declare caches and controller types used by the protocol
    # Notice tag and data accesses are not concurrent, so the cache hit
    # latency = tag + data + response latencies.
    # Default response latencies are 1 cy for all controllers.
    # For L1 controllers the mandatoryQueue enqueue latency is always 1 cy
    # and this is deducted from the initial tag read latency for sequencer
    # requests.
    # dataAccessLatency may be set to 0 if one wants to consider parallel
    # data and tag lookups
    class L1ICache(RubyCache):
        dataAccessLatency = 1
        tagAccessLatency = 1
        size = options.l1i_size
        assoc = options.l1i_assoc

    class L1DCache(RubyCache):
        dataAccessLatency = 2
        tagAccessLatency = 1
        size = options.l1d_size
        assoc = options.l1d_assoc

    class L2Cache(RubyCache):
        dataAccessLatency = 6
        tagAccessLatency = 2
        size = options.l2_size
        assoc = options.l2_assoc

    class HNFCache(RubyCache):
        dataAccessLatency = 10
        tagAccessLatency = 2
        size = options.l3_size
        assoc = options.l3_assoc

    # other functions use system.cache_line_size assuming it has been set
    assert(system.cache_line_size.value == options.cacheline_size)

    cpu_sequencers = []
    mem_cntrls = []
    mem_dests = []
    network_nodes = []
    network_cntrls = []
    hnf_dests = []
    all_cntrls = []

    # Creates one RNF per cpu with priv l2 caches
    assert(len(system.cpu) == options.num_cpus)
    ruby_system.rnf = [ CHI_RNF([cpu], ruby_system, L1ICache, L1DCache,
                                system.cache_line_size.value)
                        for cpu in system.cpu ]
    for rnf in ruby_system.rnf:
        rnf.addPrivL2Cache(L2Cache)
        cpu_sequencers.extend(rnf.getSequencers())
        all_cntrls.extend(rnf.getAllControllers())
        network_nodes.append(rnf)
        network_cntrls.extend(rnf.getNetworkSideControllers())

    # Look for other memories
    other_memories = []
    if bootmem:
        other_memories.append(bootmem)
    if getattr(system, 'sram', None):
        other_memories.append(getattr(system, 'sram', None))
    on_chip_mem_ports = getattr(system, '_on_chip_mem_ports', None)
    if on_chip_mem_ports:
        other_memories.extend([p.simobj for p in on_chip_mem_ports])

    # Create the LLCs cntrls
    sysranges = [] + system.mem_ranges
    for m in other_memories:
        sysranges.append(m.range)

    CHI_HNF.createAddrRanges(sysranges, system.cache_line_size.value,
                             options.num_l3caches)
    ruby_system.hnf = [ CHI_HNF(i, ruby_system, HNFCache, None)
                        for i in range(options.num_l3caches) ]

    for hnf in ruby_system.hnf:
        network_nodes.append(hnf)
        network_cntrls.extend(hnf.getNetworkSideControllers())
        assert(hnf.getAllControllers() == hnf.getNetworkSideControllers())
        all_cntrls.extend(hnf.getAllControllers())
        hnf_dests.extend(hnf.getAllControllers())

    # Create the memory controllers
    # Notice we don't define a Directory_Controller type so we don't use
    # create_directories shared by other protocols.
    ruby_system.snf = [ CHI_SNF_MainMem(ruby_system, None, None)
                        for i in range(options.num_dirs) ]
    for snf in ruby_system.snf:
        network_nodes.append(snf)
        network_cntrls.extend(snf.getNetworkSideControllers())
        assert(snf.getAllControllers() == snf.getNetworkSideControllers())
        mem_cntrls.extend(snf.getAllControllers())
        all_cntrls.extend(snf.getAllControllers())
        mem_dests.extend(snf.getAllControllers())

    if len(other_memories) > 0:
        ruby_system.rom_snf = [ CHI_SNF_BootMem(ruby_system, None, m)
                                for m in other_memories ]
        for snf in ruby_system.rom_snf:
            network_nodes.append(snf)
            network_cntrls.extend(snf.getNetworkSideControllers())
            all_cntrls.extend(snf.getAllControllers())
            mem_dests.extend(snf.getAllControllers())

    # Creates the controller for dma ports and io
    if len(dma_ports) > 0:
        ruby_system.dma_rni = [ CHI_RNI_DMA(ruby_system, dma_port, None)
                                for dma_port in dma_ports ]
        for rni in ruby_system.dma_rni:
            network_nodes.append(rni)
            network_cntrls.extend(rni.getNetworkSideControllers())
            all_cntrls.extend(rni.getAllControllers())

    if full_system:
        ruby_system.io_rni = CHI_RNI_IO(ruby_system, None)
        network_nodes.append(ruby_system.io_rni)
        network_cntrls.extend(ruby_system.io_rni.getNetworkSideControllers())
        all_cntrls.extend(ruby_system.io_rni.getAllControllers())

    # Assign downstream destinations: requesters -> HNFs -> memories
    for rnf in ruby_system.rnf:
        rnf.setDownstream(hnf_dests)
    if len(dma_ports) > 0:
        for rni in ruby_system.dma_rni:
            rni.setDownstream(hnf_dests)
    if full_system:
        ruby_system.io_rni.setDownstream(hnf_dests)
    for hnf in ruby_system.hnf:
        hnf.setDownstream(mem_dests)

    # Setup data message size for all controllers
    for cntrl in all_cntrls:
        cntrl.data_channel_size = params.data_width

    # Network configurations
    # virtual networks: 0=request, 1=snoop, 2=response, 3=data
    ruby_system.network.number_of_virtual_networks = 4

    ruby_system.network.control_msg_size = params.cntrl_msg_size
    ruby_system.network.data_msg_size = params.data_width
    ruby_system.network.buffer_size = params.router_buffer_size

    if params.topology == 'CustomMesh':
        topology = create_topology(network_nodes, params)
    elif params.topology in ['Crossbar', 'Pt2Pt']:
        topology = create_topology(network_cntrls, params)
    else:
        m5.fatal("%s not supported!" % params.topology)

    # Incorporate the params into options so it's propagated to
    # makeTopology by the parent script
    for k in dir(params):
        if not k.startswith('__'):
            setattr(options, k, getattr(params, k))

    return (cpu_sequencers, mem_cntrls, topology)

View File

@@ -78,10 +78,10 @@ def create_system(options, full_system, system, dma_ports, bootmem,
dma_cntrl_nodes = []
assert (options.num_cpus % options.num_clusters == 0)
num_cpus_per_cluster = options.num_cpus / options.num_clusters
num_cpus_per_cluster = options.num_cpus // options.num_clusters
assert (options.num_l2caches % options.num_clusters == 0)
num_l2caches_per_cluster = options.num_l2caches / options.num_clusters
num_l2caches_per_cluster = options.num_l2caches // options.num_clusters
l2_bits = int(math.log(num_l2caches_per_cluster, 2))
block_size_bits = int(math.log(options.cacheline_size, 2))
@@ -141,7 +141,6 @@ def create_system(options, full_system, system, dma_ports, bootmem,
ruby_system = ruby_system)
cpu_seq = RubyHTMSequencer(version = i * num_cpus_per_cluster + j,
icache = l0i_cache,
clk_domain = clk_domain,
dcache = l0d_cache,
ruby_system = ruby_system)

View File

@@ -0,0 +1,444 @@
# Copyright (c) 2021 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
import math
from m5.util import fatal
from m5.params import *
from m5.objects import *
from m5.defines import buildEnv
if buildEnv['PROTOCOL'] == 'CHI':
import ruby.CHI as CHI
from topologies.BaseTopology import SimpleTopology
class CustomMesh(SimpleTopology):
    """2D mesh topology for the CHI protocol with user-directed placement.

    Unlike the generic mesh topologies, node controllers (RNF, HNF,
    SNF_MainMem, SNF_BootMem, RNI_DMA, RNI_IO) are attached to mesh
    routers according to per-node-type options: each type provides a
    'router_list' and optionally a 'num_nodes_per_router' count.
    Optionally pairs HNFs with SNFs and reassigns their interleaved
    address ranges (see _autoPairHNFandSNF).
    """
    description = 'CustomMesh'

    def __init__(self, controllers):
        self.nodes = controllers

    #--------------------------------------------------------------------------
    # _makeMesh
    #--------------------------------------------------------------------------

    def _makeMesh(self, IntLink, link_latency, num_rows, num_columns,
                  cross_links, cross_link_latency):
        """Create all internal links of a num_rows x num_columns mesh.

        Four unidirectional link sets are created (E->W, W->E, N->S,
        S->N). A link whose (src, dst) router-index pair appears in
        cross_links gets cross_link_latency instead of link_latency.
        Link weights enforce XY (dimension-ordered) routing.
        """
        # East->West, West->East, North->South, South->North
        # XY routing weights
        link_weights = [1, 1, 2, 2]

        # East output to West input links
        for row in range(num_rows):
            for col in range(num_columns):
                if (col + 1 < num_columns):
                    east_out = col + (row * num_columns)
                    west_in = (col + 1) + (row * num_columns)
                    llat = cross_link_latency \
                                if (east_out, west_in) in cross_links \
                                else link_latency
                    self._int_links.append(\
                                    IntLink(link_id=self._link_count,
                                            src_node=self._routers[east_out],
                                            dst_node=self._routers[west_in],
                                            dst_inport="West",
                                            latency = llat,
                                            weight=link_weights[0]))
                    self._link_count += 1

        # West output to East input links
        for row in range(num_rows):
            for col in range(num_columns):
                if (col + 1 < num_columns):
                    east_in = col + (row * num_columns)
                    west_out = (col + 1) + (row * num_columns)
                    llat = cross_link_latency \
                                if (west_out, east_in) in cross_links \
                                else link_latency
                    self._int_links.append(\
                                    IntLink(link_id=self._link_count,
                                            src_node=self._routers[west_out],
                                            dst_node=self._routers[east_in],
                                            dst_inport="East",
                                            latency = llat,
                                            weight=link_weights[1]))
                    self._link_count += 1

        # North output to South input links
        for col in range(num_columns):
            for row in range(num_rows):
                if (row + 1 < num_rows):
                    north_out = col + (row * num_columns)
                    south_in = col + ((row + 1) * num_columns)
                    llat = cross_link_latency \
                                if (north_out, south_in) in cross_links \
                                else link_latency
                    self._int_links.append(\
                                    IntLink(link_id=self._link_count,
                                            src_node=self._routers[north_out],
                                            dst_node=self._routers[south_in],
                                            dst_inport="South",
                                            latency = llat,
                                            weight=link_weights[2]))
                    self._link_count += 1

        # South output to North input links
        for col in range(num_columns):
            for row in range(num_rows):
                if (row + 1 < num_rows):
                    north_in = col + (row * num_columns)
                    south_out = col + ((row + 1) * num_columns)
                    llat = cross_link_latency \
                                if (south_out, north_in) in cross_links \
                                else link_latency
                    self._int_links.append(\
                                    IntLink(link_id=self._link_count,
                                            src_node=self._routers[south_out],
                                            dst_node=self._routers[north_in],
                                            dst_inport="North",
                                            latency = llat,
                                            weight=link_weights[3]))
                    self._link_count += 1

    #--------------------------------------------------------------------------
    # distributeNodes
    #--------------------------------------------------------------------------

    def _createRNFRouter(self, mesh_router):
        """Create a zero-latency router bridging an RNF node's
        controllers and the given mesh router, connected both ways
        with self._router_link_latency links. Returns the new router.
        """
        node_router = self._Router(router_id = len(self._routers),
                                   latency = 0)
        self._routers.append(node_router)
        # connect node_router <-> mesh router
        self._int_links.append(self._IntLink( \
                                    link_id = self._link_count,
                                    src_node = node_router,
                                    dst_node = mesh_router,
                                    latency = self._router_link_latency))
        self._link_count += 1

        self._int_links.append(self._IntLink( \
                                    link_id = self._link_count,
                                    src_node = mesh_router,
                                    dst_node = node_router,
                                    latency = self._router_link_latency))
        self._link_count += 1

        return node_router

    def distributeNodes(self, num_nodes_per_router, router_idx_list,
                        node_list):
        """Attach the controllers of node_list to the mesh routers
        named by router_idx_list.

        If num_nodes_per_router is given, nodes are distributed evenly
        (len(router_idx_list) * num_nodes_per_router must equal
        len(node_list)); otherwise nodes are assigned round-robin, so
        a router may receive zero or multiple nodes. RNF nodes get a
        dedicated bridging router (see _createRNFRouter).
        """
        if num_nodes_per_router:
            # evenly distribute nodes to all listed routers
            assert(len(router_idx_list)*num_nodes_per_router == len(node_list))

            for idx, node in enumerate(node_list):
                mesh_router_idx = router_idx_list[idx // num_nodes_per_router]
                router = self._routers[mesh_router_idx]

                # Create another router bridging RNF node controllers
                # and the mesh router
                # for non-RNF nodes, node router is mesh router
                if isinstance(node, CHI.CHI_RNF):
                    router = self._createRNFRouter(router)

                # connect all ctrls in the node to node_router
                ctrls = node.getNetworkSideControllers()
                for c in ctrls:
                    self._ext_links.append(self._ExtLink(
                                            link_id = self._link_count,
                                            ext_node = c,
                                            int_node = router,
                                            latency = self._node_link_latency))
                    self._link_count += 1
        else:
            # try to circulate all nodes to all routers, some routers may be
            # connected to zero or more than one node.
            idx = 0
            for node in node_list:
                ridx = router_idx_list[idx]
                router = self._routers[ridx]

                if isinstance(node, CHI.CHI_RNF):
                    router = self._createRNFRouter(router)
                ctrls = node.getNetworkSideControllers()
                for c in ctrls:
                    self._ext_links.append(self._ExtLink( \
                                            link_id = self._link_count,
                                            ext_node = c,
                                            int_node = router,
                                            latency = self._node_link_latency))
                    self._link_count += 1
                idx = (idx + 1) % len(router_idx_list)

    #--------------------------------------------------------------------------
    # makeTopology
    #--------------------------------------------------------------------------

    def makeTopology(self, options, network, IntLink, ExtLink, Router):
        """Build the mesh and place all CHI controllers on it.

        Expects options.num_rows / options.num_cols, per-node-type
        placement dicts (options.CHI_RNF, CHI_HNF, CHI_SNF_MainMem,
        CHI_SNF_IO, CHI_RNI_IO), link latencies, and optionally a
        'pairing' list mapping HNFs to SNFs.
        """
        assert(buildEnv['PROTOCOL'] == 'CHI')

        num_rows = options.num_rows
        num_cols = options.num_cols
        num_mesh_routers = num_rows * num_cols

        self._IntLink = IntLink
        self._ExtLink = ExtLink
        self._Router = Router

        if hasattr(options, 'router_link_latency'):
            self._router_link_latency = options.router_link_latency
            self._node_link_latency = options.node_link_latency
        else:
            # fall back to the generic link latency for both link types
            print("WARNING: router/node link latencies not provided")
            self._router_link_latency = options.link_latency
            self._node_link_latency = options.link_latency

        # classify nodes into different types
        rnf_list = []
        hnf_list = []
        mem_ctrls = []
        io_mem_ctrls = []
        io_rni_ctrls = []

        for n in self.nodes:
            if isinstance(n, CHI.CHI_RNF):
                rnf_list.append(n)
            elif isinstance(n, CHI.CHI_HNF):
                hnf_list.append(n)
            elif isinstance(n, CHI.CHI_SNF_MainMem):
                mem_ctrls.append(n)
            elif isinstance(n, CHI.CHI_SNF_BootMem):
                io_mem_ctrls.append(n)
            elif isinstance(n, CHI.CHI_RNI_DMA):
                io_rni_ctrls.append(n)
            elif isinstance(n, CHI.CHI_RNI_IO):
                io_rni_ctrls.append(n)
            else:
                fatal('topologies.CustomMesh: {} not supported'
                            .format(n.__class__.__name__))

        # Create all mesh routers
        self._routers = [Router(router_id=i, latency = options.router_latency)\
                                    for i in range(num_mesh_routers)]

        self._link_count = 0
        self._int_links = []
        self._ext_links = []

        # Create all the mesh internal links.
        self._makeMesh(IntLink, self._router_link_latency, num_rows, num_cols,
                       options.cross_links, options.cross_link_latency)

        # Place CHI_RNF on the mesh
        num_nodes_per_router = options.CHI_RNF['num_nodes_per_router'] \
            if 'num_nodes_per_router' in options.CHI_RNF else None
        self.distributeNodes(num_nodes_per_router,
                             options.CHI_RNF['router_list'],
                             rnf_list)

        # Place CHI_HNF on the mesh
        num_nodes_per_router = options.CHI_HNF['num_nodes_per_router'] \
            if 'num_nodes_per_router' in options.CHI_HNF else None
        self.distributeNodes(num_nodes_per_router,
                             options.CHI_HNF['router_list'],
                             hnf_list)

        # Place CHI_SNF_MainMem on the mesh
        num_nodes_per_router = options.CHI_SNF_MainMem['num_nodes_per_router']\
            if 'num_nodes_per_router' in options.CHI_SNF_MainMem else None
        self.distributeNodes(num_nodes_per_router,
                             options.CHI_SNF_MainMem['router_list'],
                             mem_ctrls)

        # Place all IO mem nodes on the mesh
        num_nodes_per_router = options.CHI_SNF_IO['num_nodes_per_router'] \
            if 'num_nodes_per_router' in options.CHI_SNF_IO else None
        self.distributeNodes(num_nodes_per_router,
                             options.CHI_SNF_IO['router_list'],
                             io_mem_ctrls)

        # Place all IO request nodes on the mesh
        num_nodes_per_router = options.CHI_RNI_IO['num_nodes_per_router'] \
            if 'num_nodes_per_router' in options.CHI_RNI_IO else None
        self.distributeNodes(num_nodes_per_router,
                             options.CHI_RNI_IO['router_list'],
                             io_rni_ctrls)

        # Set up
        network.int_links = self._int_links
        network.ext_links = self._ext_links
        network.routers = self._routers

        pairing = getattr(options, 'pairing', None)
        if pairing is not None:
            self._autoPairHNFandSNF(hnf_list, mem_ctrls, pairing)

    #--------------------------------------------------------------------------
    # _autoPair
    #--------------------------------------------------------------------------
    def _autoPairHNFandSNF(self, cache_ctrls, mem_ctrls, pairing):
        """Pair HNF (cache) controllers with SNF (memory) controllers
        as specified by 'pairing' and reassign their interleaved
        address ranges so each pair covers matching address bits.

        pairing[cidx] names the memory controller index (or list of
        indices) paired with cache controller cidx. Only a single
        address range per main memory controller is supported.
        """
        # Use the pairing defined by the configuration to reassign the
        # memory ranges
        pair_debug = False

        print("Pairing HNFs to SNFs")
        print(pairing)

        all_cache = []
        for c in cache_ctrls: all_cache.extend(c.getNetworkSideControllers())
        all_mem = []
        for c in mem_ctrls: all_mem.extend(c.getNetworkSideControllers())

        # checks and maps index from pairing map to component
        assert(len(pairing) == len(all_cache))

        def _tolist(val): return val if isinstance(val, list) else [val]

        for m in all_mem: m._pairing = []

        # Each cache must be paired with the same number of memories;
        # use floor division so the check value stays an integer.
        pairing_check = max(1, len(all_mem) // len(all_cache))
        for cidx,c in enumerate(all_cache):
            c._pairing = []
            for midx in _tolist(pairing[cidx]):
                c._pairing.append(all_mem[midx])
                if c not in all_mem[midx]._pairing:
                    all_mem[midx]._pairing.append(c)
            assert(len(c._pairing) == pairing_check)
            if pair_debug:
                print(c.path())
                for r in c.addr_ranges:
                    print("%s" % r)
                for p in c._pairing:
                    print("\t"+p.path())
                    for r in p.addr_ranges:
                        print("\t%s" % r)

        # all must be paired
        for c in all_cache: assert(len(c._pairing) > 0)
        for m in all_mem: assert(len(m._pairing) > 0)

        # only support a single range for the main memory controllers
        tgt_range_start = all_mem[0].addr_ranges[0].start.value
        for mem in all_mem:
            for r in mem.addr_ranges:
                if r.start.value != tgt_range_start:
                    fatal('topologies.CustomMesh: not supporting pairing of '\
                          'main memory with multiple ranges')

        # reassign ranges for a 1 -> N paring
        def _rerange(src_cntrls, tgt_cntrls, fix_tgt_peer):
            assert(len(tgt_cntrls) >= len(src_cntrls))

            # All ranges of a controller must share one intlvMatch value
            def _rangeToBit(addr_ranges):
                bit = None
                for r in addr_ranges:
                    if bit is None:
                        bit = r.intlvMatch
                    else:
                        assert(bit == r.intlvMatch)
                return bit

            def _getPeer(cntrl):
                return cntrl.memory_out_port.peer.simobj

            sorted_src = list(src_cntrls)
            sorted_src.sort(key = lambda x: _rangeToBit(x.addr_ranges))

            # paired controllers need to have seq. interleaving match values
            intlvMatch = 0
            for src in sorted_src:
                for tgt in src._pairing:
                    for r in tgt.addr_ranges:
                        r.intlvMatch = intlvMatch
                    if fix_tgt_peer:
                        _getPeer(tgt).range.intlvMatch = intlvMatch
                    intlvMatch = intlvMatch + 1

            # recreate masks
            for src in sorted_src:
                for src_range in src.addr_ranges:
                    if src_range.start.value != tgt_range_start:
                        continue
                    new_src_mask = []
                    for m in src_range.masks:
                        # TODO should mask all the way to the max range size
                        new_src_mask.append(m | (m*2) | (m*4) |
                                                (m*8) | (m*16))
                    for tgt in src._pairing:
                        paired = False
                        for tgt_range in tgt.addr_ranges:
                            if tgt_range.start.value == \
                               src_range.start.value:
                                src_range.masks = new_src_mask
                                new_tgt_mask = []
                                # keep the target's extra low-order mask
                                # bits, then append the widened src bits
                                lsbs = len(tgt_range.masks) - \
                                       len(new_src_mask)
                                for i in range(lsbs):
                                    new_tgt_mask.append(tgt_range.masks[i])
                                for m in new_src_mask:
                                    new_tgt_mask.append(m)
                                tgt_range.masks = new_tgt_mask
                                if fix_tgt_peer:
                                    _getPeer(tgt).range.masks = new_tgt_mask
                                paired = True
                        if not paired:
                            fatal('topologies.CustomMesh: could not ' \
                                  'reassign ranges {} {}'.format(
                                  src.path(), tgt.path()))

        if len(all_mem) >= len(all_cache):
            _rerange(all_cache, all_mem, True)
        else:
            _rerange(all_mem, all_cache, False)

        if pair_debug:
            print("")
            for cidx,c in enumerate(all_cache):
                assert(len(c._pairing) == pairing_check)
                print(c.path())
                for r in c.addr_ranges:
                    print("%s" % r)
                for p in c._pairing:
                    print("\t"+p.path())
                    for r in p.addr_ranges:
                        print("\t%s" % r)

View File

@@ -117,7 +117,7 @@
using std::fputs;
using std::getc;
using std::getchar;
using std::gets;
//using std::gets;
using std::putc;
using std::putchar;
using std::puts;

View File

@@ -917,7 +917,7 @@ PySource('m5', 'python/m5/info.py')
# Create all of the SimObject param headers and enum headers
#
def createSimObjectParamStruct(target, source, env):
def createSimObjectParamDecl(target, source, env):
assert len(target) == 1 and len(source) == 1
name = source[0].get_text_contents()
@@ -927,6 +927,16 @@ def createSimObjectParamStruct(target, source, env):
obj.cxx_param_decl(code)
code.write(target[0].abspath)
def createSimObjectParamDef(target, source, env):
assert len(target) == 1 and len(source) == 1
name = source[0].get_text_contents()
obj = sim_objects[name]
code = code_formatter()
obj.cxx_param_def(code)
code.write(target[0].abspath)
def createSimObjectCxxConfig(is_header):
def body(target, source, env):
assert len(target) == 1 and len(source) == 1
@@ -987,9 +997,16 @@ for name,simobj in sorted(sim_objects.items()):
hh_file = File('params/%s.hh' % name)
params_hh_files.append(hh_file)
env.Command(hh_file, Value(name),
MakeAction(createSimObjectParamStruct, Transform("SO PARAM")))
MakeAction(createSimObjectParamDecl, Transform("SOPARMHH")))
env.Depends(hh_file, depends + extra_deps)
if not getattr(simobj, 'abstract', False) and hasattr(simobj, 'type'):
cc_file = File('params/%s.cc' % name)
env.Command(cc_file, Value(name),
MakeAction(createSimObjectParamDef, Transform("SOPARMCC")))
env.Depends(cc_file, depends + extra_deps)
Source(cc_file)
# C++ parameter description files
if GetOption('with_cxx_config'):
for name,simobj in sorted(sim_objects.items()):

View File

@@ -103,7 +103,7 @@ class BaseMMU : public SimObject
return getTlb(mode)->finalizePhysical(req, tc, mode);
}
void takeOverFrom(BaseMMU *old_mmu);
virtual void takeOverFrom(BaseMMU *old_mmu);
public:
BaseTLB* dtb;

View File

@@ -35,6 +35,8 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from m5.params import *
from m5.objects.BaseMMU import BaseMMU
from m5.objects.RiscvTLB import RiscvTLB
from m5.objects.PMAChecker import PMAChecker
@@ -45,7 +47,7 @@ class RiscvMMU(BaseMMU):
cxx_header = 'arch/riscv/mmu.hh'
itb = RiscvTLB()
dtb = RiscvTLB()
pma_checker = PMAChecker()
pma_checker = Param.PMAChecker(PMAChecker(), "PMA Checker")
@classmethod
def walkerPorts(cls):

View File

@@ -43,7 +43,7 @@ class RiscvPagetableWalker(ClockedObject):
num_squash_per_cycle = Param.Unsigned(4,
"Number of outstanding walks that can be squashed per cycle")
# Grab the pma_checker from the MMU
pma_checker = Param.PMAChecker(Parent.any, "PMA Chekcer")
pma_checker = Param.PMAChecker(Parent.any, "PMA Checker")
class RiscvTLB(BaseTLB):
type = 'RiscvTLB'
@@ -53,4 +53,4 @@ class RiscvTLB(BaseTLB):
walker = Param.RiscvPagetableWalker(\
RiscvPagetableWalker(), "page table walker")
# Grab the pma_checker from the MMU
pma_checker = Param.PMAChecker(Parent.any, "PMA Chekcer")
pma_checker = Param.PMAChecker(Parent.any, "PMA Checker")

View File

@@ -40,6 +40,7 @@
#include "arch/generic/mmu.hh"
#include "arch/riscv/isa.hh"
#include "arch/riscv/pma_checker.hh"
#include "arch/riscv/tlb.hh"
#include "params/RiscvMMU.hh"
@@ -49,8 +50,10 @@ namespace RiscvISA {
class MMU : public BaseMMU
{
public:
PMAChecker *pma;
MMU(const RiscvMMUParams &p)
: BaseMMU(p)
: BaseMMU(p), pma(p.pma_checker)
{}
PrivilegeMode
@@ -64,6 +67,14 @@ class MMU : public BaseMMU
{
return static_cast<TLB*>(dtb)->getWalker();
}
void
takeOverFrom(BaseMMU *old_mmu) override
{
MMU *ommu = dynamic_cast<MMU*>(old_mmu);
BaseMMU::takeOverFrom(ommu);
pma->takeOverFrom(ommu->pma);
}
};
} // namespace RiscvISA

View File

@@ -81,3 +81,9 @@ PMAChecker::isUncacheable(PacketPtr pkt)
{
return isUncacheable(pkt->getAddrRange());
}
void
PMAChecker::takeOverFrom(PMAChecker *old)
{
uncacheable = old->uncacheable;
}

View File

@@ -74,6 +74,8 @@ class PMAChecker : public SimObject
bool isUncacheable(const AddrRange &range);
bool isUncacheable(const Addr &addr, const unsigned size);
bool isUncacheable(PacketPtr pkt);
void takeOverFrom(PMAChecker *old);
};
#endif // __ARCH_RISCV_PMA_CHECKER_HH__

View File

@@ -211,7 +211,7 @@ RemoteGDB::RiscvGdbRegCache::getRegs(ThreadContext *context)
// U mode CSR
r.ustatus = context->readMiscRegNoEffect(
CSRData.at(CSR_USTATUS).physIndex) & CSRMasks.at(CSR_USTATUS);
r.uie = context->readMiscRegNoEffect(
r.uie = context->readMiscReg(
CSRData.at(CSR_UIE).physIndex) & CSRMasks.at(CSR_UIE);
r.utvec = context->readMiscRegNoEffect(
CSRData.at(CSR_UTVEC).physIndex);
@@ -223,7 +223,7 @@ RemoteGDB::RiscvGdbRegCache::getRegs(ThreadContext *context)
CSRData.at(CSR_UCAUSE).physIndex);
r.utval = context->readMiscRegNoEffect(
CSRData.at(CSR_UTVAL).physIndex);
r.uip = context->readMiscRegNoEffect(
r.uip = context->readMiscReg(
CSRData.at(CSR_UIP).physIndex) & CSRMasks.at(CSR_UIP);
// S mode CSR
@@ -233,7 +233,7 @@ RemoteGDB::RiscvGdbRegCache::getRegs(ThreadContext *context)
CSRData.at(CSR_SEDELEG).physIndex);
r.sideleg = context->readMiscRegNoEffect(
CSRData.at(CSR_SIDELEG).physIndex);
r.sie = context->readMiscRegNoEffect(
r.sie = context->readMiscReg(
CSRData.at(CSR_SIE).physIndex) & CSRMasks.at(CSR_SIE);
r.stvec = context->readMiscRegNoEffect(
CSRData.at(CSR_STVEC).physIndex);
@@ -247,7 +247,7 @@ RemoteGDB::RiscvGdbRegCache::getRegs(ThreadContext *context)
CSRData.at(CSR_SCAUSE).physIndex);
r.stval = context->readMiscRegNoEffect(
CSRData.at(CSR_STVAL).physIndex);
r.sip = context->readMiscRegNoEffect(
r.sip = context->readMiscReg(
CSRData.at(CSR_SIP).physIndex) & CSRMasks.at(CSR_SIP);
r.satp = context->readMiscRegNoEffect(
CSRData.at(CSR_SATP).physIndex);
@@ -269,7 +269,7 @@ RemoteGDB::RiscvGdbRegCache::getRegs(ThreadContext *context)
CSRData.at(CSR_MEDELEG).physIndex);
r.mideleg = context->readMiscRegNoEffect(
CSRData.at(CSR_MIDELEG).physIndex);
r.mie = context->readMiscRegNoEffect(
r.mie = context->readMiscReg(
CSRData.at(CSR_MIE).physIndex) & CSRMasks.at(CSR_MIE);
r.mtvec = context->readMiscRegNoEffect(
CSRData.at(CSR_MTVEC).physIndex);
@@ -283,7 +283,7 @@ RemoteGDB::RiscvGdbRegCache::getRegs(ThreadContext *context)
CSRData.at(CSR_MCAUSE).physIndex);
r.mtval = context->readMiscRegNoEffect(
CSRData.at(CSR_MTVAL).physIndex);
r.mip = context->readMiscRegNoEffect(
r.mip = context->readMiscReg(
CSRData.at(CSR_MIP).physIndex) & CSRMasks.at(CSR_MIP);
// H mode CSR (to be implemented)
@@ -340,11 +340,11 @@ RemoteGDB::RiscvGdbRegCache::setRegs(ThreadContext *context) const
newVal = (oldVal & ~mask) | (r.ustatus & mask);
context->setMiscRegNoEffect(
CSRData.at(CSR_USTATUS).physIndex, newVal);
oldVal = context->readMiscRegNoEffect(
oldVal = context->readMiscReg(
CSRData.at(CSR_UIE).physIndex);
mask = CSRMasks.at(CSR_UIE);
newVal = (oldVal & ~mask) | (r.uie & mask);
context->setMiscRegNoEffect(
context->setMiscReg(
CSRData.at(CSR_UIE).physIndex, newVal);
context->setMiscRegNoEffect(
CSRData.at(CSR_UTVEC).physIndex, r.utvec);
@@ -356,11 +356,11 @@ RemoteGDB::RiscvGdbRegCache::setRegs(ThreadContext *context) const
CSRData.at(CSR_UCAUSE).physIndex, r.ucause);
context->setMiscRegNoEffect(
CSRData.at(CSR_UTVAL).physIndex, r.utval);
oldVal = context->readMiscRegNoEffect(
oldVal = context->readMiscReg(
CSRData.at(CSR_UIP).physIndex);
mask = CSRMasks.at(CSR_UIP);
newVal = (oldVal & ~mask) | (r.uip & mask);
context->setMiscRegNoEffect(
context->setMiscReg(
CSRData.at(CSR_UIP).physIndex, newVal);
// S mode CSR
@@ -374,11 +374,11 @@ RemoteGDB::RiscvGdbRegCache::setRegs(ThreadContext *context) const
CSRData.at(CSR_SEDELEG).physIndex, r.sedeleg);
context->setMiscRegNoEffect(
CSRData.at(CSR_SIDELEG).physIndex, r.sideleg);
oldVal = context->readMiscRegNoEffect(
oldVal = context->readMiscReg(
CSRData.at(CSR_SIE).physIndex);
mask = CSRMasks.at(CSR_SIE);
newVal = (oldVal & ~mask) | (r.sie & mask);
context->setMiscRegNoEffect(
context->setMiscReg(
CSRData.at(CSR_SIE).physIndex, newVal);
context->setMiscRegNoEffect(
CSRData.at(CSR_STVEC).physIndex, r.stvec);
@@ -392,11 +392,11 @@ RemoteGDB::RiscvGdbRegCache::setRegs(ThreadContext *context) const
CSRData.at(CSR_SCAUSE).physIndex, r.scause);
context->setMiscRegNoEffect(
CSRData.at(CSR_STVAL).physIndex, r.stval);
oldVal = context->readMiscRegNoEffect(
oldVal = context->readMiscReg(
CSRData.at(CSR_SIP).physIndex);
mask = CSRMasks.at(CSR_SIP);
newVal = (oldVal & ~mask) | (r.sip & mask);
context->setMiscRegNoEffect(
context->setMiscReg(
CSRData.at(CSR_SIP).physIndex, newVal);
context->setMiscRegNoEffect(
CSRData.at(CSR_SATP).physIndex, r.satp);
@@ -426,11 +426,11 @@ RemoteGDB::RiscvGdbRegCache::setRegs(ThreadContext *context) const
CSRData.at(CSR_MEDELEG).physIndex, r.medeleg);
context->setMiscRegNoEffect(
CSRData.at(CSR_MIDELEG).physIndex, r.mideleg);
oldVal = context->readMiscRegNoEffect(
oldVal = context->readMiscReg(
CSRData.at(CSR_MIE).physIndex);
mask = CSRMasks.at(CSR_MIE);
newVal = (oldVal & ~mask) | (r.mie & mask);
context->setMiscRegNoEffect(
context->setMiscReg(
CSRData.at(CSR_MIE).physIndex, newVal);
context->setMiscRegNoEffect(
CSRData.at(CSR_MTVEC).physIndex, r.mtvec);
@@ -444,11 +444,11 @@ RemoteGDB::RiscvGdbRegCache::setRegs(ThreadContext *context) const
CSRData.at(CSR_MCAUSE).physIndex, r.mcause);
context->setMiscRegNoEffect(
CSRData.at(CSR_MTVAL).physIndex, r.mtval);
oldVal = context->readMiscRegNoEffect(
oldVal = context->readMiscReg(
CSRData.at(CSR_MIP).physIndex);
mask = CSRMasks.at(CSR_MIP);
newVal = (oldVal & ~mask) | (r.mip & mask);
context->setMiscRegNoEffect(
context->setMiscReg(
CSRData.at(CSR_MIP).physIndex, newVal);
// H mode CSR (to be implemented)

View File

@@ -519,3 +519,9 @@ TLB::TlbStats::TlbStats(Stats::Group *parent)
readAccesses + writeAccesses)
{
}
Port *
TLB::getTableWalkerPort()
{
return &walker->getPort("port");
}

View File

@@ -92,7 +92,7 @@ class TLB : public BaseTLB
Walker *getWalker();
void takeOverFrom(BaseTLB *otlb) override {}
void takeOverFrom(BaseTLB *old) override {}
TlbEntry *insert(Addr vpn, const TlbEntry &entry);
void flushAll() override;
@@ -108,6 +108,18 @@ class TLB : public BaseTLB
void serialize(CheckpointOut &cp) const override;
void unserialize(CheckpointIn &cp) override;
/**
* Get the table walker port. This is used for
* migrating port connections during a CPU takeOverFrom()
* call. For architectures that do not have a table walker,
* NULL is returned, hence the use of a pointer rather than a
* reference. For RISC-V this method will always return a valid
* port pointer.
*
* @return A pointer to the walker port
*/
Port *getTableWalkerPort() override;
Addr translateWithTLB(Addr vaddr, uint16_t asid, Mode mode);
Fault translateAtomic(const RequestPtr &req,

View File

@@ -47,10 +47,9 @@
#define UNIT_RATE(T1, T2) Stats::Units::Rate<T1, T2>::get()
#define UNIT_RATIO Stats::Units::Ratio::get()
#define UNIT_COUNT Stats::Units::Count::get()
#define UNIT_WATT Stats::Units::Watt::get()
#define UNIT_UNSPECIFIED Stats::Units::Unspecified::get()
#define UNIT_WATT UNIT_RATE(Stats::Units::Joule, Stats::Units::Second)
namespace Stats {
/**

View File

@@ -43,12 +43,6 @@ CpuThread::CpuThread(const Params &p)
assert(numLanes == 1);
}
CpuThread*
CpuThreadParams::create() const
{
return new CpuThread(*this);
}
void
CpuThread::issueLoadOps()
{

View File

@@ -48,12 +48,6 @@ DmaThread::~DmaThread()
}
DmaThread*
DmaThreadParams::create() const
{
return new DmaThread(*this);
}
void
DmaThread::issueLoadOps()
{

View File

@@ -48,12 +48,6 @@ GpuWavefront::~GpuWavefront()
}
GpuWavefront*
GpuWavefrontParams::create() const
{
return new GpuWavefront(*this);
}
void
GpuWavefront::issueLoadOps()
{

View File

@@ -357,9 +357,3 @@ ProtocolTester::SeqPort::recvTimingResp(PacketPtr pkt)
return true;
}
ProtocolTester*
ProtocolTesterParams::create() const
{
return new ProtocolTester(*this);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014, 2018-2019 ARM Limited
* Copyright (c) 2014, 2018-2019, 2021 Arm Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -59,12 +59,12 @@
*/
SMMUv3BaseCache::SMMUv3BaseCache(const std::string &policy_name, uint32_t seed,
Stats::Group *parent) :
replacementPolicy(decodePolicyName(policy_name)),
Stats::Group *parent, const std::string &name)
: replacementPolicy(decodePolicyName(policy_name)),
nextToReplace(0),
random(seed),
useStamp(0),
baseCacheStats(parent)
baseCacheStats(parent, name)
{}
int
@@ -82,8 +82,9 @@ SMMUv3BaseCache::decodePolicyName(const std::string &policy_name)
}
SMMUv3BaseCache::
SMMUv3BaseCacheStats::SMMUv3BaseCacheStats(Stats::Group *parent)
: Stats::Group(parent),
SMMUv3BaseCacheStats::SMMUv3BaseCacheStats(
Stats::Group *parent, const std::string &name)
: Stats::Group(parent, name.c_str()),
ADD_STAT(averageLookups,
UNIT_RATE(Stats::Units::Count, Stats::Units::Second),
"Average number lookups per second"),
@@ -144,9 +145,10 @@ SMMUv3BaseCacheStats::SMMUv3BaseCacheStats(Stats::Group *parent)
*/
SMMUTLB::SMMUTLB(unsigned numEntries, unsigned _associativity,
const std::string &policy, Stats::Group *parent)
const std::string &policy, Stats::Group *parent,
const std::string &name)
:
SMMUv3BaseCache(policy, SMMUTLB_SEED, parent),
SMMUv3BaseCache(policy, SMMUTLB_SEED, parent, name),
associativity(_associativity)
{
if (associativity == 0)
@@ -426,7 +428,7 @@ SMMUTLB::pickEntryIdxToReplace(const Set &set, AllocPolicy alloc)
ARMArchTLB::ARMArchTLB(unsigned numEntries, unsigned _associativity,
const std::string &policy, Stats::Group *parent)
:
SMMUv3BaseCache(policy, ARMARCHTLB_SEED, parent),
SMMUv3BaseCache(policy, ARMARCHTLB_SEED, parent, "tlb"),
associativity(_associativity)
{
if (associativity == 0)
@@ -625,7 +627,7 @@ ARMArchTLB::pickEntryIdxToReplace(const Set &set)
IPACache::IPACache(unsigned numEntries, unsigned _associativity,
const std::string &policy, Stats::Group *parent)
:
SMMUv3BaseCache(policy, IPACACHE_SEED, parent),
SMMUv3BaseCache(policy, IPACACHE_SEED, parent, "ipa"),
associativity(_associativity)
{
if (associativity == 0)
@@ -805,7 +807,7 @@ IPACache::pickEntryIdxToReplace(const Set &set)
ConfigCache::ConfigCache(unsigned numEntries, unsigned _associativity,
const std::string &policy, Stats::Group *parent)
:
SMMUv3BaseCache(policy, CONFIGCACHE_SEED, parent),
SMMUv3BaseCache(policy, CONFIGCACHE_SEED, parent, "cfg"),
associativity(_associativity)
{
if (associativity == 0)
@@ -969,7 +971,7 @@ ConfigCache::pickEntryIdxToReplace(const Set &set)
WalkCache::WalkCache(const std::array<unsigned, 2*WALK_CACHE_LEVELS> &_sizes,
unsigned _associativity, const std::string &policy,
Stats::Group *parent) :
SMMUv3BaseCache(policy, WALKCACHE_SEED, parent),
SMMUv3BaseCache(policy, WALKCACHE_SEED, parent, "walk"),
walkCacheStats(&(SMMUv3BaseCache::baseCacheStats)),
associativity(_associativity),
sizes()
@@ -1041,10 +1043,8 @@ WalkCache::lookup(Addr va, Addr vaMask,
if (result == NULL)
baseCacheStats.totalMisses++;
walkCacheStats.lookupsByStageLevel[stage-1][level]++;
walkCacheStats.totalLookupsByStageLevel[stage-1][level]++;
if (result == NULL) {
walkCacheStats.missesByStageLevel[stage-1][level]++;
walkCacheStats.totalMissesByStageLevel[stage-1][level]++;
}
}
@@ -1077,7 +1077,6 @@ WalkCache::store(const Entry &incoming)
}
baseCacheStats.totalUpdates++;
walkCacheStats.updatesByStageLevel[incoming.stage-1][incoming.level]++;
walkCacheStats
.totalUpdatesByStageLevel[incoming.stage-1][incoming.level]++;
}
@@ -1226,68 +1225,105 @@ WalkCache::pickEntryIdxToReplace(const Set &set,
WalkCache::
WalkCacheStats::WalkCacheStats(Stats::Group *parent)
: Stats::Group(parent, "WalkCache")
: Stats::Group(parent),
ADD_STAT(totalLookupsByStageLevel, UNIT_COUNT,
"Total number of lookups"),
ADD_STAT(totalMissesByStageLevel, UNIT_COUNT,
"Total number of misses"),
ADD_STAT(totalUpdatesByStageLevel, UNIT_COUNT,
"Total number of updates"),
ADD_STAT(insertionsByStageLevel, UNIT_COUNT,
"Number of insertions (not replacements)")
{
using namespace Stats;
totalLookupsByStageLevel
.init(2, WALK_CACHE_LEVELS)
.flags(pdf);
totalMissesByStageLevel
.init(2, WALK_CACHE_LEVELS)
.flags(pdf);
totalUpdatesByStageLevel
.init(2, WALK_CACHE_LEVELS)
.flags(pdf);
insertionsByStageLevel
.init(2, WALK_CACHE_LEVELS)
.flags(pdf);
for (int s = 0; s < 2; s++) {
totalLookupsByStageLevel.subname(s, csprintf("S%d", s + 1));
totalMissesByStageLevel.subname(s, csprintf("S%d", s + 1));
totalUpdatesByStageLevel.subname(s, csprintf("S%d", s + 1));
insertionsByStageLevel.subname(s, csprintf("S%d", s + 1));
for (int l = 0; l < WALK_CACHE_LEVELS; l++) {
averageLookupsByStageLevel[s][l]
.name(csprintf("averageLookupsS%dL%d", s+1, l))
.desc("Average number lookups per second")
.flags(pdf);
totalLookupsByStageLevel.ysubname(l, csprintf("L%d", l));
totalMissesByStageLevel.ysubname(l, csprintf("L%d", l));
totalUpdatesByStageLevel.ysubname(l, csprintf("L%d", l));
insertionsByStageLevel.ysubname(l, csprintf("L%d", l));
totalLookupsByStageLevel[s][l]
.name(csprintf("totalLookupsS%dL%d", s+1, l))
.desc("Total number of lookups")
.flags(pdf);
auto avg_lookup = new Stats::Formula(
this,
csprintf("averageLookups_S%dL%d", s+1, l).c_str(),
UNIT_RATE(Stats::Units::Count, Stats::Units::Second),
"Average number lookups per second");
avg_lookup->flags(pdf);
averageLookupsByStageLevel.push_back(avg_lookup);
averageLookupsByStageLevel[s][l] =
*avg_lookup =
totalLookupsByStageLevel[s][l] / simSeconds;
auto avg_misses = new Stats::Formula(
this,
csprintf("averageMisses_S%dL%d", s+1, l).c_str(),
UNIT_RATE(Stats::Units::Count, Stats::Units::Second),
"Average number misses per second");
avg_misses->flags(pdf);
averageMissesByStageLevel.push_back(avg_misses);
averageMissesByStageLevel[s][l]
.name(csprintf("averageMissesS%dL%d", s+1, l))
.desc("Average number misses per second")
.flags(pdf);
totalMissesByStageLevel[s][l]
.name(csprintf("totalMissesS%dL%d", s+1, l))
.desc("Total number of misses")
.flags(pdf);
averageMissesByStageLevel[s][l] =
*avg_misses =
totalMissesByStageLevel[s][l] / simSeconds;
auto avg_updates = new Stats::Formula(
this,
csprintf("averageUpdates_S%dL%d", s+1, l).c_str(),
UNIT_RATE(Stats::Units::Count, Stats::Units::Second),
"Average number updates per second");
avg_updates->flags(pdf);
averageUpdatesByStageLevel.push_back(avg_updates);
averageUpdatesByStageLevel[s][l]
.name(csprintf("averageUpdatesS%dL%d", s+1, l))
.desc("Average number updates per second")
.flags(pdf);
totalUpdatesByStageLevel[s][l]
.name(csprintf("totalUpdatesS%dL%d", s+1, l))
.desc("Total number of updates")
.flags(pdf);
averageUpdatesByStageLevel[s][l] =
*avg_updates =
totalUpdatesByStageLevel[s][l] / simSeconds;
auto avg_hitrate = new Stats::Formula(
this,
csprintf("averageHitRate_S%dL%d", s+1, l).c_str(),
UNIT_RATIO,
"Average hit rate");
avg_hitrate->flags(pdf);
averageHitRateByStageLevel.push_back(avg_hitrate);
averageHitRateByStageLevel[s][l]
.name(csprintf("averageHitRateS%dL%d", s+1, l))
.desc("Average hit rate")
.flags(pdf);
averageHitRateByStageLevel[s][l] =
*avg_hitrate =
(totalLookupsByStageLevel[s][l] -
totalMissesByStageLevel[s][l])
/ totalLookupsByStageLevel[s][l];
insertionsByStageLevel[s][l]
.name(csprintf("insertionsS%dL%d", s+1, l))
.desc("Number of insertions (not replacements)")
.flags(pdf);
}
}
}
WalkCache::
WalkCacheStats::~WalkCacheStats()
{
for (auto avg_lookup : averageLookupsByStageLevel)
delete avg_lookup;
for (auto avg_miss : averageMissesByStageLevel)
delete avg_miss;
for (auto avg_update : averageUpdatesByStageLevel)
delete avg_update;
for (auto avg_hitrate : averageHitRateByStageLevel)
delete avg_hitrate;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014, 2018-2019 ARM Limited
* Copyright (c) 2014, 2018-2019, 2021 Arm Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -67,7 +67,7 @@ class SMMUv3BaseCache
struct SMMUv3BaseCacheStats : public Stats::Group
{
SMMUv3BaseCacheStats(Stats::Group *parent);
SMMUv3BaseCacheStats(Stats::Group *parent, const std::string &name);
Stats::Formula averageLookups;
Stats::Scalar totalLookups;
@@ -87,7 +87,7 @@ class SMMUv3BaseCache
public:
SMMUv3BaseCache(const std::string &policy_name, uint32_t seed,
Stats::Group *parent);
Stats::Group *parent, const std::string &name);
virtual ~SMMUv3BaseCache() {}
};
@@ -122,7 +122,8 @@ class SMMUTLB : public SMMUv3BaseCache
};
SMMUTLB(unsigned numEntries, unsigned _associativity,
const std::string &policy, Stats::Group *parent);
const std::string &policy, Stats::Group *parent,
const std::string &name);
SMMUTLB(const SMMUTLB& tlb) = delete;
virtual ~SMMUTLB() {}
@@ -324,22 +325,20 @@ class WalkCache : public SMMUv3BaseCache
struct WalkCacheStats : public Stats::Group
{
WalkCacheStats(Stats::Group *parent);
~WalkCacheStats();
unsigned int lookupsByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Formula averageLookupsByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Scalar totalLookupsByStageLevel[2][WALK_CACHE_LEVELS];
std::vector<Stats::Formula*> averageLookupsByStageLevel;
Stats::Vector2d totalLookupsByStageLevel;
unsigned int missesByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Formula averageMissesByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Scalar totalMissesByStageLevel[2][WALK_CACHE_LEVELS];
std::vector<Stats::Formula*> averageMissesByStageLevel;
Stats::Vector2d totalMissesByStageLevel;
unsigned int updatesByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Formula averageUpdatesByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Scalar totalUpdatesByStageLevel[2][WALK_CACHE_LEVELS];
std::vector<Stats::Formula*> averageUpdatesByStageLevel;
Stats::Vector2d totalUpdatesByStageLevel;
Stats::Formula averageHitRateByStageLevel[2][WALK_CACHE_LEVELS];
std::vector<Stats::Formula*> averageHitRateByStageLevel;
Stats::Scalar insertionsByStageLevel[2][WALK_CACHE_LEVELS];
Stats::Vector2d insertionsByStageLevel;
} walkCacheStats;
private:
typedef std::vector<Entry> Set;

View File

@@ -49,11 +49,11 @@ SMMUv3DeviceInterface::SMMUv3DeviceInterface(
microTLB(new SMMUTLB(p.utlb_entries,
p.utlb_assoc,
p.utlb_policy,
this)),
this, "utlb")),
mainTLB(new SMMUTLB(p.tlb_entries,
p.tlb_assoc,
p.tlb_policy,
this)),
this, "maintlb")),
microTLBEnable(p.utlb_enable),
mainTLBEnable(p.tlb_enable),
devicePortSem(1),

View File

@@ -354,17 +354,18 @@ Plic::readClaim(Register32& reg, const int context_id)
context_id, max_int_id);
clear(max_int_id);
reg.update(max_int_id);
return reg.get();
} else {
DPRINTF(Plic,
"Claim already cleared - context: %d, interrupt ID: %d\n",
context_id, max_int_id);
reg.update(0);
return 0;
}
} else {
warn("PLIC claim failed (not completed) - context: %d", context_id);
reg.update(0);
warn("PLIC claim repeated (not completed) - context: %d, last: %d",
context_id, lastID[context_id]);
return lastID[context_id];
}
return reg.get();
}
void
@@ -381,6 +382,7 @@ Plic::writeClaim(Register32& reg, const uint32_t& data, const int context_id)
DPRINTF(Plic,
"Complete - context: %d, interrupt ID: %d\n",
context_id, reg.get());
updateInt();
}
void
@@ -445,11 +447,11 @@ Plic::updateInt()
uint32_t max_id = output.maxID[i];
uint32_t priority = output.maxPriority[i];
uint32_t threshold = registers.threshold[i].get();
if (priority > threshold && max_id > 0) {
if (priority > threshold && max_id > 0 && lastID[i] == 0) {
DPRINTF(Plic,
"Int posted - thread: %d, int id: %d, ",
thread_id, int_id);
DPRINTF(Plic,
DPRINTFR(Plic,
"pri: %d, thres: %d\n", priority, threshold);
intrctrl->post(thread_id, int_id, 0);
} else {
@@ -457,7 +459,7 @@ Plic::updateInt()
DPRINTF(Plic,
"Int filtered - thread: %d, int id: %d, ",
thread_id, int_id);
DPRINTF(Plic,
DPRINTFR(Plic,
"pri: %d, thres: %d\n", priority, threshold);
}
intrctrl->clear(thread_id, int_id, 0);
@@ -499,6 +501,12 @@ Plic::serialize(CheckpointOut &cp) const
SERIALIZE_SCALAR(n_outputs);
SERIALIZE_CONTAINER(output.maxID);
SERIALIZE_CONTAINER(output.maxPriority);
SERIALIZE_CONTAINER(pendingPriority);
for (int i=0; i < effPriority.size(); i++) {
arrayParamOut(cp, std::string("effPriority") +
std::to_string(i), effPriority[i]);
}
SERIALIZE_CONTAINER(lastID);
}
void
@@ -541,4 +549,11 @@ Plic::unserialize(CheckpointIn &cp)
}
UNSERIALIZE_CONTAINER(output.maxID);
UNSERIALIZE_CONTAINER(output.maxPriority);
UNSERIALIZE_CONTAINER(pendingPriority);
for (int i=0; i < effPriority.size(); i++) {
arrayParamIn(cp, std::string("effPriority") +
std::to_string(i), effPriority[i]);
}
UNSERIALIZE_CONTAINER(lastID);
updateInt();
}

View File

@@ -114,9 +114,11 @@ MakeInclude('slicc_interface/RubyRequest.hh')
MakeInclude('common/Address.hh')
MakeInclude('common/BoolVec.hh')
MakeInclude('common/DataBlock.hh')
MakeInclude('common/ExpectedMap.hh')
MakeInclude('common/IntVec.hh')
MakeInclude('common/MachineID.hh')
MakeInclude('common/NetDest.hh')
MakeInclude('common/TriggerQueue.hh')
MakeInclude('common/Set.hh')
MakeInclude('common/WriteMask.hh')
MakeInclude('network/MessageBuffer.hh')

View File

@@ -0,0 +1,232 @@
/*
* Copyright (c) 2021 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MEM_RUBY_COMMON_EXPECTEDMAP_HH__
#define __MEM_RUBY_COMMON_EXPECTEDMAP_HH__
#include <cassert>
#include <iostream>
#include <unordered_map>
// ExpectedMap helper class is used to facilitate tracking of pending
// response and data messages in the CHI protocol. It offers additional
// functionality when compared to plain counters:
// - tracks the expected type for received messages
// - tracks segmented data messages (i.e. when a line transfer is split
//   into multiple messages)
template<typename RespType, typename DataType>
class ExpectedMap
{
  private:
    // Bookkeeping for a single message category (responses or data).
    template<typename Type>
    struct ExpectedState
    {
        struct EnumClassHash
        {
            std::size_t operator()(Type t) const
            {
                return static_cast<std::size_t>(t);
            }
        };

      private:
        // Number of chunks that make up one complete message. Always 1
        // for non-data messages; data messages may be segmented.
        int chunks;
        // Chunks of the in-flight message seen so far.
        int currChunk;
        // Count of fully received messages.
        int numReceived;
        // Maps every registered message type to whether it was seen.
        std::unordered_map<Type, bool, EnumClassHash> expectedTypes;

      public:
        ExpectedState()
            : chunks(1), currChunk(0), numReceived(0)
        {}

        // Reset all counters; a message is complete after msg_chunks
        // chunks have arrived.
        void
        clear(int msg_chunks)
        {
            chunks = msg_chunks;
            currChunk = 0;
            numReceived = 0;
            expectedTypes.clear();
        }

        // Register val as a message type we may legally receive.
        void
        addExpectedType(const Type &val)
        {
            expectedTypes[val] = false;
        }

        // Number of fully received messages.
        int received() const { return numReceived; }

        // Account for one received chunk of type val. Returns false if
        // val was never registered as an expected type.
        bool
        increaseReceived(const Type &val)
        {
            auto entry = expectedTypes.find(val);
            if (entry == expectedTypes.end())
                return false;

            entry->second = true;
            ++currChunk;
            if (currChunk == chunks) {
                ++numReceived;
                currChunk = 0;
            }
            return true;
        }

        // Whether at least one chunk of type val has been received.
        bool
        receivedType(const Type &val) const
        {
            auto entry = expectedTypes.find(val);
            return (entry != expectedTypes.end()) ? entry->second : false;
        }
    };

    ExpectedState<DataType> expectedData;
    ExpectedState<RespType> expectedResp;
    int totalExpected;

  public:
    ExpectedMap()
        : expectedData(), expectedResp(), totalExpected(0)
    {}

    // Clear the tracking state; a complete data message consists of
    // dataChunks chunks, while responses always use a single chunk.
    void
    clear(int dataChunks)
    {
        expectedData.clear(dataChunks);
        expectedResp.clear(1);
        totalExpected = 0;
    }

    // Register an expected response message type
    void
    addExpectedRespType(const RespType &val)
    {
        expectedResp.addExpectedType(val);
    }

    // Register an expected data message type
    void
    addExpectedDataType(const DataType &val)
    {
        expectedData.addExpectedType(val);
    }

    // Set / adjust the total number of expected messages
    void setExpectedCount(int val) { totalExpected = val; }

    void addExpectedCount(int val) { totalExpected += val; }

    // Returns the number of messages received.
    // Notice that a data message counts as received only after all of
    // its chunks are received.
    int
    received() const
    {
        return expectedData.received() + expectedResp.received();
    }

    // Returns the remaining number of expected messages
    int expected() const { return totalExpected - received(); }

    // Has any expected message ?
    bool hasExpected() const { return expected() != 0; }

    // Has received any data ?
    bool hasReceivedData() const { return expectedData.received() != 0; }

    // Has received any response ?
    bool hasReceivedResp() const { return expectedResp.received() != 0; }

    // Notifies that a response message was received; returns false for
    // an unexpected type.
    bool
    receiveResp(const RespType &val)
    {
        assert(received() < totalExpected);
        return expectedResp.increaseReceived(val);
    }

    // Notifies that a data message chunk was received; returns false for
    // an unexpected type.
    bool
    receiveData(const DataType &val)
    {
        assert(received() <= totalExpected);
        return expectedData.increaseReceived(val);
    }

    // Has received any data of the given type ?
    bool
    receivedDataType(const DataType &val) const
    {
        return expectedData.receivedType(val);
    }

    // Has received any response of the given type ?
    bool
    receivedRespType(const RespType &val) const
    {
        return expectedResp.receivedType(val);
    }

    // Prints the number of still-expected messages.
    void
    print(std::ostream& out) const
    {
        out << expected();
    }
};
// Stream-insertion helper: prints the number of still-expected messages
// (delegates to ExpectedMap::print).
template<typename RespType, typename DataType>
inline std::ostream&
operator<<(std::ostream& out, const ExpectedMap<RespType,DataType>& obj)
{
    obj.print(out);
    return out;
}
#endif // __MEM_RUBY_COMMON_EXPECTEDMAP_HH__

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2021 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MEM_RUBY_COMMON_QUEUE_HH__
#define __MEM_RUBY_COMMON_QUEUE_HH__
#include <deque>
#include <iostream>
// TriggerQueue helper class is used to keep a list of events that trigger
// the actions that need to be executed before an outstanding transaction
// completes in the CHI protocol. When a transaction no longer has pending
// response or data messages, this queue is checked and the event at the head
// of the queue is triggered. If the queue is empty, the transaction is
// finalized. Events can be marked as NB (non-blocking). NB events are
// triggered by the protocol even if the transaction has pending
// data/responses.
template<typename T>
class TriggerQueue
{
  private:
    // One queued event together with its non-blocking attribute.
    struct Entry
    {
        T val;
        bool nb;
    };

    std::deque<Entry> events;

  public:
    // Event at the head of the queue.
    const T& front() const { return events.front().val; }

    // Same as front().
    // NOTE: SLICC won't allow to reuse front() for different
    // values of the template parameter, thus we use an additional
    // def. to workaround that
    const T& next() const { return events.front().val; }

    // Event at the tail of the queue.
    const T& back() const { return events.back().val; }

    // Is the head event non-blocking ?
    bool frontNB() const { return events.front().nb; }

    // Is the tail event non-blocking ?
    bool backNB() const { return events.back().nb; }

    // True when no events are queued.
    bool empty() const { return events.empty(); }

    // Append a blocking event.
    void push(const T &elem) { events.push_back(Entry{elem, false}); }

    // Construct a blocking event in place at the tail.
    template<typename... Ts>
    void
    emplace(Ts&&... args)
    {
        events.push_back(Entry{T(std::forward<Ts>(args)...), false});
    }

    // Prepend a blocking event.
    void pushFront(const T &elem) { events.push_front(Entry{elem, false}); }

    // Append a non-blocking event.
    void pushNB(const T &elem) { events.push_back(Entry{elem, true}); }

    // Prepend a non-blocking event.
    void pushFrontNB(const T &elem) { events.push_front(Entry{elem, true}); }

    // Drop the head event.
    void pop() { events.pop_front(); }

    void print(std::ostream& out) const;
};
// Stream-insertion helper: delegates to TriggerQueue::print and flushes
// the stream.
template<class T>
inline std::ostream&
operator<<(std::ostream& out, const TriggerQueue<T>& obj)
{
    obj.print(out);
    out << std::flush;
    return out;
}
// Intentionally a no-op: nothing of the queue contents is printed, so the
// stream operator above emits no text for a TriggerQueue.
// NOTE(review): presumably kept empty to keep protocol trace output terse
// while still satisfying the print() interface -- confirm before adding
// output here.
template<class T>
inline void
TriggerQueue<T>::print(std::ostream& out) const
{
}
#endif // __MEM_RUBY_COMMON_QUEUE_HH__

View File

@@ -262,7 +262,9 @@ enumeration(MachineType, desc="...", default="MachineType_NULL") {
TCCdir, desc="Directory at the GPU L2 Cache (TCC)";
SQC, desc="GPU L1 Instr Cache (Sequencer Cache)";
RegionDir, desc="Region-granular directory";
RegionBuffer,desc="Region buffer for CPU and GPU";
RegionBuffer, desc="Region buffer for CPU and GPU";
Cache, desc="Generic coherent cache controller";
Memory, desc="Memory controller interface";
NULL, desc="null mach type";
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,398 @@
/*
* Copyright (c) 2021 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Outbound port definitions
out_port(reqOutPort, CHIRequestMsg, reqOut);
out_port(snpOutPort, CHIRequestMsg, snpOut);
out_port(rspOutPort, CHIResponseMsg, rspOut);
out_port(datOutPort, CHIDataMsg, datOut);
out_port(triggerOutPort, TriggerMsg, triggerQueue);
out_port(retryTriggerOutPort, RetryTriggerMsg, retryTriggerQueue);
out_port(replTriggerOutPort, TriggerMsg, replTriggerQueue);
out_port(reqRdyOutPort, CHIRequestMsg, reqRdy);
out_port(snpRdyOutPort, CHIRequestMsg, snpRdy);

// Include helper functions here. Some of them require the outports to be
// already defined
// Notice 'processNextState' and 'wakeupPending*' functions are defined after
// the required input ports. Currently the SLICC compiler does not support
// separate declaration and definition of functions in the .sm files.
include "CHI-cache-funcs.sm";

// Inbound port definitions and internal triggers queues
// Notice we never stall input ports connected to the network
// Incoming data and responses are always consumed.
// Incoming requests/snoop are moved to the respective internal rdy queue
// if a TBE can be allocated, or retried otherwise.

// Trigger events from the UD_T state
in_port(useTimerTable_in, Addr, useTimerTable, rank=11) {
  if (useTimerTable_in.isReady(clockEdge())) {
    Addr readyAddress := useTimerTable.nextAddress();
    trigger(Event:UseTimeout, readyAddress, getCacheEntry(readyAddress),
            getCurrentActiveTBE(readyAddress));
  }
}

// Response messages: always consumed; the triggered event is derived from
// the response type and the currently active TBE (if any).
in_port(rspInPort, CHIResponseMsg, rspIn, rank=10,
        rsc_stall_handler=rspInPort_rsc_stall_handler) {
  if (rspInPort.isReady(clockEdge())) {
    printResources();
    peek(rspInPort, CHIResponseMsg) {
      TBE tbe := getCurrentActiveTBE(in_msg.addr);
      trigger(respToEvent(in_msg.type, tbe), in_msg.addr,
              getCacheEntry(in_msg.addr), tbe);
    }
  }
}

// A resource stall on the response channel is a protocol bug, since
// responses must always be consumable.
bool rspInPort_rsc_stall_handler() {
  error("rspInPort must never stall\n");
  return false;
}

// Data messages: always consumed. A message may carry only part of a
// line (at most data_channel_size valid bytes per message).
in_port(datInPort, CHIDataMsg, datIn, rank=9,
        rsc_stall_handler=datInPort_rsc_stall_handler) {
  if (datInPort.isReady(clockEdge())) {
    printResources();
    peek(datInPort, CHIDataMsg) {
      // number of valid bytes carried by this message
      int received := in_msg.bitMask.count();
      assert((received <= data_channel_size) && (received > 0));
      trigger(dataToEvent(in_msg.type), in_msg.addr,
              getCacheEntry(in_msg.addr), getCurrentActiveTBE(in_msg.addr));
    }
  }
}

bool datInPort_rsc_stall_handler() {
  error("datInPort must never stall\n");
  return false;
}

// Snoops with an allocated TBE
in_port(snpRdyPort, CHIRequestMsg, snpRdy, rank=8,
        rsc_stall_handler=snpRdyPort_rsc_stall_handler) {
  if (snpRdyPort.isReady(clockEdge())) {
    printResources();
    peek(snpRdyPort, CHIRequestMsg) {
      assert(in_msg.allowRetry == false);
      TBE tbe := getCurrentActiveTBE(in_msg.addr);
      if (is_valid(tbe) && tbe.hasUseTimeout) {
        // we may be in the BUSY_INTR waiting for a cache block, but if
        // the timeout is set the snoop must still wait, so trigger the
        // stall from here to prevent creating other states
        trigger(Event:SnpStalled, in_msg.addr,
                getCacheEntry(in_msg.addr), tbe);
      } else {
        trigger(snpToEvent(in_msg.type), in_msg.addr,
                getCacheEntry(in_msg.addr), tbe);
      }
    }
  }
}

bool snpRdyPort_rsc_stall_handler() {
  error("snpRdyPort must never stall\n");
  return false;
}

// Wake up any snoops that were stalled waiting on this TBE.
void wakeupPendingSnps(TBE tbe) {
  if (tbe.wakeup_pending_snp) {
    Addr addr := tbe.addr;
    wakeup_port(snpRdyPort, addr);
    tbe.wakeup_pending_snp := false;
  }
}
// Incoming snoops
// Note: snoops are not retried, so the snoop channel is stalled if no
// Snp TBEs available
in_port(snpInPort, CHIRequestMsg, snpIn, rank=7) {
  if (snpInPort.isReady(clockEdge())) {
    // only non-home nodes receive external snoops
    assert(is_HN == false);
    printResources();
    peek(snpInPort, CHIRequestMsg) {
      assert(in_msg.allowRetry == false);
      trigger(Event:AllocSnoop, in_msg.addr,
              getCacheEntry(in_msg.addr), getCurrentActiveTBE(in_msg.addr));
    }
  }
}

// Retry action triggers
// These are handled before other triggers since a retried request should
// be enqueued ahead of a new request
// TODO: consider moving DoRetry to the triggerQueue
in_port(retryTriggerInPort, RetryTriggerMsg, retryTriggerQueue, rank=6,
        rsc_stall_handler=retryTriggerInPort_rsc_stall_handler) {
  if (retryTriggerInPort.isReady(clockEdge())) {
    printResources();
    peek(retryTriggerInPort, RetryTriggerMsg) {
      Event ev := in_msg.event;
      TBE tbe := getCurrentActiveTBE(in_msg.addr);
      assert((ev == Event:SendRetryAck) || (ev == Event:SendPCrdGrant) ||
             (ev == Event:DoRetry));
      if (ev == Event:DoRetry) {
        assert(is_valid(tbe));
        // retries issued while handling a request/replacement hazard use
        // a dedicated event
        if (tbe.is_req_hazard || tbe.is_repl_hazard) {
          ev := Event:DoRetry_Hazard;
        }
      }
      trigger(ev, in_msg.addr, getCacheEntry(in_msg.addr), tbe);
    }
  }
}

// On a resource stall, recycle the message and retry later.
bool retryTriggerInPort_rsc_stall_handler() {
  DPRINTF(RubySlicc, "Retry trigger queue resource stall\n");
  retryTriggerInPort.recycle(clockEdge(), cyclesToTicks(stall_recycle_lat));
  return true;
}

// Action triggers
in_port(triggerInPort, TriggerMsg, triggerQueue, rank=5,
        rsc_stall_handler=triggerInPort_rsc_stall_handler) {
  if (triggerInPort.isReady(clockEdge())) {
    printResources();
    peek(triggerInPort, TriggerMsg) {
      TBE tbe := getCurrentActiveTBE(in_msg.addr);
      assert(is_valid(tbe));
      if (in_msg.from_hazard != (tbe.is_req_hazard || tbe.is_repl_hazard)) {
        // possible when handling a snoop hazard and an action from the
        // initial transaction got woken up. Stall the action until the
        // hazard ends
        assert(in_msg.from_hazard == false);
        assert(tbe.is_req_hazard || tbe.is_repl_hazard);
        trigger(Event:ActionStalledOnHazard, in_msg.addr,
                getCacheEntry(in_msg.addr), tbe);
      } else {
        trigger(tbe.pendAction, in_msg.addr, getCacheEntry(in_msg.addr), tbe);
      }
    }
  }
}

// On a resource stall, recycle the message and retry later.
bool triggerInPort_rsc_stall_handler() {
  DPRINTF(RubySlicc, "Trigger queue resource stall\n");
  triggerInPort.recycle(clockEdge(), cyclesToTicks(stall_recycle_lat));
  return true;
}

// Wake up any actions that were stalled waiting on this TBE.
void wakeupPendingTgrs(TBE tbe) {
  if (tbe.wakeup_pending_tgr) {
    Addr addr := tbe.addr;
    wakeup_port(triggerInPort, addr);
    tbe.wakeup_pending_tgr := false;
  }
}

// internally triggered evictions
// no stall handler for this one since it doesn't make sense to try the next
// request when out of TBEs
in_port(replTriggerInPort, ReplacementMsg, replTriggerQueue, rank=4) {
  if (replTriggerInPort.isReady(clockEdge())) {
    printResources();
    peek(replTriggerInPort, ReplacementMsg) {
      TBE tbe := getCurrentActiveTBE(in_msg.addr);
      CacheEntry cache_entry := getCacheEntry(in_msg.addr);
      Event trigger := Event:null;
      // also evict upstream copies when configured to back-invalidate
      if (is_valid(cache_entry) &&
          ((upstreamHasUnique(cache_entry.state) && dealloc_backinv_unique) ||
           (upstreamHasShared(cache_entry.state) && dealloc_backinv_shared))) {
        trigger := Event:Global_Eviction;
      } else {
        if (is_HN) {
          trigger := Event:LocalHN_Eviction;
        } else {
          trigger := Event:Local_Eviction;
        }
      }
      trigger(trigger, in_msg.addr, cache_entry, tbe);
    }
  }
}
// Requests with an allocated TBE
in_port(reqRdyPort, CHIRequestMsg, reqRdy, rank=3,
        rsc_stall_handler=reqRdyPort_rsc_stall_handler) {
  if (reqRdyPort.isReady(clockEdge())) {
    printResources();
    peek(reqRdyPort, CHIRequestMsg) {
      CacheEntry cache_entry := getCacheEntry(in_msg.addr);
      TBE tbe := getCurrentActiveTBE(in_msg.addr);
      DirEntry dir_entry := getDirEntry(in_msg.addr);

      // Special case for possibly stale writebacks or evicts, detected
      // by checking the requestor against the directory entry state
      if (in_msg.type == CHIRequestType:WriteBackFull) {
        if (is_invalid(dir_entry) || (dir_entry.ownerExists == false) ||
            (dir_entry.owner != in_msg.requestor)) {
          trigger(Event:WriteBackFull_Stale, in_msg.addr, cache_entry, tbe);
        }
      } else if (in_msg.type == CHIRequestType:WriteEvictFull) {
        if (is_invalid(dir_entry) || (dir_entry.ownerExists == false) ||
            (dir_entry.ownerIsExcl == false) || (dir_entry.owner != in_msg.requestor)) {
          trigger(Event:WriteEvictFull_Stale, in_msg.addr, cache_entry, tbe);
        }
      } else if (in_msg.type == CHIRequestType:WriteCleanFull) {
        if (is_invalid(dir_entry) || (dir_entry.ownerExists == false) ||
            (dir_entry.ownerIsExcl == false) || (dir_entry.owner != in_msg.requestor)) {
          trigger(Event:WriteCleanFull_Stale, in_msg.addr, cache_entry, tbe);
        }
      } else if (in_msg.type == CHIRequestType:Evict) {
        if (is_invalid(dir_entry) ||
            (dir_entry.sharers.isElement(in_msg.requestor) == false)) {
          trigger(Event:Evict_Stale, in_msg.addr, cache_entry, tbe);
        }
      }

      // Normal request path
      trigger(reqToEvent(in_msg.type, in_msg.is_local_pf), in_msg.addr, cache_entry, tbe);
    }
  }
}

// On a resource stall, recycle the message and retry later.
bool reqRdyPort_rsc_stall_handler() {
  DPRINTF(RubySlicc, "ReqRdy queue resource stall\n");
  reqRdyPort.recycle(clockEdge(), cyclesToTicks(stall_recycle_lat));
  return true;
}

// Wake up any requests that were stalled waiting on this TBE.
void wakeupPendingReqs(TBE tbe) {
  if (tbe.wakeup_pending_req) {
    Addr addr := tbe.addr;
    wakeup_port(reqRdyPort, addr);
    tbe.wakeup_pending_req := false;
  }
}

// Incoming new requests
in_port(reqInPort, CHIRequestMsg, reqIn, rank=2,
        rsc_stall_handler=reqInPort_rsc_stall_handler) {
  if (reqInPort.isReady(clockEdge())) {
    printResources();
    peek(reqInPort, CHIRequestMsg) {
      if (in_msg.allowRetry) {
        trigger(Event:AllocRequest, in_msg.addr,
                getCacheEntry(in_msg.addr), getCurrentActiveTBE(in_msg.addr));
      } else {
        // allowRetry == false: the request cannot be retried
        trigger(Event:AllocRequestWithCredit, in_msg.addr,
                getCacheEntry(in_msg.addr), getCurrentActiveTBE(in_msg.addr));
      }
    }
  }
}

bool reqInPort_rsc_stall_handler() {
  error("reqInPort must never stall\n");
  return false;
}

// Incoming new sequencer requests
in_port(seqInPort, RubyRequest, mandatoryQueue, rank=1) {
  if (seqInPort.isReady(clockEdge())) {
    printResources();
    peek(seqInPort, RubyRequest) {
      trigger(Event:AllocSeqRequest, in_msg.LineAddress,
              getCacheEntry(in_msg.LineAddress),
              getCurrentActiveTBE(in_msg.LineAddress));
    }
  }
}

// Incoming new prefetch requests
in_port(pfInPort, RubyRequest, prefetchQueue, rank=0) {
  if (pfInPort.isReady(clockEdge())) {
    printResources();
    peek(pfInPort, RubyRequest) {
      trigger(Event:AllocPfRequest, in_msg.LineAddress,
              getCacheEntry(in_msg.LineAddress),
              getCurrentActiveTBE(in_msg.LineAddress));
    }
  }
}

// Drives the transaction's pending-action queue: when the TBE has no
// pending action and no outstanding messages (or the next action is
// non-blocking), schedule the next action through the trigger queue, or
// Event:Final when the action queue is empty.
void processNextState(Addr address, TBE tbe, CacheEntry cache_entry) {
  assert(is_valid(tbe));
  DPRINTF(RubySlicc, "GoToNextState expected_req_resp=%d expected_snp_resp=%d snd_pendEv=%d snd_pendBytes=%d\n",
          tbe.expected_req_resp.expected(),
          tbe.expected_snp_resp.expected(),
          tbe.snd_pendEv, tbe.snd_pendBytes.count());

  // if no pending trigger and not expecting to receive anything, enqueue
  // next
  bool has_nb_trigger := (tbe.actions.empty() == false) &&
                         tbe.actions.frontNB() &&
                         (tbe.snd_pendEv == false);
  int expected_msgs := tbe.expected_req_resp.expected() +
                       tbe.expected_snp_resp.expected() +
                       tbe.snd_pendBytes.count();
  if ((tbe.pendAction == Event:null) && ((expected_msgs == 0) || has_nb_trigger)) {
    // honor any extra delay requested for the next action
    Cycles trigger_latency := intToCycles(0);
    if (tbe.delayNextAction > curTick()) {
      trigger_latency := ticksToCycles(tbe.delayNextAction) -
                         ticksToCycles(curTick());
      tbe.delayNextAction := intToTick(0);
    }

    tbe.pendAction := Event:null;
    if (tbe.actions.empty()) {
      // time to go to the final state
      tbe.pendAction := Event:Final;
    } else {
      tbe.pendAction := tbe.actions.front();
      tbe.actions.pop();
    }
    assert(tbe.pendAction != Event:null);
    enqueue(triggerOutPort, TriggerMsg, trigger_latency) {
      out_msg.addr := tbe.addr;
      out_msg.from_hazard := tbe.is_req_hazard || tbe.is_repl_hazard;
    }
  }

  printTBEState(tbe);

  // we might be going to BUSY_INTERRUPTABLE so wakeup pending snoops
  // if any
  wakeupPendingSnps(tbe);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,775 @@
/*
* Copyright (c) 2021 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
machine(MachineType:Cache, "Cache coherency protocol") :
  // Sequencer to insert Load/Store requests.
  // May be null if this is not an L1 cache
  Sequencer * sequencer;
  // Cache for storing local lines.
  // NOTE: it is assumed that cache tag and directory lookups and updates
  // happen in parallel. The cache tag latency is used for both cases.
  CacheMemory * cache;
  // Additional pipeline latency modeling for the different request types
  // When defined, these are applied after the initial tag array read and
  // sending necessary snoops.
  Cycles read_hit_latency := 0;
  Cycles read_miss_latency := 0;
  Cycles write_fe_latency := 0; // Front-end: Rcv req -> Snd req
  Cycles write_be_latency := 0; // Back-end: Rcv ack -> Snd data
  Cycles fill_latency := 0; // Fill latency
  Cycles snp_latency := 0; // Applied before handling any snoop
  Cycles snp_inv_latency := 0; // Additional latency for invalidating snoops
  // Waits for cache data array write to complete before executing next action
  // Note a new write will always block if bank stalls are enabled in the cache
  bool wait_for_cache_wr := "False";
  // Request TBE allocation latency
  Cycles allocation_latency := 0;
  // Enqueue latencies for outgoing messages
  // NOTE: should remove this and only use parameters above?
  Cycles request_latency := 1;
  Cycles response_latency := 1;
  Cycles snoop_latency := 1;
  Cycles data_latency := 1;
  // When an SC fails, unique lines are locked to this controller for a period
  // proportional to the number of consecutive failed SC requests. See
  // the usage of sc_lock_multiplier and llscCheckMonitor for details
  int sc_lock_base_latency_cy := 4;
  int sc_lock_multiplier_inc := 4;
  int sc_lock_multiplier_decay := 1;
  int sc_lock_multiplier_max := 256;
  bool sc_lock_enabled;
  // Recycle latency on resource stalls
  Cycles stall_recycle_lat := 1;
  // Notify the sequencer when a line is evicted. This should be set if the
  // sequencer is not null and handles LL/SC request types.
  bool send_evictions;
  // Number of entries in the snoop and replacement TBE tables
  // notice the "number_of_TBEs" parameter is defined by AbstractController
  int number_of_snoop_TBEs;
  int number_of_repl_TBEs;
  // replacements use the same TBE slot as the request that triggered it
  // in this case the number_of_repl_TBEs parameter is ignored
  bool unify_repl_TBEs;
  // wait for the final tag update to complete before deallocating TBE and
  // going to final stable state
  bool dealloc_wait_for_tag := "False";
  // Width of the data channel. Data transfers are split in multiple messages
  // at the protocol level when this is less than the cache line size.
  int data_channel_size;
  // Set when this is used as the home node and point of coherency of the
  // system. Must be false for every other cache level.
  bool is_HN;
  // Enables direct memory transfers between SNs and RNs when the data is
  // not cached in the HN.
  bool enable_DMT;
  // Use ReadNoSnpSep instead of ReadNoSnp for DMT requests, which allows
  // the TBE to be deallocated at HNFs before the requester receives the data
  bool enable_DMT_early_dealloc := "False";
  // Enables direct cache transfers, i.e., use forwarding snoops whenever
  // possible.
  bool enable_DCT;
  // Use separate Comp/DBIDResp responses for WriteUnique
  bool comp_wu := "False";
  // additional latency for the WU Comp response
  Cycles comp_wu_latency := 0;
  // Controls cache clusivity for different request types.
  // set all alloc_on* to false to completely disable caching
  bool alloc_on_readshared;
  bool alloc_on_readunique;
  bool alloc_on_readonce;
  bool alloc_on_writeback;
  bool alloc_on_seq_acc;
  bool alloc_on_seq_line_write;
  // Controls if the clusivity is strict.
  bool dealloc_on_unique;
  bool dealloc_on_shared;
  bool dealloc_backinv_unique;
  bool dealloc_backinv_shared;
  // If the responder has the line in UC or UD state, propagate this state
  // on a ReadShared. Notice data won't be deallocated if dealloc_on_unique is
  // set
  bool fwd_unique_on_readshared := "False";
  // Allow receiving data in SD state.
  bool allow_SD;
  // stall new requests to destinations with a pending retry
  bool throttle_req_on_retry := "True";
  // Use prefetcher
  bool use_prefetcher, default="false";

  // Message Queues
  // Interface to the network
  // Note vnet_type is used by Garnet only. "response" type is assumed to
  // have data, so use it for data channels and "none" for the rest.
  // network="To" for outbound queue; network="From" for inbound
  // virtual networks: 0=request, 1=snoop, 2=response, 3=data
  MessageBuffer * reqOut, network="To", virtual_network="0", vnet_type="none";
  MessageBuffer * snpOut, network="To", virtual_network="1", vnet_type="none";
  MessageBuffer * rspOut, network="To", virtual_network="2", vnet_type="none";
  MessageBuffer * datOut, network="To", virtual_network="3", vnet_type="response";
  MessageBuffer * reqIn, network="From", virtual_network="0", vnet_type="none";
  MessageBuffer * snpIn, network="From", virtual_network="1", vnet_type="none";
  MessageBuffer * rspIn, network="From", virtual_network="2", vnet_type="none";
  MessageBuffer * datIn, network="From", virtual_network="3", vnet_type="response";

  // Mandatory queue for receiving requests from the sequencer
  MessageBuffer * mandatoryQueue;

  // Internal queue for trigger events
  MessageBuffer * triggerQueue;

  // Internal queue for retry trigger events
  MessageBuffer * retryTriggerQueue;

  // Internal queue for accepted requests
  MessageBuffer * reqRdy;

  // Internal queue for accepted snoops
  MessageBuffer * snpRdy;

  // Internal queue for eviction requests
  MessageBuffer * replTriggerQueue;

  // Prefetch queue for receiving prefetch requests from prefetcher
  MessageBuffer * prefetchQueue;

  // Requests that originated from a prefetch in an upstream cache are treated
  // as demand access in this cache. Notice the demand access stats are still
  // updated only on true demand requests.
  bool upstream_prefetch_trains_prefetcher := "False";
{

  ////////////////////////////////////////////////////////////////////////////
  // States
  ////////////////////////////////////////////////////////////////////////////

  state_declaration(State, default="Cache_State_null") {
    // Stable states
    I, AccessPermission:Invalid, desc="Invalid / not present locally or upstream";

    // States when block is present in local cache only
    SC, AccessPermission:Read_Only, desc="Shared Clean";
    UC, AccessPermission:Read_Write, desc="Unique Clean";
    SD, AccessPermission:Read_Only, desc="Shared Dirty";
    UD, AccessPermission:Read_Write, desc="Unique Dirty";
    UD_T, AccessPermission:Read_Write, desc="UD with use timeout";

    // Invalid in local cache but present in upstream caches
    RU, AccessPermission:Invalid, desc="Upstream requester has line in UD/UC";
    RSC, AccessPermission:Invalid, desc="Upstream requester has line in SC";
    RSD, AccessPermission:Invalid, desc="Upstream requester has line in SD and maybe SC";
    RUSC, AccessPermission:Invalid, desc="RSC + this node still has exclusive access";
    RUSD, AccessPermission:Invalid, desc="RSD + this node still has exclusive access";

    // Both in local and upstream caches. In some cases local maybe stale
    SC_RSC, AccessPermission:Read_Only, desc="SC + RSC";
    SD_RSC, AccessPermission:Read_Only, desc="SD + RSC";
    SD_RSD, AccessPermission:Read_Only, desc="SD + RSD";
    UC_RSC, AccessPermission:Read_Write, desc="UC + RSC";
    UC_RU, AccessPermission:Invalid, desc="UC + RU";
    UD_RU, AccessPermission:Invalid, desc="UD + RU";
    UD_RSD, AccessPermission:Read_Write, desc="UD + RSD";
    UD_RSC, AccessPermission:Read_Write, desc="UD + RSC";

    // Generic transient state
    // There is only a transient "BUSY" state. The actions taken at this state
    // and the final stable state are defined by information in the TBE.
    // While on BUSY_INTR, we will reply to incoming snoops and the
    // state of the cache line may change. While on BUSY_BLKD snoops
    // are blocked
    BUSY_INTR, AccessPermission:Busy, desc="Waiting for data and/or ack";
    BUSY_BLKD, AccessPermission:Busy, desc="Waiting for data and/or ack; blocks snoops";

    // Null state for debugging
    null, AccessPermission:Invalid, desc="Null state";
  }


  ////////////////////////////////////////////////////////////////////////////
  // Events
  ////////////////////////////////////////////////////////////////////////////

  enumeration(Event) {
    // Events triggered by incoming requests. Allocate TBE and move
    // request or snoop to the ready queue
    AllocRequest, desc="Allocates a TBE for a request. Triggers a retry if table is full";
    AllocRequestWithCredit, desc="Allocates a TBE for a request. Always succeeds.";
    AllocSeqRequest, desc="Allocates a TBE for a sequencer request. Stalls requests if table is full";
    AllocPfRequest, desc="Allocates a TBE for a prefetch request. Stalls requests if table is full";
    AllocSnoop, desc="Allocates a TBE for a snoop. Stalls snoop if table is full";

    // Events triggered by sequencer requests or snoops in the rdy queue
    // See CHIRequestType in CHI-msg.sm for descriptions
    Load;
    Store;
    Prefetch;
    ReadShared;
    ReadNotSharedDirty;
    ReadUnique;
    ReadUnique_PoC;
    ReadOnce;
    CleanUnique;
    Evict;
    WriteBackFull;
    WriteEvictFull;
    WriteCleanFull;
    WriteUnique;
    WriteUniquePtl_PoC;
    WriteUniqueFull_PoC;
    WriteUniqueFull_PoC_Alloc;
    SnpCleanInvalid;
    SnpShared;
    SnpSharedFwd;
    SnpNotSharedDirtyFwd;
    SnpUnique;
    SnpUniqueFwd;
    SnpOnce;
    SnpOnceFwd;
    SnpStalled; // A snoop stall triggered from the inport

    // Events triggered by incoming response messages
    // See CHIResponseType in CHI-msg.sm for descriptions
    CompAck;
    Comp_I;
    Comp_UC;
    Comp_SC;
    CompDBIDResp;
    DBIDResp;
    Comp;
    ReadReceipt;
    RespSepData;
    SnpResp_I;
    SnpResp_I_Fwded_UC;
    SnpResp_I_Fwded_UD_PD;
    SnpResp_SC;
    SnpResp_SC_Fwded_SC;
    SnpResp_SC_Fwded_SD_PD;
    SnpResp_UC_Fwded_I;
    SnpResp_UD_Fwded_I;
    SnpResp_SC_Fwded_I;
    SnpResp_SD_Fwded_I;
    RetryAck;
    RetryAck_PoC;
    PCrdGrant;
    PCrdGrant_PoC;
    RetryAck_Hazard;
    RetryAck_PoC_Hazard;
    PCrdGrant_Hazard;
    PCrdGrant_PoC_Hazard;

    // Events triggered by incoming data response messages
    // See CHIDataType in CHI-msg.sm for descriptions
    CompData_I;
    CompData_UC;
    CompData_SC;
    CompData_UD_PD;
    CompData_SD_PD;
    DataSepResp_UC;
    CBWrData_I;
    CBWrData_UC;
    CBWrData_SC;
    CBWrData_UD_PD;
    CBWrData_SD_PD;
    NCBWrData;
    SnpRespData_I;
    SnpRespData_I_PD;
    SnpRespData_SC;
    SnpRespData_SC_PD;
    SnpRespData_SD;
    SnpRespData_UC;
    SnpRespData_UD;
    SnpRespData_SC_Fwded_SC;
    SnpRespData_SC_Fwded_SD_PD;
    SnpRespData_SC_PD_Fwded_SC;
    SnpRespData_I_Fwded_SD_PD;
    SnpRespData_I_PD_Fwded_SC;
    SnpRespData_I_Fwded_SC;

    // We use special events for requests that we detect to be stale. This is
    // done for debugging only. We sent a stale response so the requester can
    // confirm the request is indeed stale and this is not a protocol bug.
    // A Write or Evict becomes stale when the requester receives a snoop that
    // changes the state of the data while the request was pending.
    // Actual CHI implementations don't have this check.
    Evict_Stale;
    WriteBackFull_Stale;
    WriteEvictFull_Stale;
    WriteCleanFull_Stale;

    // Cache fill handling
    CheckCacheFill, desc="Check if need to write or update the cache and trigger any necessary allocation and evictions";

    // Internal requests generated to evict or writeback a local copy
    // to free-up cache space
    Local_Eviction, desc="Evicts/WB the local copy of the line";
    LocalHN_Eviction, desc="Local_Eviction triggered when is HN";
    Global_Eviction, desc="Local_Eviction + back-invalidate line in all upstream requesters";

    // Events triggered from tbe.actions
    // In general, for each event we define a single transition from
    // BUSY_BLKD and/or BUSY_INTR.
    // See processNextState functions and Initiate_* actions.
    // All triggered transitions execute in the same cycle until it has to wait
    // for pending responses or data (set by expected_req_resp and
    // expected_snp_resp). Triggers queued with pushNB are executed even if
    // there are pending messages.

    // Cache/directory access events. Notice these only model the latency.
    TagArrayRead, desc="Read the cache and directory tag array";
    TagArrayWrite, desc="Write the cache and directory tag array";
    DataArrayRead, desc="Read the cache data array";
    DataArrayWrite, desc="Write the cache data array";
    DataArrayWriteOnFill, desc="Write the cache data array (cache fill)";

    // Events for modeling the pipeline latency
    ReadHitPipe, desc="Latency of reads served from local cache";
    ReadMissPipe, desc="Latency of reads not served from local cache";
    WriteFEPipe, desc="Front-end latency of write requests";
    WriteBEPipe, desc="Back-end latency of write requests";
    FillPipe, desc="Cache fill latency";
    SnpSharedPipe, desc="Latency for SnpShared requests";
    SnpInvPipe, desc="Latency for SnpUnique and SnpCleanInv requests";
    SnpOncePipe, desc="Latency for SnpOnce requests";

    // Send a read request downstream.
    SendReadShared, desc="Send a ReadShared or a ReadNotSharedDirty if allow_SD is false";
    SendReadOnce, desc="Send a ReadOnce";
    SendReadNoSnp, desc="Send a SendReadNoSnp";
    SendReadNoSnpDMT, desc="Send a SendReadNoSnp using DMT";
    SendReadUnique, desc="Send a ReadUnique";
    SendCompAck, desc="Send CompAck";

    // Read handling at the completer
    SendCompData, desc="Send CompData";
    WaitCompAck, desc="Expect to receive CompAck";
    SendRespSepData, desc="Send RespSepData for a DMT request";

    // Send a write request downstream.
    SendWriteBackOrWriteEvict, desc="Send a WriteBackFull (if line is UD or SD) or WriteEvictFull (if UC)";
    SendWriteClean, desc="Send a WriteCleanFull";
    SendWriteNoSnp, desc="Send a WriteNoSnp for a full line";
    SendWriteNoSnpPartial, desc="Send a WriteNoSnpPtl";
    SendWriteUnique, desc="Send a WriteUniquePtl";
    SendWBData, desc="Send writeback data";
    SendWUData, desc="Send write unique data";
    SendWUDataCB, desc="Send write unique data from a sequencer callback";

    // Write handling at the completer
    SendCompDBIDResp, desc="Ack WB with CompDBIDResp";
    SendCompDBIDRespStale, desc="Ack stale WB with CompDBIDResp";
    SendCompDBIDResp_WU, desc="Ack WU with CompDBIDResp and set expected data";
    SendDBIDResp_WU, desc="Ack WU with DBIDResp and set expected data";
    SendComp_WU, desc="Ack WU completion";

    // Dataless requests
    SendEvict, desc="Send a Evict";
    SendCompIResp, desc="Ack Evict with Comp_I";
    SendCleanUnique,desc="Send a CleanUnique";
    SendCompUCResp, desc="Ack CleanUnique with Comp_UC";

    // Checks if an upgrade using a CleanUnique was successful
    CheckUpgrade_FromStore, desc="Upgrade needed by a Store";
    CheckUpgrade_FromCU, desc="Upgrade needed by an upstream CleanUnique";
    CheckUpgrade_FromRU, desc="Upgrade needed by an upstream ReadUnique";

    // Snoop requests
    // SnpNotSharedDirty are sent instead of SnpShared for ReadNotSharedDirty
    SendSnpShared, desc="Send a SnpShared/SnpNotSharedDirty to sharer in UC,UD, or SD state";
    SendSnpSharedFwdToOwner, desc="Send a SnpSharedFwd/SnpNotSharedDirtyFwd to sharer in UC,UD, or SD state";
    SendSnpSharedFwdToSharer, desc="Send a SnpSharedFwd/SnpNotSharedDirtyFwd to a sharer in SC state";
    SendSnpOnce, desc="Send a SnpOnce to a sharer";
    SendSnpOnceFwd, desc="Send a SnpOnceFwd to a sharer";
    SendSnpUnique, desc="Send a SnpUnique to all sharers";
    SendSnpUniqueRetToSrc, desc="Send a SnpUnique to all sharers. Sets RetToSrc for only one sharer.";
    SendSnpUniqueFwd, desc="Send a SnpUniqueFwd to a single sharer";
    SendSnpCleanInvalid, desc="Send a SnpCleanInvalid to all sharers";
    SendSnpCleanInvalidNoReq, desc="Send a SnpCleanInvalid to all sharers except requestor";

    // Snoop responses
    SendSnpData, desc="Send SnpRespData as snoop reply";
    SendSnpIResp, desc="Send SnpResp_I as snoop reply";
    SendInvSnpResp, desc="Check data state and queue either SendSnpIResp or SendSnpData";
    SendSnpUniqueFwdCompData, desc="Send CompData to SnpUniqueFwd target and queue either SendSnpFwdedData or SendSnpFwdedResp";
    SendSnpSharedFwdCompData, desc="Send CompData to SnpUniqueFwd target and queue either SendSnpFwdedData or SendSnpFwdedResp";
    SendSnpNotSharedDirtyFwdCompData, desc="Send CompData to SnpNotSharedDirtyFwd target and queue either SendSnpFwdedData or SendSnpFwdedResp";
    SendSnpOnceFwdCompData, desc="Send CompData to SnpOnceFwd target and queue either SendSnpFwdedData or SendSnpFwdedResp";
    SendSnpFwdedData, desc="Send SnpResp for a forwarding snoop";
    SendSnpFwdedResp, desc="Send SnpRespData for a forwarding snoop";

    // Retry handling
    SendRetryAck, desc="Send RetryAck";
    SendPCrdGrant, desc="Send PCrdGrant";
    DoRetry, desc="Resend the current pending request";
    DoRetry_Hazard, desc="DoRetry during a hazard";

    // Misc triggers
    LoadHit, desc="Complete a load hit";
    StoreHit, desc="Complete a store hit";
    UseTimeout, desc="Transition from UD_T -> UD";
    RestoreFromHazard, desc="Restore from a snoop hazard";
    TX_Data, desc="Transmit pending data messages";
    MaintainCoherence, desc="Queues a WriteBack or Evict before dropping the only valid copy of the block";
    FinishCleanUnique, desc="Sends acks and performs any writeback after a CleanUnique";
    ActionStalledOnHazard, desc="Stall a trigger action until we finish handling a snoop hazard";

    // This is triggered once a transaction doesn't have
    // any queued action and is not expecting responses/data. The transaction
    // is finalized and the next stable state is stored in the cache/directory
    // See the processNextState and makeFinalState functions
    Final;

    null;
  }


  ////////////////////////////////////////////////////////////////////////////
  // Data structures
  ////////////////////////////////////////////////////////////////////////////

  // Cache block size
  int blockSize, default="RubySystem::getBlockSizeBytes()";

  // CacheEntry
  structure(CacheEntry, interface="AbstractCacheEntry") {
    State state, desc="SLICC line state";
    DataBlock DataBlk, desc="data for the block";
    bool HWPrefetched, default="false", desc="Set if this cache entry was prefetched";
  }

  // Directory entry
  structure(DirEntry, interface="AbstractCacheEntry", main="false") {
    NetDest sharers, desc="All upstream controllers that have this line (includes owner)";
    MachineID owner, desc="Controller that has the line in UD,UC, or SD state";
    bool ownerExists, default="false", desc="true if owner exists";
    bool ownerIsExcl, default="false", desc="true if owner is UD or UC";
    State state, desc="SLICC line state";
  }

  // Helper class for tracking expected response and data messages
  structure(ExpectedMap, external ="yes") {
    void clear(int dataChunks);
    void addExpectedRespType(CHIResponseType);
    void addExpectedDataType(CHIDataType);
    void setExpectedCount(int val);
    void addExpectedCount(int val);
    bool hasExpected();
    bool hasReceivedResp();
    bool hasReceivedData();
    int expected();
    int received();
    bool receiveResp(CHIResponseType);
    bool receiveData(CHIDataType);
    bool receivedDataType(CHIDataType);
    bool receivedRespType(CHIResponseType);
  }

  // Tracks a pending retry
  structure(RetryQueueEntry) {
    Addr addr, desc="Line address";
    MachineID retryDest, desc="Retry destination";
  }

  // Queue for event triggers. Used to specify a list of actions that need
  // to be performed across multiple transitions.
  // This class is also used to track pending retries
  structure(TriggerQueue, external ="yes") {
    Event front();
    Event back();
    bool frontNB();
    bool backNB();
    bool empty();
    void push(Event);
    void pushNB(Event);
    void pushFront(Event);
    void pushFrontNB(Event);
    void pop();
    // For the retry queue
    void emplace(Addr,MachineID);
    RetryQueueEntry next(); //SLICC won't allow to reuse front()
  }

  // TBE fields
  structure(TBE, desc="Transaction buffer entry definition") {
    // in which table was this allocated
    bool is_req_tbe, desc="Allocated in the request table";
    bool is_snp_tbe, desc="Allocated in the snoop table";
    bool is_repl_tbe, desc="Allocated in the replacements table";

    int storSlot, desc="Slot in the storage tracker occupied by this entry";

    // Transaction info mostly extracted from the request message
    Addr addr, desc="Line address for this TBE";
    Addr accAddr, desc="Access address for Load/Store/WriteUniquePtl; otherwise == addr";
    int accSize, desc="Access size for Load/Store/WriteUniquePtl; otherwise == blockSize";
    CHIRequestType reqType, desc="Request type that initiated this transaction";
    MachineID requestor, desc="Requestor ID";
    MachineID fwdRequestor, desc="Requestor to receive data on fwding snoops";
    bool use_DMT, desc="Use DMT for this transaction";
    bool use_DCT, desc="Use DCT for this transaction";

    // if either is set prefetchers are not notified on miss/hit/fill and
    // demand hit/miss stats are not incremented
    bool is_local_pf, desc="Request generated by a local prefetcher";
    bool is_remote_pf, desc="Request generated a prefetcher in another cache";

    // NOTE: seqReq is a smart pointer pointing to original CPU request object
    // that triggers transactions associated with this TBE. seqReq carries some
    // information (e.g., PC of requesting instruction, virtual address of this
    // request, etc.). Not all transactions have this field set if they are not
    // triggered directly by a demand request from CPU.
    RequestPtr seqReq, default="nullptr", desc="Pointer to original request from CPU/sequencer";
    bool isSeqReqValid, default="false", desc="Set if seqReq is valid (not nullptr)";

    // Transaction state information
    State state, desc="SLICC line state";

    // Transient state information. These are set at the beginning of a
    // transaction and updated as data and responses are received. After
    // finalizing the transactions these are used to create the next SLICC
    // stable state.
    bool hasUseTimeout, desc="Line is locked under store/use timeout";
    DataBlock dataBlk, desc="Local copy of the line";
    WriteMask dataBlkValid, desc="Marks which bytes in the DataBlock are valid";
    bool dataValid, desc="Local copy is valid";
    bool dataDirty, desc="Local copy is dirty";
    bool dataMaybeDirtyUpstream, desc="Line maybe dirty upstream";
    bool dataUnique, desc="Line is unique either locally or upstream";
    bool dataToBeInvalid, desc="Local copy will be invalidated at the end of transaction";
    bool dataToBeSharedClean, desc="Local copy will become SC at the end of transaction";

    NetDest dir_sharers, desc="Upstream controllers that have the line (includes owner)";
    MachineID dir_owner, desc="Owner ID";
    bool dir_ownerExists, desc="Owner ID is valid";
    bool dir_ownerIsExcl, desc="Owner is UD or UC; SD otherwise";

    bool doCacheFill, desc="Write valid data to the cache when completing transaction";

    // NOTE: dataMaybeDirtyUpstream and dir_ownerExists are the same except
    // when we had just sent dirty data upstream and are waiting for ack to set
    // dir_ownerExists

    // Helper structures to track expected events and additional transient
    // state info

    // List of actions to be performed while on a transient state
    // See the processNextState function for details
    TriggerQueue actions, template="<Cache_Event>", desc="List of actions";
    Event pendAction, desc="Current pending action";
    Tick delayNextAction, desc="Delay next action until given tick";
    State finalState, desc="Final state; set when pendAction==Final";

    // List of expected responses and data. Checks the type of data against the
    // expected ones for debugging purposes
    // See the processNextState function for details
    ExpectedMap expected_req_resp, template="<CHIResponseType,CHIDataType>";
    ExpectedMap expected_snp_resp, template="<CHIResponseType,CHIDataType>";
    bool defer_expected_comp; // expect to receive Comp before the end of transaction
    CHIResponseType slicchack1; // fix compiler not including headers
    CHIDataType slicchack2; // fix compiler not including headers

    // Tracks pending data messages that need to be generated when sending
    // a line
    bool snd_pendEv, desc="Is there a pending tx event ?";
    WriteMask snd_pendBytes, desc="Which bytes are pending transmission";
    CHIDataType snd_msgType, desc="Type of message being sent";
    MachineID snd_destination, desc="Data destination";

    // Tracks how to update the directory when receiving a CompAck
    bool updateDirOnCompAck, desc="Update directory on CompAck";
    bool requestorToBeOwner, desc="Sets dir_ownerExists";
    bool requestorToBeExclusiveOwner, desc="Sets dir_ownerIsExcl";
    // NOTE: requestor always added to dir_sharers if updateDirOnCompAck is set

    // Set for incoming snoop requests
    bool snpNeedsData, desc="Set if snoop requires data as response";
    State fwdedState, desc="State of CompData sent due to a forwarding snoop";
    bool is_req_hazard, desc="Snoop hazard with an outstanding request";
    bool is_repl_hazard, desc="Snoop hazard with an outstanding writeback request";
    bool is_stale, desc="Request is now stale because of a snoop hazard";

    // Tracks requests sent downstream
    CHIRequestType pendReqType, desc="Sent request type";
    bool pendReqAllowRetry, desc="Sent request can be retried";
    bool rcvdRetryAck, desc="Received a RetryAck";
    bool rcvdRetryCredit, desc="Received a PCrdGrant";
    // NOTE: the message is retried only after receiving both RetryAck and
    // PCrdGrant. A request can be retried only once.
    // These are a copy of the retry msg fields in case we need to retry
    Addr pendReqAccAddr;
    int pendReqAccSize;
    NetDest pendReqDest;
    bool pendReqD2OrigReq;
    bool pendReqRetToSrc;

    // This TBE stalled a message and thus we need to call wakeUpBuffers
    // at some point
    bool wakeup_pending_req;
    bool wakeup_pending_snp;
    bool wakeup_pending_tgr;
  }

  // TBE table definition
  structure(TBETable, external ="yes") {
    TBE lookup(Addr);
    void allocate(Addr);
    void deallocate(Addr);
    bool isPresent(Addr);
  }

  structure(TBEStorage, external ="yes") {
    int size();
    int capacity();
    int reserved();
    int slotsAvailable();
    bool areNSlotsAvailable(int n);
    void incrementReserved();
    void decrementReserved();
    int addEntryToNewSlot();
    void addEntryToSlot(int slot);
    void removeEntryFromSlot(int slot);
  }

  // Directory memory definition
  structure(PerfectCacheMemory, external = "yes") {
    void allocate(Addr);
    void deallocate(Addr);
    DirEntry lookup(Addr);
    bool isTagPresent(Addr);
  }

  // Directory
  PerfectCacheMemory directory, template="<Cache_DirEntry>";

  // Tracks unique lines locked after a store miss
  TimerTable useTimerTable;

  // Multiplies sc_lock_base_latency to obtain the lock timeout.
  // This is incremented at Profile_Eviction and decays on
  // store miss completion
  int sc_lock_multiplier, default="0";

  // Definitions of the TBE tables

  // Main TBE table used for incoming requests
  TBETable TBEs, template="<Cache_TBE>", constructor="m_number_of_TBEs";
  TBEStorage storTBEs, constructor="this, m_number_of_TBEs";

  // TBE table for WriteBack/Evict requests generated by a replacement
  // Notice storTBEs will be used when unify_repl_TBEs is set
  TBETable replTBEs, template="<Cache_TBE>", constructor="m_unify_repl_TBEs ? m_number_of_TBEs : m_number_of_repl_TBEs";
  TBEStorage storReplTBEs, constructor="this, m_number_of_repl_TBEs";

  // TBE table for incoming snoops
  TBETable snpTBEs, template="<Cache_TBE>", constructor="m_number_of_snoop_TBEs";
  TBEStorage storSnpTBEs, constructor="this, m_number_of_snoop_TBEs";

  // Retry handling

  // Destinations that will be sent PCrdGrant when a TBE becomes available
  TriggerQueue retryQueue, template="<Cache_RetryQueueEntry>";

  // Pending RetryAck/PCrdGrant/DoRetry
  structure(RetryTriggerMsg, interface="Message") {
    Addr addr;
    Event event;
    MachineID retryDest;

    bool functionalRead(Packet *pkt) { return false; }
    bool functionalRead(Packet *pkt, WriteMask &mask) { return false; }
    bool functionalWrite(Packet *pkt) { return false; }
  }

  // Destinations from which we received a RetryAck. Sending new requests to
  // these destinations will be blocked until a PCrdGrant is received if
  // throttle_req_on_retry is set
  NetDest destsWaitingRetry;

  // Pending transaction actions (generated by TBE:actions)
  structure(TriggerMsg, interface="Message") {
    Addr addr;
    bool from_hazard; // this action was generated during a snoop hazard

    bool functionalRead(Packet *pkt) { return false; }
    bool functionalRead(Packet *pkt, WriteMask &mask) { return false; }
    bool functionalWrite(Packet *pkt) { return false; }
  }

  // Internal replacement request
  structure(ReplacementMsg, interface="Message") {
    Addr addr;
    Addr from_addr;
    int slot; // set only when unify_repl_TBEs is set

    bool functionalRead(Packet *pkt) { return false; }
    bool functionalRead(Packet *pkt, WriteMask &mask) { return false; }
    bool functionalWrite(Packet *pkt) { return false; }
  }

  ////////////////////////////////////////////////////////////////////////////
  // Input/output port definitions
  ////////////////////////////////////////////////////////////////////////////

  include "CHI-cache-ports.sm";
  // CHI-cache-ports.sm also includes CHI-cache-funcs.sm

  ////////////////////////////////////////////////////////////////////////////
  // Actions and transitions
  ////////////////////////////////////////////////////////////////////////////

  include "CHI-cache-actions.sm";
  include "CHI-cache-transitions.sm";

}

View File

@@ -0,0 +1,792 @@
/*
* Copyright (c) 2021 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
machine(MachineType:Memory, "Memory controller interface") :
// no explicit modeling of allocation latency like the Caches, so add one
// cycle to the response enqueue latency as default
Cycles response_latency := 2;
Cycles data_latency := 1;
Cycles to_memory_controller_latency := 1;
int data_channel_size;
// Interface to the network
// Note vnet_type is used by Garnet only. "response" type is assumed to
// have data, so use it for data channels and "none" for the rest.
// network="To" for outbound queue; network="From" for inbound
// virtual networks: 0=request, 1=snoop, 2=response, 3=data
MessageBuffer * reqOut, network="To", virtual_network="0", vnet_type="none";
MessageBuffer * snpOut, network="To", virtual_network="1", vnet_type="none";
MessageBuffer * rspOut, network="To", virtual_network="2", vnet_type="none";
MessageBuffer * datOut, network="To", virtual_network="3", vnet_type="response";
MessageBuffer * reqIn, network="From", virtual_network="0", vnet_type="none";
MessageBuffer * snpIn, network="From", virtual_network="1", vnet_type="none";
MessageBuffer * rspIn, network="From", virtual_network="2", vnet_type="none";
MessageBuffer * datIn, network="From", virtual_network="3", vnet_type="response";
// Requests that can allocate a TBE
MessageBuffer * reqRdy;
// Data/ack to/from memory
MessageBuffer * requestToMemory;
MessageBuffer * responseFromMemory;
// Trigger queue for internal events
MessageBuffer * triggerQueue;
{
////////////////////////////////////////////////////////////////////////////
// States
////////////////////////////////////////////////////////////////////////////
state_declaration(State, desc="Transaction states", default="Memory_State_READY") {
// We don't know if the line is cached, so the memory copy is maybe stable
READY, AccessPermission:Backing_Store, desk="Ready to transfer the line";
WAITING_NET_DATA, AccessPermission:Backing_Store_Busy, desc="Waiting data from the network";
SENDING_NET_DATA, AccessPermission:Backing_Store_Busy, desc="Sending data to the network";
READING_MEM, AccessPermission:Backing_Store_Busy, desc="Waiting data from memory";
// Null state for debugging; allow writes
null, AccessPermission:Backing_Store, desc="Null state";
}
////////////////////////////////////////////////////////////////////////////
// Events
////////////////////////////////////////////////////////////////////////////
enumeration(Event, desc="Memory events") {
// Checks if a request can allocate a TBE be moved to reqRdy
CheckAllocTBE;
CheckAllocTBE_WithCredit;
// Requests
WriteNoSnpPtl;
WriteNoSnp;
ReadNoSnp;
ReadNoSnpSep;
// Data
WriteData;
// Memory side
MemoryData;
MemoryAck;
// Internal event triggers
Trigger_Send;
Trigger_SendDone;
Trigger_ReceiveDone;
Trigger_SendRetry;
Trigger_SendPCrdGrant;
}
// Is there a less tedious way to convert messages to events ??
// Translate an incoming CHI request type into the controller event that
// handles it. Any other request type is a protocol error at the memory
// controller and aborts the simulation.
Event reqToEvent (CHIRequestType type) {
if (type == CHIRequestType:ReadNoSnp) {
return Event:ReadNoSnp;
} else if (type == CHIRequestType:ReadNoSnpSep) {
return Event:ReadNoSnpSep;
} else if (type == CHIRequestType:WriteNoSnp) {
return Event:WriteNoSnp;
} else if (type == CHIRequestType:WriteNoSnpPtl) {
return Event:WriteNoSnpPtl;
} else {
error("Invalid CHIRequestType");
}
}
// The memory controller never expects a plain CHI response message; any
// arrival is a protocol error (rspInPort below also errors on any message).
Event respToEvent (CHIResponseType type) {
error("Invalid CHIResponseType");
}
// Translate an incoming CHI data message into an event. Only non-coherent
// write data (NCBWrData) is legal here.
Event dataToEvent (CHIDataType type) {
if (type == CHIDataType:NCBWrData) {
return Event:WriteData;
} else {
error("Invalid CHIDataType");
}
}
////////////////////////////////////////////////////////////////////////////
// Data structures
////////////////////////////////////////////////////////////////////////////
// Cache block size
int blockSize, default="RubySystem::getBlockSizeBytes()";
// TBE fields
// One TBE tracks a single in-flight read or write transaction for a line.
structure(TBE, desc="...") {
int storSlot, desc="Slot in the storage tracker occupied by this entry";
Addr addr, desc="Line address for this TBE";
Addr accAddr, desc="Original access address. Set only for Write*Ptl";
int accSize, desc="Access size. Set only for Write*Ptl";
State state, desc="Current line state";
DataBlock dataBlk, desc="Transaction data";
WriteMask dataBlkValid, desc="valid bytes in dataBlk";
int rxtxBytes, desc="Bytes sent or received";
MachineID requestor, desc="Requestor that originated this request";
MachineID destination, desc="Where we are sending data";
bool useDataSepResp, desc="Replies with DataSepResp instead of CompData";
}
// Address-indexed table of active TBEs (implemented in C++).
structure(TBETable, external ="yes") {
TBE lookup(Addr);
void allocate(Addr);
void deallocate(Addr);
bool isPresent(Addr);
bool areNSlotsAvailable(int n, Tick curTime);
}
// Slot/occupancy tracker used to bound the number of TBEs and to reserve
// slots ahead of allocation (implemented in C++).
structure(TBEStorage, external ="yes") {
int size();
int capacity();
int reserved();
int slotsAvailable();
bool areNSlotsAvailable(int n);
void incrementReserved();
void decrementReserved();
int addEntryToNewSlot();
void removeEntryFromSlot(int slot);
}
TBETable TBEs, template="<Memory_TBE>", constructor="m_number_of_TBEs";
TBEStorage storTBEs, constructor="this, m_number_of_TBEs";
// Tracks all pending MemoryAcks (debug purposes only)
int pendingWrites, default="0";
// Internal self-message used to schedule follow-up events (send next data
// chunk, completion, retry handling). Carries no line data, so functional
// accesses never match it.
structure(TriggerMsg, desc="...", interface="Message") {
Addr addr;
Event event;
MachineID retryDest;
bool functionalRead(Packet *pkt) { return false; }
bool functionalRead(Packet *pkt, WriteMask &mask) { return false; }
bool functionalWrite(Packet *pkt) { return false; }
}
// Tracks a pending credit request from a retry
structure(RetryQueueEntry) {
Addr addr, desc="Line address";
MachineID retryDest, desc="Retry destination";
}
// FIFO of requestors waiting for a PCrdGrant credit (implemented in C++).
structure(TriggerQueue, external ="yes") {
void pop();
bool empty();
void emplace(Addr,MachineID);
RetryQueueEntry next();
}
TriggerQueue retryQueue, template="<Memory_RetryQueueEntry>";
////////////////////////////////////////////////////////////////////////////
// External functions
////////////////////////////////////////////////////////////////////////////
// Declarations of functions provided by the generated C++ controller /
// AbstractController base class.
Tick clockEdge();
Tick curTick();
Tick cyclesToTicks(Cycles c);
void set_tbe(TBE b);
void unset_tbe();
void wakeUpAllBuffers(Addr a);
// True if this memory controller is the backing store for addr.
bool respondsTo(Addr addr);
////////////////////////////////////////////////////////////////////////////
// Interface functions required by SLICC
////////////////////////////////////////////////////////////////////////////
// Return the current transaction state for addr. Lines without an active
// TBE are stable in the backing store and therefore report READY.
State getState(TBE tbe, Addr addr) {
if (is_valid(tbe) == false) {
return State:READY;
}
assert(tbe.addr == addr);
return tbe.state;
}
// Record a state change on the transaction's TBE. Lines without a TBE are
// implicitly READY, so there is nothing to update in that case.
void setState(TBE tbe, Addr addr, State state) {
if (is_valid(tbe)) {
assert(tbe.addr == addr);
tbe.state := state;
}
}
// Report the functional-access permission for addr: Backing_Store when the
// line is idle here, Backing_Store_Busy (via the state mapping) while a
// transaction is in flight, and NotPresent for addresses this controller
// does not back.
AccessPermission getAccessPermission(Addr addr) {
if (respondsTo(addr)) {
TBE tbe := TBEs[addr];
if (is_valid(tbe)) {
DPRINTF(RubySlicc, "%x %s,%s\n", addr, tbe.state, Memory_State_to_permission(tbe.state));
return Memory_State_to_permission(tbe.state);
} else {
DPRINTF(RubySlicc, "%x %s\n", addr, AccessPermission:Backing_Store);
return AccessPermission:Backing_Store;
}
} else {
DPRINTF(RubySlicc, "%x %s\n", addr, AccessPermission:NotPresent);
return AccessPermission:NotPresent;
}
}
// Permissions are derived from the TBE state above, so there is nothing to
// store separately; intentionally a no-op.
void setAccessPermission(Addr addr, State state) {
}
// Partial functional read: fill the packet from the backing store first
// (only if no other controller supplied bytes yet, i.e. mask is empty),
// then overlay any newer transient bytes buffered in the TBE. 'mask'
// accumulates which bytes of the packet have been satisfied.
void functionalRead(Addr addr, Packet *pkt, WriteMask &mask) {
if (respondsTo(addr)) {
DPRINTF(RubySlicc, "functionalRead %x\n", addr);
TBE tbe := TBEs[addr];
if (mask.isEmpty()) {
functionalMemoryRead(pkt);
mask.fillMask();
DPRINTF(RubySlicc, "functionalRead mem %x %s\n", addr, mask);
}
// Update with any transient data
//TODO additional handling of partial data ??
if (is_valid(tbe)) {
// Only bytes inside the original access window AND already received
// from the network are newer than the memory copy.
WriteMask read_mask;
read_mask.setMask(addressOffset(tbe.accAddr, tbe.addr), tbe.accSize);
read_mask.andMask(tbe.dataBlkValid);
if (read_mask.isEmpty() == false) {
testAndReadMask(addr, tbe.dataBlk, read_mask, pkt);
DPRINTF(RubySlicc, "functionalRead tbe %x %s %s %s\n", addr, tbe.dataBlk, read_mask, mask);
mask.orMask(read_mask);
}
}
}
}
// Functional write: patch both the in-flight copy buffered in the TBE (if
// any) and the backing store itself. Returns the number of successful
// functional writes performed, 0 if this controller does not back addr.
int functionalWrite(Addr addr, Packet *pkt) {
if(respondsTo(addr)) {
int num_functional_writes := 0;
TBE tbe := TBEs[addr];
if (is_valid(tbe)) {
num_functional_writes := num_functional_writes +
testAndWrite(addr, tbe.dataBlk, pkt);
DPRINTF(RubySlicc, "functionalWrite tbe %x %s\n", addr, tbe.dataBlk);
}
num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
DPRINTF(RubySlicc, "functionalWrite mem %x\n", addr);
return num_functional_writes;
} else {
return 0;
}
}
////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////
// Debug helper: dump TBE occupancy and the current depth of every message
// buffer. Called from each in_port before triggering an event.
void printResources() {
DPRINTF(RubySlicc, "Resources(avail/max): TBEs=%d/%d\n",
storTBEs.size(), storTBEs.capacity());
DPRINTF(RubySlicc, "Resources(in/out size): rdy=%d req=%d/%d rsp=%d/%d dat=%d/%d snp=%d/%d\n",
reqRdy.getSize(curTick()),
reqIn.getSize(curTick()), reqOut.getSize(curTick()),
rspIn.getSize(curTick()), rspOut.getSize(curTick()),
datIn.getSize(curTick()), datOut.getSize(curTick()),
snpIn.getSize(curTick()), snpOut.getSize(curTick()));
}
////////////////////////////////////////////////////////////////////////////
// Input/output port definitions
////////////////////////////////////////////////////////////////////////////
// Outbound port definitions
out_port(reqOutPort, CHIRequestMsg, reqOut);
out_port(snpOutPort, CHIRequestMsg, snpOut);
out_port(rspOutPort, CHIResponseMsg, rspOut);
out_port(datOutPort, CHIDataMsg, datOut);
out_port(triggerOutPort, TriggerMsg, triggerQueue);
out_port(memQueue_out, MemoryMsg, requestToMemory);
out_port(reqRdyOutPort, CHIRequestMsg, reqRdy);
// Inbound port definitions
// NOTE(review): the rank= values order the ports' polling priority
// (responses/data before new requests) -- confirm ordering semantics
// against the SLICC in_port documentation.
// Response
// Memory never receives plain responses; any message here is fatal.
in_port(rspInPort, CHIResponseMsg, rspIn, rank=6) {
if (rspInPort.isReady(clockEdge())) {
printResources();
peek(rspInPort, CHIResponseMsg) {
error("Unexpected message");
}
}
}
// Data
// Write data from the network; must carry 1..data_channel_size valid bytes.
in_port(datInPort, CHIDataMsg, datIn, rank=5) {
if (datInPort.isReady(clockEdge())) {
printResources();
peek(datInPort, CHIDataMsg) {
int received := in_msg.bitMask.count();
assert((received <= data_channel_size) && (received > 0));
trigger(dataToEvent(in_msg.type), in_msg.addr, TBEs[in_msg.addr]);
}
}
}
// Data/Ack from memory
in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=4) {
if (memQueue_in.isReady(clockEdge())) {
printResources();
peek(memQueue_in, MemoryMsg) {
Addr addr := makeLineAddress(in_msg.addr);
if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
trigger(Event:MemoryData, addr, TBEs[addr]);
} else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
trigger(Event:MemoryAck, addr, TBEs[addr]);
} else {
error("Invalid message");
}
}
}
}
// Trigger
// Internally-scheduled follow-up events (see TriggerMsg above).
in_port(triggerInPort, TriggerMsg, triggerQueue, rank=3) {
if (triggerInPort.isReady(clockEdge())) {
printResources();
peek(triggerInPort, TriggerMsg) {
trigger(in_msg.event, in_msg.addr, TBEs[in_msg.addr]);
}
}
}
// Snoops
// Memory is never snooped; any message here is fatal.
in_port(snpInPort, CHIRequestMsg, snpIn, rank=2) {
if (snpInPort.isReady(clockEdge())) {
printResources();
peek(snpInPort, CHIRequestMsg) {
error("Unexpected message");
}
}
}
// Requests
// Requests that already passed the TBE-availability check (see reqInPort).
in_port(reqRdyInPort, CHIRequestMsg, reqRdy, rank=1) {
if (reqRdyInPort.isReady(clockEdge())) {
printResources();
peek(reqRdyInPort, CHIRequestMsg) {
trigger(reqToEvent(in_msg.type), in_msg.addr, TBEs[in_msg.addr]);
}
}
}
// Fresh requests from the network: first check whether a TBE slot can be
// reserved (or was already reserved via a credit) before admitting them.
in_port(reqInPort, CHIRequestMsg, reqIn, rank=0) {
if (reqInPort.isReady(clockEdge())) {
printResources();
peek(reqInPort, CHIRequestMsg) {
if (in_msg.allowRetry) {
trigger(Event:CheckAllocTBE, in_msg.addr, TBEs[in_msg.addr]);
} else {
// Only expected requests that do not allow retry are the ones that
// are being retried after receiving credit
trigger(Event:CheckAllocTBE_WithCredit,
in_msg.addr, TBEs[in_msg.addr]);
}
}
}
}
////////////////////////////////////////////////////////////////////////////
// Actions
////////////////////////////////////////////////////////////////////////////
// Admission control for fresh requests: either reserve a TBE slot and
// forward the request to reqRdy, or (if no slot is free) send a RetryAck
// and remember the requestor for a later PCrdGrant credit.
action(checkAllocateTBE, desc="") {
// Move to reqRdy if resources available, otherwise send retry
if (storTBEs.areNSlotsAvailable(1)) {
// reserve a slot for this request
storTBEs.incrementReserved();
peek(reqInPort, CHIRequestMsg) {
enqueue(reqRdyOutPort, CHIRequestMsg, 0) {
out_msg := in_msg;
}
}
} else {
peek(reqInPort, CHIRequestMsg) {
assert(in_msg.allowRetry);
enqueue(triggerOutPort, TriggerMsg, 0) {
out_msg.addr := in_msg.addr;
out_msg.event := Event:Trigger_SendRetry;
out_msg.retryDest := in_msg.requestor;
retryQueue.emplace(in_msg.addr,in_msg.requestor);
}
}
}
reqInPort.dequeue(clockEdge());
}
// Admission for a credited retry: the slot was reserved when the PCrdGrant
// was issued (see deallocateTBE), so the request goes straight to reqRdy.
action(checkAllocateTBE_withCredit, desc="") {
// We must have reserved resources for this request
peek(reqInPort, CHIRequestMsg) {
assert(in_msg.allowRetry == false);
enqueue(reqRdyOutPort, CHIRequestMsg, 0) {
out_msg := in_msg;
}
}
reqInPort.dequeue(clockEdge());
}
// Turn a previously reserved slot into an actual TBE for 'address' and make
// it the current tbe.
action(allocateTBE, "atbe", desc="Allocate TBEs for a miss") {
// We must have reserved resources for this allocation
storTBEs.decrementReserved();
assert(storTBEs.areNSlotsAvailable(1));
TBEs.allocate(address);
set_tbe(TBEs[address]);
tbe.storSlot := storTBEs.addEntryToNewSlot();
tbe.addr := address;
tbe.rxtxBytes := 0;
tbe.useDataSepResp := false;
}
// Populate the TBE from the admitted request: who asked, where the data
// should go (DMT/DCT forwards go to fwdRequestor), and the access window.
action(initializeFromReqTBE, "itbe", desc="Initialize TBE fields") {
peek(reqRdyInPort, CHIRequestMsg) {
tbe.requestor := in_msg.requestor;
if (in_msg.dataToFwdRequestor) {
tbe.destination := in_msg.fwdRequestor;
} else {
tbe.destination := in_msg.requestor;
}
tbe.accAddr := in_msg.accAddr;
tbe.accSize := in_msg.accSize;
}
}
// Bookkeeping for the sanity counter incremented in sendMemoryWrite.
action(decWritePending, "dwp", desc="Decrement pending writes") {
assert(pendingWrites >= 1);
pendingWrites := pendingWrites - 1;
}
// Release the TBE and its slot. If a requestor is waiting after a retry,
// immediately re-reserve the freed slot for it and schedule a PCrdGrant.
action(deallocateTBE, "dtbe", desc="Deallocate TBEs") {
assert(is_valid(tbe));
storTBEs.removeEntryFromSlot(tbe.storSlot);
TBEs.deallocate(address);
unset_tbe();
// send credit if requestor waiting for it
if (retryQueue.empty() == false) {
assert(storTBEs.areNSlotsAvailable(1));
storTBEs.incrementReserved();
RetryQueueEntry e := retryQueue.next();
retryQueue.pop();
enqueue(triggerOutPort, TriggerMsg, 0) {
out_msg.addr := e.addr;
out_msg.retryDest := e.retryDest;
out_msg.event := Event:Trigger_SendPCrdGrant;
}
}
}
// Acknowledge a ReadNoSnpSep with a ReadReceipt; the data phase will then
// use DataSepResp instead of CompData (see sendDataAndCheck).
action(sendReadReceipt, "sRR", desc="Send receipt to requestor") {
assert(is_valid(tbe));
enqueue(rspOutPort, CHIResponseMsg, response_latency) {
out_msg.addr := address;
out_msg.type := CHIResponseType:ReadReceipt;
out_msg.responder := machineID;
out_msg.Destination.add(tbe.requestor);
}
// also send different type of data when ready
tbe.useDataSepResp := true;
}
// Acknowledge a WriteNoSnp/WriteNoSnpPtl; the requestor may now send the
// write data on the data channel.
action(sendCompDBIDResp, "sCbid", desc="Send ack to requestor") {
assert(is_valid(tbe));
enqueue(rspOutPort, CHIResponseMsg, response_latency) {
out_msg.addr := address;
out_msg.type := CHIResponseType:CompDBIDResp;
out_msg.responder := machineID;
out_msg.Destination.add(tbe.requestor);
}
}
// Issue a full-line read to the attached memory; the reply arrives on
// memQueue_in as MemoryData.
action(sendMemoryRead, "smr", desc="Send request to memory") {
assert(is_valid(tbe));
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := address;
out_msg.Type := MemoryRequestType:MEMORY_READ;
out_msg.Sender := tbe.requestor;
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.Len := 0;
}
}
// Issue a (possibly partial) writeback of the collected data to memory;
// the ack arrives on memQueue_in as MemoryAck and is only counted.
action(sendMemoryWrite, "smw", desc="Send request to memory") {
assert(is_valid(tbe));
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := tbe.accAddr;
out_msg.Type := MemoryRequestType:MEMORY_WB;
out_msg.Sender := tbe.requestor;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := tbe.dataBlk;
out_msg.Len := tbe.accSize;
}
tbe.dataBlkValid.clear();
pendingWrites := pendingWrites + 1;
}
// Stage memory read data in the TBE and reset the transfer byte counter so
// sendDataAndCheck can stream the line out in channel-sized chunks.
action(prepareSend, "ps", desc="Copies received memory data to TBE") {
assert(is_valid(tbe));
peek(memQueue_in, MemoryMsg) {
tbe.dataBlk := in_msg.DataBlk;
}
tbe.rxtxBytes := 0;
tbe.dataBlkValid.setMask(addressOffset(tbe.accAddr, tbe.addr), tbe.accSize);
}
// Merge one flit of network write data into the TBE, tracking which bytes
// are valid and how many have arrived so far.
action(copyWriteDataToTBE, "cpWDat", desc="Copies received net data to TBE") {
peek(datInPort, CHIDataMsg) {
assert(is_valid(tbe));
tbe.dataBlk.copyPartial(in_msg.dataBlk, in_msg.bitMask);
tbe.dataBlkValid.orMask(in_msg.bitMask);
tbe.rxtxBytes := tbe.rxtxBytes + in_msg.bitMask.count();
}
}
// Send the next data_channel_size chunk of the line to the destination and
// schedule either the next chunk (Trigger_Send, next cycle) or completion
// (Trigger_SendDone). Chunks are sent in address order, so rxtxBytes
// doubles as the byte offset of the next chunk.
action(sendDataAndCheck, "sd", desc="Send received data to requestor") {
assert(is_valid(tbe));
assert(tbe.rxtxBytes < blockSize);
enqueue(datOutPort, CHIDataMsg, data_latency) {
out_msg.addr := tbe.addr;
if (tbe.useDataSepResp) {
out_msg.type := CHIDataType:DataSepResp_UC;
} else {
out_msg.type := CHIDataType:CompData_UC;
}
out_msg.dataBlk := tbe.dataBlk;
// Called in order for the whole block so use rxtxBytes as offset
out_msg.bitMask.setMask(tbe.rxtxBytes, data_channel_size);
out_msg.Destination.add(tbe.destination);
}
//DPRINTF(RubySlicc, "rxtxBytes=%d\n", tbe.rxtxBytes);
tbe.rxtxBytes := tbe.rxtxBytes + data_channel_size;
// end or send next chunk next cycle
Event next := Event:Trigger_SendDone;
Cycles delay := intToCycles(0);
if (tbe.rxtxBytes < blockSize) {
next := Event:Trigger_Send;
delay := intToCycles(1);
}
enqueue(triggerOutPort, TriggerMsg, delay) {
out_msg.addr := address;
out_msg.event := next;
}
}
// After each received write-data flit, check whether the full access window
// has arrived; if so, reset the counter and fire Trigger_ReceiveDone.
action(checkForReceiveCompletion, "cWc", desc="Check if all data is received") {
assert(is_valid(tbe));
DPRINTF(RubySlicc, "rxtxBytes=%d\n", tbe.rxtxBytes);
assert((tbe.rxtxBytes <= tbe.accSize) && (tbe.rxtxBytes > 0));
if (tbe.rxtxBytes == tbe.accSize) {
enqueue(triggerOutPort, TriggerMsg, 0) {
out_msg.addr := address;
out_msg.event := Event:Trigger_ReceiveDone;
}
tbe.rxtxBytes := 0;
assert(tbe.dataBlkValid.getMask(addressOffset(tbe.accAddr, tbe.addr), tbe.accSize));
}
}
// Dequeue helpers: consume the message that triggered the current event.
action(popReqInQueue, "preq", desc="Pop request queue.") {
reqRdyInPort.dequeue(clockEdge());
}
action(popDataInQueue, "pdata", desc="Pop data queue.") {
datInPort.dequeue(clockEdge());
}
action(popTriggerQueue, "ptrigger", desc="Pop trigger queue.") {
triggerInPort.dequeue(clockEdge());
}
action(popMemoryQueue, "pmem", desc="Pop memory queue.") {
memQueue_in.dequeue(clockEdge());
}
// Stall/wake-up only used for requests that arrive when we are on the
// WAITING_NET_DATA state. For all other case the line should be either
// ready or we can overlap
// Park the admitted request until the current transaction on this address
// finishes; wakeUpStalled below re-enables it.
action(stallRequestQueue, "str", desc="Stall and wait on the address") {
peek(reqRdyInPort, CHIRequestMsg){
stall_and_wait(reqRdyInPort, address);
}
}
action(wakeUpStalled, "wa", desc="Wake up any requests waiting for this address") {
wakeUpAllBuffers(address);
}
// Tell a rejected requestor to retry later (it will wait for a PCrdGrant).
action(sendRetryAck, desc="") {
peek(triggerInPort, TriggerMsg) {
enqueue(rspOutPort, CHIResponseMsg, response_latency) {
out_msg.addr := in_msg.addr;
out_msg.type := CHIResponseType:RetryAck;
out_msg.responder := machineID;
out_msg.Destination.add(in_msg.retryDest);
}
}
}
// Grant a credit to a previously retried requestor; it may then resend the
// request with allowRetry=false (see reqInPort).
action(sendPCrdGrant, desc="") {
peek(triggerInPort, TriggerMsg) {
enqueue(rspOutPort, CHIResponseMsg, response_latency) {
out_msg.addr := in_msg.addr;
out_msg.type := CHIResponseType:PCrdGrant;
out_msg.responder := machineID;
out_msg.Destination.add(in_msg.retryDest);
}
}
}
////////////////////////////////////////////////////////////////////////////
// Transitions
////////////////////////////////////////////////////////////////////////////
// Reads: allocate a TBE, fetch the line from memory, then stream it out.
transition(READY, ReadNoSnp, READING_MEM) {
allocateTBE;
initializeFromReqTBE;
sendMemoryRead;
popReqInQueue;
}
// ReadNoSnpSep additionally sends a ReadReceipt and replies with
// DataSepResp instead of CompData.
transition(READY, ReadNoSnpSep, READING_MEM) {
allocateTBE;
initializeFromReqTBE;
sendMemoryRead;
sendReadReceipt;
popReqInQueue;
}
transition(READING_MEM, MemoryData, SENDING_NET_DATA) {
prepareSend;
sendDataAndCheck;
popMemoryQueue;
}
transition(SENDING_NET_DATA, Trigger_Send) {
sendDataAndCheck;
popTriggerQueue;
}
// Writes: allocate a TBE, ack with CompDBIDResp, then wait for the data.
transition(READY, WriteNoSnpPtl, WAITING_NET_DATA) {
allocateTBE;
initializeFromReqTBE;
sendCompDBIDResp;
popReqInQueue;
}
transition(READY, WriteNoSnp, WAITING_NET_DATA) {
allocateTBE;
initializeFromReqTBE;
sendCompDBIDResp;
popReqInQueue;
}
transition(WAITING_NET_DATA, WriteData) {
copyWriteDataToTBE;
checkForReceiveCompletion;
popDataInQueue;
}
transition(WAITING_NET_DATA, Trigger_ReceiveDone, READY) {
sendMemoryWrite;
deallocateTBE;
wakeUpStalled;
popTriggerQueue;
}
transition(SENDING_NET_DATA, Trigger_SendDone, READY) {
deallocateTBE;
wakeUpStalled;
popTriggerQueue;
}
// Just sanity check against counter of pending acks
transition({READING_MEM,WAITING_NET_DATA,SENDING_NET_DATA,READY},
MemoryAck) {
decWritePending;
popMemoryQueue;
}
// Notice we only use this here and call wakeUp when leaving this state
// BUGFIX: WriteNoSnp was missing from the stall set even though it is an
// admitted request that allocates a TBE from READY; a WriteNoSnp arriving
// while the line is busy would otherwise hit an invalid transition.
transition({READING_MEM,WAITING_NET_DATA,SENDING_NET_DATA},
{ReadNoSnp, ReadNoSnpSep, WriteNoSnpPtl, WriteNoSnp}) {
stallRequestQueue;
}
transition({READING_MEM,WAITING_NET_DATA,SENDING_NET_DATA,READY},
Trigger_SendRetry) {
sendRetryAck;
popTriggerQueue;
}
transition({READING_MEM,WAITING_NET_DATA,SENDING_NET_DATA,READY},
Trigger_SendPCrdGrant) {
sendPCrdGrant;
popTriggerQueue;
}
// Admission checks can fire in any state; they only touch the reservation
// counters and the reqRdy queue, never the current line.
transition({READING_MEM,WAITING_NET_DATA,SENDING_NET_DATA,READY},
CheckAllocTBE) {
checkAllocateTBE;
}
transition({READING_MEM,WAITING_NET_DATA,SENDING_NET_DATA,READY},
CheckAllocTBE_WithCredit) {
checkAllocateTBE_withCredit;
}
}

View File

@@ -0,0 +1,234 @@
/*
* Copyright (c) 2021 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// All CHI request and response types match the name style in the standard doc.
// For a description of a specific message type, refer to the Arm's AMBA 5
// CHI specification (issue D):
// https://static.docs.arm.com/ihi0050/d/
// IHI0050D_amba_5_chi_architecture_spec.pdf
enumeration(CHIRequestType, desc="") {
// Incoming requests generated by the sequencer
Load;
Store;
StoreLine;
// CHI request types
ReadShared;
ReadNotSharedDirty;
ReadUnique;
ReadOnce;
CleanUnique;
Evict;
WriteBackFull;
WriteCleanFull;
WriteEvictFull;
WriteUniquePtl;
WriteUniqueFull;
// Snoop requests (sent on the snoop channel)
SnpSharedFwd;
SnpNotSharedDirtyFwd;
SnpUniqueFwd;
SnpOnceFwd;
SnpOnce;
SnpShared;
SnpUnique;
SnpCleanInvalid;
// Non-coherent requests handled by the memory controllers
WriteNoSnpPtl;
WriteNoSnp;
ReadNoSnp;
ReadNoSnpSep;
null;
}
// Request message carried on the CHI request and snoop channels.
structure(CHIRequestMsg, desc="", interface="Message") {
Addr addr, desc="Request line address";
Addr accAddr, desc="Original access address. Set for Write*Ptl and requests from the sequencer";
int accSize, desc="Access size. Set for Write*Ptl and requests from the sequencer";
CHIRequestType type, desc="Request type";
MachineID requestor, desc="Requestor ID";
MachineID fwdRequestor, desc="Where to send data for DMT/DCT requests";
bool dataToFwdRequestor, desc="Data has to be forwarded to fwdRequestor";
bool retToSrc, desc="Affects whether or not a snoop resp returns data";
bool allowRetry, desc="This request can be retried";
NetDest Destination, desc="Message destination";
RequestPtr seqReq, default="nullptr", desc="Pointer to original request from CPU/sequencer (nullptr if not valid)";
bool isSeqReqValid, default="false", desc="Set if seqReq is valid (not nullptr)";
bool is_local_pf, desc="Request generated by a local prefetcher";
bool is_remote_pf, desc="Request generated a prefetcher in another cache";
MessageSizeType MessageSize, default="MessageSizeType_Control";
// No data for functional access
bool functionalRead(Packet *pkt) { return false; }
bool functionalRead(Packet *pkt, WriteMask &mask) { return false; }
bool functionalWrite(Packet *pkt) { return false; }
}
// Dataless response types carried on the CHI response channel.
enumeration(CHIResponseType, desc="...") {
// CHI response types
Comp_I;
Comp_UC;
Comp_SC;
CompAck;
CompDBIDResp;
DBIDResp;
Comp;
ReadReceipt;
RespSepData;
// Snoop responses; *_Fwded_* variants report the state forwarded to the
// original requestor on a DCT snoop.
SnpResp_I;
SnpResp_I_Fwded_UC;
SnpResp_I_Fwded_UD_PD;
SnpResp_SC;
SnpResp_SC_Fwded_SC;
SnpResp_SC_Fwded_SD_PD;
SnpResp_UC_Fwded_I;
SnpResp_UD_Fwded_I;
SnpResp_SC_Fwded_I;
SnpResp_SD_Fwded_I;
// Retry flow
RetryAck;
PCrdGrant;
null;
}
// Dataless response message; carries no line data, so functional accesses
// never match it.
structure(CHIResponseMsg, desc="", interface="Message") {
Addr addr, desc="Line address";
CHIResponseType type, desc="Response type";
MachineID responder, desc="Responder ID";
NetDest Destination, desc="Response destination";
bool stale, desc="Response to a stale request";
//NOTE: not in CHI and for debugging only
MessageSizeType MessageSize, default="MessageSizeType_Control";
// No data for functional access
bool functionalRead(Packet *pkt) { return false; }
bool functionalRead(Packet *pkt, WriteMask &mask) { return false; }
bool functionalWrite(Packet *pkt) { return false; }
}
// Data message types; the *_PD variants pass dirty (PassDirty) data, which
// matters for functional reads (see CHIDataMsg below).
enumeration(CHIDataType, desc="...") {
// CHI data response types
CompData_I;
CompData_UC;
CompData_SC;
CompData_UD_PD;
CompData_SD_PD;
DataSepResp_UC;
// Copy-back write data, named after the cache state when sent
CBWrData_UC;
CBWrData_SC;
CBWrData_UD_PD;
CBWrData_SD_PD;
CBWrData_I;
// Non-coherent write data
NCBWrData;
// Snoop response data
SnpRespData_I;
SnpRespData_I_PD;
SnpRespData_SC;
SnpRespData_SC_PD;
SnpRespData_SD;
SnpRespData_UC;
SnpRespData_UD;
SnpRespData_SC_Fwded_SC;
SnpRespData_SC_Fwded_SD_PD;
SnpRespData_SC_PD_Fwded_SC;
SnpRespData_I_Fwded_SD_PD;
SnpRespData_I_PD_Fwded_SC;
SnpRespData_I_Fwded_SC;
null;
}
// Data message; may carry a partial line (bitMask marks the valid bytes
// within dataBlk).
structure(CHIDataMsg, desc="", interface="Message") {
Addr addr, desc="Line address";
CHIDataType type, desc="Response type";
MachineID responder, desc="Responder ID";
NetDest Destination, desc="Response destination";
DataBlock dataBlk, desc="Line data";
WriteMask bitMask, desc="Which bytes in the data block are valid";
MessageSizeType MessageSize, default="MessageSizeType_Data";
// Whole-line functional read: only safe when every byte is valid.
bool functionalRead(Packet *pkt) {
if(bitMask.isFull()) {
return testAndRead(addr, dataBlk, pkt);
} else {
return false;
}
}
// Partial functional read: contribute this message's valid bytes when they
// add coverage beyond 'mask' or when the data is dirty (a dirty in-flight
// copy supersedes what the backing store returned).
bool functionalRead(Packet *pkt, WriteMask &mask) {
// read if bitmask has bytes not in mask or if data is dirty
bool is_dirty := (type == CHIDataType:CompData_UD_PD) ||
(type == CHIDataType:CompData_SD_PD) ||
(type == CHIDataType:CBWrData_UD_PD) ||
(type == CHIDataType:CBWrData_SD_PD) ||
(type == CHIDataType:NCBWrData) ||
(type == CHIDataType:SnpRespData_I_PD) ||
(type == CHIDataType:SnpRespData_SC_PD) ||
(type == CHIDataType:SnpRespData_SD) ||
(type == CHIDataType:SnpRespData_UD) ||
(type == CHIDataType:SnpRespData_SC_Fwded_SD_PD) ||
(type == CHIDataType:SnpRespData_SC_PD_Fwded_SC) ||
(type == CHIDataType:SnpRespData_I_Fwded_SD_PD) ||
(type == CHIDataType:SnpRespData_I_PD_Fwded_SC);
assert(bitMask.isEmpty() == false);
// test_mask = mask | bitMask; differs from mask iff we add new coverage
WriteMask test_mask := mask;
test_mask.orMask(bitMask);
if ((test_mask.cmpMask(mask) == false) || is_dirty) {
if (testAndReadMask(addr, dataBlk, bitMask, pkt)) {
mask.orMask(bitMask);
return true;
}
}
return false;
}
bool functionalWrite(Packet *pkt) {
return testAndWrite(addr, dataBlk, pkt);
}
}

View File

@@ -0,0 +1,6 @@
protocol "CHI";
include "RubySlicc_interfaces.slicc";
include "CHI-msg.sm";
include "CHI-cache.sm";
include "CHI-mem.sm";

View File

@@ -0,0 +1,47 @@
# -*- mode:python -*-
# Copyright (c) 2021 ARM Limited
# All rights reserved.
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Import('*')
# Register this protocol with gem5/SCons
all_protocols.append('CHI')
# CHI requires Ruby's interface to support partial functional reads
need_partial_func_reads.append('CHI')
protocol_dirs.append(Dir('.').abspath)

View File

@@ -45,9 +45,6 @@ if env['PROTOCOL'] == 'None':
env.Append(CPPDEFINES=['PROTOCOL_' + env['PROTOCOL']])
# list of protocols that require the partial functional read interface
need_partial_func_reads = []
if env['PROTOCOL'] in need_partial_func_reads:
env.Append(CPPDEFINES=['PARTIAL_FUNC_READS'])

View File

@@ -368,7 +368,7 @@ def createCxxConfigDirectoryEntryFile(code, name, simobj, is_header):
if not is_header:
code('{')
if hasattr(simobj, 'abstract') and simobj.abstract:
if getattr(simobj, 'abstract', False):
code(' return NULL;')
else:
code(' return this->create();')
@@ -700,6 +700,80 @@ class MetaSimObject(type):
def pybind_predecls(cls, code):
code('#include "${{cls.cxx_header}}"')
def cxx_param_def(cls, code):
code('''
#include <type_traits>
#include "base/compiler.hh"
#include "${{cls.cxx_header}}"
#include "params/${cls}.hh"
''')
code()
code('namespace')
code('{')
code()
# If we can't define a default create() method for this params struct
# because the SimObject doesn't have the right constructor, use
# template magic to make it so we're actually defining a create method
# for this class instead.
code('class Dummy${cls}ParamsClass')
code('{')
code(' public:')
code(' ${{cls.cxx_class}} *create() const;')
code('};')
code()
code('template <class CxxClass, class Enable=void>')
code('class Dummy${cls}Shunt;')
code()
# This version directs to the real Params struct and the default
# behavior of create if there's an appropriate constructor.
code('template <class CxxClass>')
code('class Dummy${cls}Shunt<CxxClass, std::enable_if_t<')
code(' std::is_constructible<CxxClass,')
code(' const ${cls}Params &>::value>>')
code('{')
code(' public:')
code(' using Params = ${cls}Params;')
code(' static ${{cls.cxx_class}} *')
code(' create(const Params &p)')
code(' {')
code(' return new CxxClass(p);')
code(' }')
code('};')
code()
# This version diverts to the DummyParamsClass and a dummy
# implementation of create if the appropriate constructor does not
# exist.
code('template <class CxxClass>')
code('class Dummy${cls}Shunt<CxxClass, std::enable_if_t<')
code(' !std::is_constructible<CxxClass,')
code(' const ${cls}Params &>::value>>')
code('{')
code(' public:')
code(' using Params = Dummy${cls}ParamsClass;')
code(' static ${{cls.cxx_class}} *')
code(' create(const Params &p)')
code(' {')
code(' return nullptr;')
code(' }')
code('};')
code()
code('} // anonymous namespace')
code()
# An implementation of either the real Params struct's create
# method, or the Dummy one. Either an implementation is
# mandantory since this was shunted off to the dummy class, or
# one is optional which will override this weak version.
code('M5_VAR_USED ${{cls.cxx_class}} *')
code('Dummy${cls}Shunt<${{cls.cxx_class}}>::Params::create() const')
code('{')
code(' return Dummy${cls}Shunt<${{cls.cxx_class}}>::')
code(' create(*this);')
code('}')
def pybind_decl(cls, code):
py_class_name = cls.pybind_class
@@ -713,9 +787,6 @@ class MetaSimObject(type):
code('''#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include <type_traits>
#include "base/compiler.hh"
#include "params/$cls.hh"
#include "python/pybind11/core.hh"
#include "sim/init.hh"
@@ -797,76 +868,6 @@ module_init(py::module_ &m_internal)
code()
code('static EmbeddedPyBind embed_obj("${0}", module_init, "${1}");',
cls, cls._base.type if cls._base else "")
if not hasattr(cls, 'abstract') or not cls.abstract:
if 'type' in cls.__dict__:
code()
# This namespace can't *actually* be anonymous, or the compiler
# gets upset about having a weak symbol init.
code('namespace anonymous_params')
code('{')
code()
# If we can't define a default create() method for this params
# struct because the SimObject doesn't have the right
# constructor, use template magic to make it so we're actually
# defining a create method for this class instead.
code('class Dummy${cls}ParamsClass')
code('{')
code(' public:')
code(' ${{cls.cxx_class}} *create() const;')
code('};')
code()
code('template <class CxxClass, class Enable=void>')
code('class DummyShunt;')
code()
# This version directs to the real Params struct and the
# default behavior of create if there's an appropriate
# constructor.
code('template <class CxxClass>')
code('class DummyShunt<CxxClass, std::enable_if_t<')
code(' std::is_constructible<CxxClass,')
code(' const ${cls}Params &>::value>>')
code('{')
code(' public:')
code(' using Params = ${cls}Params;')
code(' static ${{cls.cxx_class}} *')
code(' create(const Params &p)')
code(' {')
code(' return new CxxClass(p);')
code(' }')
code('};')
code()
# This version diverts to the DummyParamsClass and a dummy
# implementation of create if the appropriate constructor does
# not exist.
code('template <class CxxClass>')
code('class DummyShunt<CxxClass, std::enable_if_t<')
code(' !std::is_constructible<CxxClass,')
code(' const ${cls}Params &>::value>>')
code('{')
code(' public:')
code(' using Params = Dummy${cls}ParamsClass;')
code(' static ${{cls.cxx_class}} *')
code(' create(const Params &p)')
code(' {')
code(' return nullptr;')
code(' }')
code('};')
code()
code('} // namespace anonymous_params')
code()
code('using namespace anonymous_params;')
code()
# A weak implementation of either the real Params struct's
# create method, or the Dummy one if we don't want to have
# any default implementation. Either an implementation is
# mandantory since this was shunted off to the dummy class, or
# one is optional which will override this weak version.
code('M5_WEAK ${{cls.cxx_class}} *')
code('DummyShunt<${{cls.cxx_class}}>::Params::create() const')
code('{')
code(' return DummyShunt<${{cls.cxx_class}}>::')
code(' create(*this);')
code('}')
_warned_about_nested_templates = False

View File

@@ -230,7 +230,7 @@ def _prepare_stats(group: _m5.stats.Group):
for stat in group.getStats():
stat.prepare()
for child in getStatGroups().values():
for child in group.getStatGroups().values():
_prepare_stats(child)

View File

@@ -166,7 +166,7 @@ Root::timeSyncSpinThreshold(Time newThreshold)
timeSyncEnable(en);
}
Root::Root(const RootParams &p)
Root::Root(const RootParams &p, int)
: SimObject(p), _enabled(false), _periodTick(p.time_sync_period),
syncEvent([this]{ timeSync(); }, name())
{
@@ -216,5 +216,5 @@ RootParams::create() const
FullSystem = full_system;
FullSystemInt = full_system ? 1 : 0;
return new Root(*this);
return new Root(*this, 0);
}

View File

@@ -134,7 +134,9 @@ class Root : public SimObject
PARAMS(Root);
Root(const Params &p);
// The int parameter is ignored, it's just so we can define a custom
// create() method.
Root(const Params &p, int);
/** Schedule the timesync event at startup().
*/

View File

@@ -54,7 +54,7 @@ bool Kernel::endOfSimulationComplete() { return endComplete; }
sc_core::sc_status Kernel::status() { return _status; }
void Kernel::status(sc_core::sc_status s) { _status = s; }
Kernel::Kernel(const Params &params) :
Kernel::Kernel(const Params &params, int) :
SimObject(params), t0Event(this, false, EventBase::Default_Pri - 1)
{
// Install ourselves as the scheduler's event manager.
@@ -187,6 +187,6 @@ SystemC_KernelParams::create() const
{
panic_if(sc_gem5::kernel,
"Only one systemc kernel object may be defined.\n");
sc_gem5::kernel = new sc_gem5::Kernel(*this);
sc_gem5::kernel = new sc_gem5::Kernel(*this, 0);
return sc_gem5::kernel;
}

View File

@@ -46,7 +46,7 @@ class Kernel : public SimObject
{
public:
typedef SystemC_KernelParams Params;
Kernel(const Params &params);
Kernel(const Params &params, int);
void init() override;
void regStats() override;

View File

@@ -55,7 +55,7 @@ env.Append(CPPPATH=[gem5_root + '/build/' + gem5_arch,
'#examples/common',
])
env.Append(CXXFLAGS=['-std=c++11',
env.Append(CXXFLAGS=['-std=c++14',
'-DSC_INCLUDE_DYNAMIC_PROCESSES',
'-DTRACING_ON',
])
@@ -78,6 +78,7 @@ sys.path.append(gem5_root + '/src/python')
AddOption('--no-colors', dest='use_colors', action='store_false',
help="Don't add color to abbreviated scons output")
main.SConsignFile('build/systemc/sconsign')
SConscript(gem5_root + '/ext/systemc/SConscript',
variant_dir='build/systemc',
exports='main')

View File

@@ -93,8 +93,7 @@ SCMasterPort::SCMasterPort(const std::string& name_,
transactor(nullptr),
simControl(simControl)
{
system =
dynamic_cast<const ExternalMasterParams*>(owner_.params())->system;
system = dynamic_cast<const ExternalMasterParams&>(owner_.params()).system;
}
void

View File

@@ -301,7 +301,7 @@ SCSlavePort::pec(
packet->makeResponse();
}
if (packet->isResponse()) {
need_retry = !bridgeResponsePort.sendTimingResp(packet);
need_retry = !sendTimingResp(packet);
}
if (need_retry) {