configs,mem-ruby: Remove old GPU ptls
These protocols are no longer supported, either because they are not representative of GPU protocols, or because they have not been updated to work with GCN3. Change-Id: I989eeb6826c69225766aaab209302fe638b22719 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/34197 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Matthew Poremba
parent
173c1c6eb0
commit
80221d7e1d
@@ -236,23 +236,15 @@ shader = Shader(n_wf = options.wfs_per_simd,
|
||||
voltage_domain = VoltageDomain(
|
||||
voltage = options.gpu_voltage)))
|
||||
|
||||
# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
|
||||
# Other GPU protocols implement release consistency at GPU side.
|
||||
# So, all GPU protocols other than GPU_RfO should make their writes
|
||||
# visible to the global memory and should read from global memory
|
||||
# during kernal boundary. The pipeline initiates(or do not initiate)
|
||||
# the acquire/release operation depending on these impl_kern_launch_rel
|
||||
# and impl_kern_end_rel flags. The flag=true means pipeline initiates
|
||||
# a acquire/release operation at kernel launch/end.
|
||||
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
|
||||
# are write-through based, and thus only imple_kern_launch_acq needs to
|
||||
# set.
|
||||
if buildEnv['PROTOCOL'] == 'GPU_RfO':
|
||||
shader.impl_kern_launch_acq = False
|
||||
shader.impl_kern_end_rel = False
|
||||
elif (buildEnv['PROTOCOL'] != 'GPU_VIPER' or
|
||||
buildEnv['PROTOCOL'] != 'GPU_VIPER_Region' or
|
||||
buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline'):
|
||||
# VIPER GPU protocol implements release consistency at GPU side. So,
|
||||
# we make their writes visible to the global memory and should read
|
||||
# from global memory during kernal boundary. The pipeline initiates
|
||||
# (or do not initiate) the acquire/release operation depending on
|
||||
# these impl_kern_launch_rel and impl_kern_end_rel flags. The flag=true
|
||||
# means pipeline initiates a acquire/release operation at kernel launch/end.
|
||||
# VIPER protocol is write-through based, and thus only impl_kern_launch_acq
|
||||
# needs to set.
|
||||
if (buildEnv['PROTOCOL'] == 'GPU_VIPER'):
|
||||
shader.impl_kern_launch_acq = True
|
||||
shader.impl_kern_end_rel = False
|
||||
else:
|
||||
|
||||
@@ -1,772 +0,0 @@
|
||||
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from this
|
||||
# software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import six
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from m5.util import addToPath
|
||||
from .Ruby import create_topology
|
||||
from .Ruby import send_evicts
|
||||
|
||||
addToPath('../')
|
||||
|
||||
from topologies.Cluster import Cluster
|
||||
from topologies.Crossbar import Crossbar
|
||||
|
||||
if six.PY3:
|
||||
long = int
|
||||
|
||||
class CntrlBase:
    """Mixin handing out monotonically increasing IDs to controllers.

    Sequencer and controller IDs are global across every controller
    type, while version numbers are tracked per concrete subclass.
    """

    _seqs = 0
    _cntrls = 0
    _version = 0

    @classmethod
    def seqCount(cls):
        # Sequencer IDs must be unique system-wide, so always read and
        # write the counter on CntrlBase itself, never on a subclass.
        current = CntrlBase._seqs
        CntrlBase._seqs = current + 1
        return current

    @classmethod
    def cntrlCount(cls):
        # Same global-counter trick as seqCount.
        current = CntrlBase._cntrls
        CntrlBase._cntrls = current + 1
        return current

    @classmethod
    def versionCount(cls):
        # Versions restart per concrete controller type: reading through
        # cls and assigning back onto cls gives each subclass its own
        # counter after the first call.
        current = cls._version
        cls._version = current + 1
        return current
|
||||
|
||||
class TccDirCache(RubyCache):
    """Directory cache tracking blocks resident in the TCPs and TCC."""
    size = "512kB"
    assoc = 16
    resourceStalls = False

    def create(self, options):
        # Base capacity covers one TCC bank's share of the aggregate
        # TCC, plus room to track every TCP scaled by tcc_dir_factor.
        self.size = MemorySize(options.tcc_size)
        tcp_bytes = MemorySize(options.tcp_size).value
        self.size.value += (options.num_compute_units * tcp_bytes *
                            options.tcc_dir_factor) / long(options.num_tccs)
        # Index above both the block offset and the TCC-select bits.
        self.start_index_bit = (math.log(options.cacheline_size, 2) +
                                math.log(options.num_tccs, 2))
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L1DCache(RubyCache):
    """CPU L1 data cache, sized from the command-line options."""
    resourceStalls = False

    def create(self, options):
        self.assoc = options.l1d_assoc
        self.size = MemorySize(options.l1d_size)
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L1ICache(RubyCache):
    """CPU L1 instruction cache, sized from the command-line options."""
    resourceStalls = False

    def create(self, options):
        self.assoc = options.l1i_assoc
        self.size = MemorySize(options.l1i_size)
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L2Cache(RubyCache):
    """Unified L2 shared by a core pair, sized from the options."""
    resourceStalls = False

    def create(self, options):
        self.assoc = options.l2_assoc
        self.size = MemorySize(options.l2_size)
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
    """Core-pair CPU controller: one L1I and L2, plus a private L1D and
    sequencer for each of the two cores."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1Icache = L1ICache()
        self.L1Icache.create(options)
        self.L1D0cache = L1DCache()
        self.L1D0cache.create(options)
        self.L1D1cache = L1DCache()
        self.L1D1cache.create(options)
        self.L2cache = L2Cache()
        self.L2cache.create(options)

        # One sequencer per core; iterate in core order so sequencer
        # version numbers are handed out core 0 first.
        for core, dcache in ((0, self.L1D0cache), (1, self.L1D1cache)):
            seq = RubySequencer()
            seq.version = self.seqCount()
            seq.dcache = dcache
            seq.ruby_system = ruby_system
            seq.coreid = core
            seq.is_cpu_sequencer = True
            if core == 0:
                self.sequencer = seq
            else:
                self.sequencer1 = seq

        # Defines icache/dcache hit latency.
        self.mandatory_queue_latency = 2

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
    """TCP (per-CU GPU L1 vector data cache)."""
    assoc = 8
    dataArrayBanks = 16
    tagArrayBanks = 4
    dataAccessLatency = 4
    tagAccessLatency = 1

    def create(self, options):
        self.replacement_policy = TreePLRURP()
        self.size = MemorySize(options.tcp_size)
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
    """TCP controller for a compute unit (or a command processor)."""

    def create(self, options, ruby_system, system):
        """Configure a compute-unit TCP: GPU requests use the coalescer."""
        self.version = self.versionCount()

        cache = TCPCache(tagAccessLatency = options.TCP_latency)
        cache.resourceStalls = options.no_resource_stalls
        cache.create(options)
        self.L1cache = cache

        gpu_port = RubyGPUCoalescer()
        self.coalescer = gpu_port
        gpu_port.version = self.seqCount()
        gpu_port.icache = cache
        gpu_port.dcache = cache
        gpu_port.ruby_system = ruby_system
        gpu_port.support_inst_reqs = False
        gpu_port.is_cpu_sequencer = False
        # At most one outstanding request per work-item in the CU.
        gpu_port.max_outstanding_requests = (options.simds_per_cu *
                                             options.wfs_per_simd *
                                             options.wf_size)
        if options.tcp_deadlock_threshold:
            gpu_port.deadlock_threshold = options.tcp_deadlock_threshold
        gpu_port.max_coalesces_per_cycle = options.max_coalesces_per_cycle

        cpu_port = RubySequencer()
        self.sequencer = cpu_port
        cpu_port.version = self.seqCount()
        cpu_port.dcache = cache
        cpu_port.ruby_system = ruby_system
        cpu_port.is_cpu_sequencer = True

        # Compute units issue through the coalescer, not the sequencer.
        self.use_seq_not_coal = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def createCP(self, options, ruby_system, system):
        """Configure a command-processor TCP: requests bypass coalescing
        and go through the plain sequencer instead."""
        self.version = self.versionCount()

        cache = TCPCache(tagAccessLatency = options.TCP_latency)
        cache.resourceStalls = options.no_resource_stalls
        cache.create(options)
        self.L1cache = cache

        gpu_port = RubyGPUCoalescer()
        self.coalescer = gpu_port
        gpu_port.version = self.seqCount()
        gpu_port.icache = cache
        gpu_port.dcache = cache
        gpu_port.ruby_system = ruby_system
        gpu_port.support_inst_reqs = False
        gpu_port.is_cpu_sequencer = False

        cpu_port = RubySequencer()
        self.sequencer = cpu_port
        cpu_port.version = self.seqCount()
        cpu_port.dcache = cache
        cpu_port.ruby_system = ruby_system
        cpu_port.is_cpu_sequencer = True

        # The command processor behaves like a CPU-style requester.
        self.use_seq_not_coal = True

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
    """SQC (GPU instruction cache); fixed 32kB capacity."""
    size = "32kB"
    assoc = 8
    dataArrayBanks = 16
    tagArrayBanks = 4
    dataAccessLatency = 4
    tagAccessLatency = 1

    def create(self, options):
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
    """SQC (GPU instruction fetch) controller."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = options.no_resource_stalls

        seq = RubySequencer()
        self.sequencer = seq
        seq.version = self.seqCount()
        seq.dcache = self.L1cache
        seq.ruby_system = ruby_system
        # Instruction fetches only: data requests are not supported.
        seq.support_data_reqs = False
        seq.is_cpu_sequencer = False

        if options.sqc_deadlock_threshold:
            seq.deadlock_threshold = options.sqc_deadlock_threshold

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def createCP(self, options, ruby_system, system):
        """Variant used for the command processor's SQC."""
        self.version = self.versionCount()

        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = options.no_resource_stalls

        seq = RubySequencer()
        self.sequencer = seq
        seq.version = self.seqCount()
        seq.dcache = self.L1cache
        seq.ruby_system = ruby_system
        seq.support_data_reqs = False

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
|
||||
class TCC(RubyCache):
    """One bank of the TCC (shared GPU L2)."""
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = True

    def create(self, options):
        # tcc_size is the aggregate capacity; each of the num_tccs banks
        # models an equal share of it.
        self.size = MemorySize(options.tcc_size)
        self.size = self.size / options.num_tccs
        # BUGFIX: use floor division for the bank counts. Under Python 3
        # (this file supports py3 via six) '/' is true division and would
        # hand RubyCache float bank counts.
        self.dataArrayBanks = 256 // options.num_tccs  # number of data banks
        self.tagArrayBanks = 256 // options.num_tccs   # number of tag banks
        # Enforce a floor of 128 sets so tiny configs stay legal.
        if ((self.size.value / long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        # Index above both the block offset and the TCC-select bits.
        self.start_index_bit = math.log(options.cacheline_size, 2) + \
                               math.log(options.num_tccs, 2)
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class TCCCntrl(TCC_Controller, CntrlBase):
    """Controller for one TCC (GPU L2) bank."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.l2_response_latency = options.TCC_latency

        self.number_of_TBEs = 2048

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
                           tcc_unblock_to_tccdir, req_to_tcc,
                           probe_to_tcc, resp_to_tcc):
        # Attach the point-to-point wire buffers shared with the paired
        # TCC directory controller.
        self.w_reqToTCCDir = req_to_tccdir
        self.w_respToTCCDir = resp_to_tccdir
        self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
        self.w_reqToTCC = req_to_tcc
        self.w_probeToTCC = probe_to_tcc
        self.w_respToTCC = resp_to_tcc
|
||||
|
||||
class TCCDirCntrl(TCCdir_Controller, CntrlBase):
    """Directory controller paired with a TCC bank."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()

        self.directory = TccDirCache()
        self.directory.create(options)

        self.number_of_TBEs = 1024

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_tccdir, resp_to_tccdir,
                           tcc_unblock_to_tccdir, req_to_tcc,
                           probe_to_tcc, resp_to_tcc):
        # Attach the point-to-point wire buffers shared with the paired
        # TCC controller (same six buffers, same order).
        self.w_reqToTCCDir = req_to_tccdir
        self.w_respToTCCDir = resp_to_tccdir
        self.w_TCCUnblockToTCCDir = tcc_unblock_to_tccdir
        self.w_reqToTCC = req_to_tcc
        self.w_probeToTCC = probe_to_tcc
        self.w_respToTCC = resp_to_tcc
|
||||
|
||||
class L3Cache(RubyCache):
    """One slice of the shared L3; each directory owns one slice."""
    assoc = 8
    dataArrayBanks = 256
    tagArrayBanks = 256

    def create(self, options, ruby_system, system):
        # l3_size is the aggregate capacity split across directories.
        self.size = MemorySize(options.l3_size)
        self.size.value //= options.num_dirs
        # BUGFIX: the original repeated both bank divisions, leaving each
        # slice with 256 / num_dirs**2 banks. Divide once, and use floor
        # division so the counts remain integers under Python 3.
        self.dataArrayBanks //= options.num_dirs
        self.tagArrayBanks //= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = options.no_resource_stalls
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L3Cntrl(L3Cache_Controller, CntrlBase):
    """Controller for one L3 slice."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)

        # Respond no faster than the slower of the two arrays.
        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
                                       self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        # Wire buffers shared with the co-located directory controller.
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
|
||||
|
||||
class DirCntrl(Directory_Controller, CntrlBase):
    """Directory controller with a co-located L3 slice."""

    def create(self, options, dir_ranges, ruby_system, system):
        self.version = self.versionCount()

        self.response_latency = 30

        self.addr_ranges = dir_ranges
        self.directory = RubyDirectoryMemory()

        slice = L3Cache()
        slice.create(options, ruby_system, system)
        self.L3CacheMemory = slice

        # Hit latency is bounded by the slower of the two arrays.
        self.l3_hit_latency = max(slice.dataAccessLatency,
                                  slice.tagAccessLatency)

        self.number_of_TBEs = options.num_tbes

        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        # Wire buffers shared with the co-located L3 controller.
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
|
||||
|
||||
|
||||
|
||||
def define_options(parser):
    """Register the GPU_RfO protocol's command-line options on parser."""
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20)
    parser.add_option("--l3-tag-latency", type="int", default=15)
    parser.add_option("--cpu-to-dir-latency", type="int", default=15)
    parser.add_option("--gpu-to-dir-latency", type="int", default=160)
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--num-tbes", type="int", default=256)
    # load-to-use latency of the L2
    parser.add_option("--l2-latency", type="int", default=50)
    parser.add_option("--num-tccs", type="int", default=1,
                      help="number of TCC directories and banks in the GPU")
    parser.add_option("--TCP_latency", type="int", default=4,
                      help="TCP latency")
    parser.add_option("--tcp-deadlock-threshold", type="int",
                      help="Set the TCP deadlock threshold to some value")
    parser.add_option("--TCC_latency", type="int", default=16,
                      help="TCC latency")
    parser.add_option("--tcc-size", type="string", default="256kB",
                      help="agregate tcc size")
    parser.add_option("--tcp-size", type="string", default="16kB",
                      help="tcp size")
    parser.add_option("--tcc-dir-factor", type="int", default=4,
                      help="TCCdir size = factor *(TCPs + TCC)")
    parser.add_option("--sqc-deadlock-threshold", type="int",
                      help="Set the SQC deadlock threshold to some value")
    parser.add_option("--max-coalesces-per-cycle", type="int", default=1,
                      help="Maximum insts that may coalesce in a cycle")
|
||||
|
||||
def create_system(options, full_system, system, dma_devices, bootmem,
                  ruby_system):
    """Build the GPU_RfO Ruby memory system.

    Creates directory, core-pair, TCP, SQC, TCC and TCC-directory
    controllers, wires them to the Ruby network, and returns the tuple
    (cpu_sequencers, dir_cntrl_nodes, topology) expected by Ruby.py.
    """
    if buildEnv['PROTOCOL'] != 'GPU_RfO':
        panic("This script requires the GPU_RfO protocol to be built.")

    cpu_sequencers = []

    # The ruby network creation expects the list of nodes in the system
    # to be consistent with the NetDest list.  Therefore the l1
    # controller nodes must be listed before the directory nodes and
    # directory nodes before dma nodes, etc.
    cp_cntrl_nodes = []
    tcp_cntrl_nodes = []
    sqc_cntrl_nodes = []
    tcc_cntrl_nodes = []
    tccdir_cntrl_nodes = []
    dir_cntrl_nodes = []
    l3_cntrl_nodes = []

    # Must create the individual controllers before the network to
    # ensure the controller constructors are called before the network
    # constructor.

    TCC_bits = int(math.log(options.num_tccs, 2))

    # This is the base crossbar that connects the L3s, Dirs, and
    # cpu/gpu Clusters
    mainCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s

    # BUGFIX: dir_bits is needed for the AddrRange interleaving below
    # regardless of --numa-high-bit; the original only computed it on
    # the else path, raising NameError when the option was set.
    dir_bits = int(math.log(options.num_dirs, 2))
    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # if the numa_bit is not specified, set the directory bits as
        # the lowest bits above the block offset bits, and the numa_bit
        # as the highest of those directory bits
        block_size_bits = int(math.log(options.cacheline_size, 2))
        numa_bit = block_size_bits + dir_bits - 1

    for i in range(options.num_dirs):
        dir_ranges = []
        for r in system.mem_ranges:
            addr_range = m5.objects.AddrRange(r.start, size = r.size(),
                                              intlvHighBit = numa_bit,
                                              intlvBits = dir_bits,
                                              intlvMatch = i)
            dir_ranges.append(addr_range)

        dir_cntrl = DirCntrl(TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, dir_ranges, ruby_system, system)
        # Enough TBEs for all TCP TBEs
        dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.requestToMemory = MessageBuffer()
        dir_cntrl.responseFromMemory = MessageBuffer()

        exec("system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)

        mainCluster.add(dir_cntrl)

    # For an odd number of CPUs, still create the right number of
    # controllers
    cpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s
    for i in range((options.num_cpus + 1) // 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        exec("system.cp_cntrl%d = cp_cntrl" % i)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    gpuCluster = Cluster(extBW = 512, intBW = 512) # 1 TB/s

    for i in range(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = 2560) # max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)

        exec("system.tcp_cntrl%d = tcp_cntrl" % i)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(tcp_cntrl.coalescer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the TCP controller to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in range(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % i)

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseFromSQC.master = ruby_system.network.slave

        sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        sqc_cntrl.unblockFromCore.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in range(options.num_cp):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = 2560) # max outstanding requests
        tcp_cntrl.createCP(options, ruby_system, system)

        exec("system.tcp_cntrl%d = tcp_cntrl" %
             (options.num_compute_units + i))

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(tcp_cntrl.sequencer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the TCP controller to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.createCP(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" %
             (options.num_compute_units + i))

        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.responseFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseFromSQC.master = ruby_system.network.slave

        sqc_cntrl.unblockFromCore = MessageBuffer(ordered = True)
        sqc_cntrl.unblockFromCore.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    for i in range(options.num_tccs):

        tcc_cntrl = TCCCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = options.num_compute_units * 2560)
        # Enough TBEs for all TCP TBEs
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl_nodes.append(tcc_cntrl)

        tccdir_cntrl = TCCDirCntrl(TCC_select_num_bits = TCC_bits,
                             number_of_TBEs = options.num_compute_units * 2560)
        # Enough TBEs for all TCP TBEs
        tccdir_cntrl.create(options, ruby_system, system)
        tccdir_cntrl_nodes.append(tccdir_cntrl)

        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
        exec("system.tccdir_cntrl%d = tccdir_cntrl" % i)

        # connect all of the wire buffers between L3 and dirs up
        req_to_tccdir = RubyWireBuffer()
        resp_to_tccdir = RubyWireBuffer()
        tcc_unblock_to_tccdir = RubyWireBuffer()
        req_to_tcc = RubyWireBuffer()
        probe_to_tcc = RubyWireBuffer()
        resp_to_tcc = RubyWireBuffer()

        tcc_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
                                     tcc_unblock_to_tccdir, req_to_tcc,
                                     probe_to_tcc, resp_to_tcc)
        tccdir_cntrl.connectWireBuffers(req_to_tccdir, resp_to_tccdir,
                                        tcc_unblock_to_tccdir, req_to_tcc,
                                        probe_to_tcc, resp_to_tcc)

        # Connect the TCC controller to the ruby network
        tcc_cntrl.responseFromTCC = MessageBuffer(ordered = True)
        tcc_cntrl.responseFromTCC.master = ruby_system.network.slave

        tcc_cntrl.responseToTCC = MessageBuffer(ordered = True)
        tcc_cntrl.responseToTCC.slave = ruby_system.network.master

        # Connect the TCC Dir controller to the ruby network
        tccdir_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tccdir_cntrl.requestFromTCP.slave = ruby_system.network.master

        tccdir_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tccdir_cntrl.responseFromTCP.slave = ruby_system.network.master

        tccdir_cntrl.unblockFromTCP = MessageBuffer(ordered = True)
        tccdir_cntrl.unblockFromTCP.slave = ruby_system.network.master

        tccdir_cntrl.probeToCore = MessageBuffer(ordered = True)
        tccdir_cntrl.probeToCore.master = ruby_system.network.slave

        tccdir_cntrl.responseToCore = MessageBuffer(ordered = True)
        tccdir_cntrl.responseToCore.master = ruby_system.network.slave

        tccdir_cntrl.probeFromNB = MessageBuffer()
        tccdir_cntrl.probeFromNB.slave = ruby_system.network.master

        tccdir_cntrl.responseFromNB = MessageBuffer()
        tccdir_cntrl.responseFromNB.slave = ruby_system.network.master

        tccdir_cntrl.requestToNB = MessageBuffer()
        tccdir_cntrl.requestToNB.master = ruby_system.network.slave

        tccdir_cntrl.responseToNB = MessageBuffer()
        tccdir_cntrl.responseToNB.master = ruby_system.network.slave

        tccdir_cntrl.unblockToNB = MessageBuffer()
        tccdir_cntrl.unblockToNB.master = ruby_system.network.slave

        tccdir_cntrl.triggerQueue = MessageBuffer(ordered = True)

        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)
        gpuCluster.add(tccdir_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
|
||||
@@ -1,614 +0,0 @@
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from this
|
||||
# software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import six
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from m5.util import addToPath
|
||||
from .Ruby import create_topology
|
||||
from .Ruby import send_evicts
|
||||
|
||||
addToPath('../')
|
||||
|
||||
from topologies.Cluster import Cluster
|
||||
from topologies.Crossbar import Crossbar
|
||||
|
||||
if six.PY3:
|
||||
long = int
|
||||
|
||||
class CntrlBase:
    """Mixin handing out monotonically increasing IDs for controllers,
    sequencers, and per-class versions.

    seqCount/cntrlCount deliberately update the attribute on CntrlBase
    itself so the count is global across all subclasses; versionCount
    updates the attribute on the calling class so each controller type
    numbers its instances independently.
    """
    _seqs = 0
    _cntrls = 0
    _version = 0

    @classmethod
    def seqCount(cls):
        # Global counter: always bump CntrlBase, never the subclass.
        nxt = CntrlBase._seqs
        CntrlBase._seqs = nxt + 1
        return nxt

    @classmethod
    def cntrlCount(cls):
        # Global counter: always bump CntrlBase, never the subclass.
        nxt = CntrlBase._cntrls
        CntrlBase._cntrls = nxt + 1
        return nxt

    @classmethod
    def versionCount(cls):
        # Per-class counter: the write shadows the inherited base value.
        nxt = cls._version
        cls._version = nxt + 1
        return nxt
|
||||
|
||||
class L1Cache(RubyCache):
    """CPU-side L1 cache (used for both the I and D caches)."""
    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, size, assoc, options):
        # Size/assoc come from the command line; options is unused here
        # but kept for a uniform create() signature across cache classes.
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L2Cache(RubyCache):
    """CPU-side L2 cache shared by a core pair."""
    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, size, assoc, options):
        # Size/assoc come from the command line; options is unused here
        # but kept for a uniform create() signature across cache classes.
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
    """CorePair (CPU-side) controller: two cores sharing an L2, with one
    sequencer per core."""

    def create(self, options, ruby_system, system):
        """Instantiate the L1/L2 caches and the two per-core sequencers."""
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        # Core 0's sequencer fronts L1D0.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        # Core 1's sequencer fronts L1D1.
        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        self.issue_latency = options.cpu_to_dir_latency
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        # recycle_latency of 0/None means "use the controller default".
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
    """GPU per-CU L1 (TCP) cache."""
    size = "16kB"
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    dataAccessLatency = 4
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        # The bank/latency values below restate the class defaults.
        self.dataArrayBanks = 16
        self.tagArrayBanks = 16
        self.dataAccessLatency = 4
        self.tagAccessLatency = 1
        # NOTE(review): gated by the TCC (not TCP) stall flag — confirm
        # this reuse of no_tcc_resource_stalls is intentional.
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
    """TCP (GPU L1) controller fronted by a VIPER coalescer."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = TCPCache()
        self.L1cache.create(options)
        self.issue_latency = 1

        # GPU memory requests enter through the coalescer; it handles data
        # only (support_inst_reqs = False — the SQC serves instructions).
        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False
        if options.tcp_deadlock_threshold:
            self.coalescer.deadlock_threshold = \
                options.tcp_deadlock_threshold
        self.coalescer.max_coalesces_per_cycle = \
            options.max_coalesces_per_cycle

        # A plain sequencer is also created; with use_seq_not_coal = False
        # the coalescer remains the active request path.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
    """GPU instruction (SQC) cache."""
    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
    """SQC (GPU instruction-fetch) controller."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = False
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        # Instruction-only port: data requests are disabled and this does
        # not count as a CPU sequencer.
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False
        if options.sqc_deadlock_threshold:
            self.sequencer.deadlock_threshold = \
                options.sqc_deadlock_threshold

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCC(RubyCache):
    """GPU shared L2 (TCC) cache — one instance per TCC bank."""
    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = True

    def create(self, options):
        """Size one TCC bank: the aggregate capacity (options.tcc_size, or
        128kB per CU in bw_scalor mode) is split evenly across
        options.num_tccs banks."""
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            # Scale aggregate capacity with the number of compute units.
            tcc_size = str(options.num_compute_units * 128) + 'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            # Integer division: '/' yields a float under Python 3, but the
            # bank-count params are integral.
            self.dataArrayBanks = 256 // options.num_tccs #number of data banks
            self.tagArrayBanks = 256 // options.num_tccs #number of tag banks
        # Divide the aggregate capacity across the banks (integer division
        # for the same Python 3 reason as above).
        self.size.value = self.size.value // options.num_tccs
        # Keep at least 128B per way so the set count stays >= 1.
        if ((self.size.value // long(self.assoc)) < 128):
            self.size.value = long(128 * self.assoc)
        # int(): math.log returns a float, but the index bit is integral.
        self.start_index_bit = int(math.log(options.cacheline_size, 2) +
                                   math.log(options.num_tccs, 2))
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class TCCCntrl(TCC_Controller, CntrlBase):
    """TCC (GPU shared L2) controller."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.ruby_system = ruby_system
        # Overrides the resourceStalls value TCC.create() left in place.
        self.L2cache.resourceStalls = options.no_tcc_resource_stalls

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class L3Cache(RubyCache):
    """Directory-side L3 cache slice (one per directory)."""
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        """Size this slice: the configured L3 size and bank counts are
        aggregate values shared evenly across options.num_dirs slices."""
        self.size = MemorySize(options.l3_size)
        # Integer division throughout: '/' yields floats under Python 3,
        # but size.value and the bank counts are integral.
        self.size.value //= options.num_dirs
        self.assoc = options.l3_assoc
        # Fixed: the original divided both bank counts by num_dirs TWICE
        # (duplicated statements), shrinking them quadratically; dividing
        # once matches the size scaling above.
        self.dataArrayBanks //= options.num_dirs
        self.tagArrayBanks //= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class ProbeFilter(RubyCache):
    """Directory probe filter tracking cached blocks at region
    granularity (blocks_per_region cache lines per entry)."""
    size = "4MB"
    assoc = 16
    dataArrayBanks = 256
    tagArrayBanks = 256

    def create(self, options, ruby_system, system):
        # One filter entry covers a whole region of 64B lines.
        self.block_size = "%dB" % (64 * options.blocks_per_region)
        # NOTE(review): this multiplies integers by the string-valued
        # block_size assigned above; it relies on SimObject param
        # conversion of block_size — verify the resulting size is sane.
        self.size = options.region_dir_entries * \
            self.block_size * options.num_compute_units
        self.assoc = 8
        self.tagArrayBanks = 8
        self.tagAccessLatency = options.dir_tag_latency
        self.dataAccessLatency = 1
        self.resourceStalls = options.no_resource_stalls
        # 6 block-offset bits (64B line) plus the region bits.
        self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L3Cntrl(L3Cache_Controller, CntrlBase):
    """L3 cache controller co-located with a directory."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)
        # Response latency is bounded by the slower of the two arrays.
        self.l3_response_latency = \
            max(self.L3cache.dataAccessLatency, self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        """Attach the point-to-point wire buffers between this L3 and its
        directory (both ends must be handed the same buffer objects)."""
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
|
||||
|
||||
class DirCntrl(Directory_Controller, CntrlBase):
    """Directory controller with an attached L3 slice and probe filter."""

    def create(self, options, dir_ranges, ruby_system, system):
        self.version = self.versionCount()
        self.response_latency = 30
        self.addr_ranges = dir_ranges
        self.directory = RubyDirectoryMemory()
        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)
        self.ProbeFilterMemory = ProbeFilter()
        self.ProbeFilterMemory.create(options, ruby_system, system)
        # L3 hit latency is bounded by the slower of the two arrays.
        self.l3_hit_latency = \
            max(self.L3CacheMemory.dataAccessLatency,
                self.L3CacheMemory.tagAccessLatency)

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        """Attach the point-to-point wire buffers between this directory
        and its L3 (both ends must be handed the same buffer objects)."""
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
|
||||
|
||||
def define_options(parser):
    """Register this protocol's command-line options on *parser*.

    Fixed: --WB_L1's help text said "writeback L2" (copy-paste), the
    "agregate" typo, and a stray trailing semicolon.
    """
    parser.add_option("--num-subcaches", type = "int", default = 4)
    parser.add_option("--l3-data-latency", type = "int", default = 20)
    parser.add_option("--l3-tag-latency", type = "int", default = 15)
    parser.add_option("--cpu-to-dir-latency", type = "int", default = 120)
    parser.add_option("--gpu-to-dir-latency", type = "int", default = 120)
    # store_false flags: passing the option DISABLES the named behavior.
    parser.add_option("--no-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--no-tcc-resource-stalls", action = "store_false",
                      default = True)
    parser.add_option("--num-tbes", type = "int", default = 2560)
    parser.add_option("--l2-latency", type = "int", default = 50) # load to use
    parser.add_option("--num-tccs", type = "int", default = 1,
                      help = "number of TCC banks in the GPU")
    parser.add_option("--sqc-size", type = 'string', default = '32kB',
                      help = "SQC cache size")
    parser.add_option("--sqc-assoc", type = 'int', default = 8,
                      help = "SQC cache assoc")
    parser.add_option("--sqc-deadlock-threshold", type = 'int',
                      help = "Set the SQC deadlock threshold to some value")

    parser.add_option("--region-dir-entries", type = "int", default = 8192)
    parser.add_option("--dir-tag-latency", type = "int", default = 8)
    parser.add_option("--dir-tag-banks", type = "int", default = 4)
    parser.add_option("--blocks-per-region", type = "int", default = 1)
    parser.add_option("--use-L3-on-WT", action = "store_true", default = False)
    parser.add_option("--nonInclusiveDir", action = "store_true",
                      default = False)
    parser.add_option("--WB_L1", action = "store_true",
                      default = False, help = "writeback L1")
    parser.add_option("--WB_L2", action = "store_true",
                      default = False, help = "writeback L2")
    parser.add_option("--TCP_latency", type = "int",
                      default = 4, help = "TCP latency")
    parser.add_option("--TCC_latency", type = "int",
                      default = 16, help = "TCC latency")
    parser.add_option("--tcc-size", type = 'string', default = '2MB',
                      help = "aggregate tcc size")
    parser.add_option("--tcc-assoc", type = 'int', default = 16,
                      help = "tcc assoc")
    parser.add_option("--tcp-size", type = 'string', default = '16kB',
                      help = "tcp size")
    parser.add_option("--tcp-deadlock-threshold", type = 'int',
                      help = "Set the TCP deadlock threshold to some value")
    parser.add_option("--max-coalesces-per-cycle", type = "int", default = 1,
                      help = "Maximum insts that may coalesce in a cycle")

    parser.add_option("--sampler-sets", type = "int", default = 1024)
    parser.add_option("--sampler-assoc", type = "int", default = 16)
    parser.add_option("--sampler-counter", type = "int", default = 512)
    parser.add_option("--noL1", action = "store_true", default = False,
                      help = "bypassL1")
    parser.add_option("--noL2", action = "store_true", default = False,
                      help = "bypassL2")
|
||||
|
||||
def create_system(options, full_system, system, dma_devices, bootmem,
                  ruby_system):
    """Build the GPU_VIPER_Baseline Ruby system: directories (with L3 +
    probe filter), CorePair CPU controllers, and the GPU-side TCP/SQC/TCC
    controllers, all wired into the Ruby network.

    Returns (cpu_sequencers, dir_cntrl_nodes, mainCluster).
    """
    if buildEnv['PROTOCOL'] != 'GPU_VIPER_Baseline':
        # Fixed: the two literals previously concatenated without a space
        # ("...requires theGPU_VIPER_Baseline...").
        panic("This script requires the "
              "GPU_VIPER_Baseline protocol to be built.")

    cpu_sequencers = []

    # The ruby network creation expects the list of nodes in the system to
    # be consistent with the NetDest list.  Therefore the l1 controller
    # nodes must be listed before the directory nodes and directory nodes
    # before dma nodes, etc.
    cp_cntrl_nodes = []
    tcp_cntrl_nodes = []
    sqc_cntrl_nodes = []
    tcc_cntrl_nodes = []
    dir_cntrl_nodes = []
    l3_cntrl_nodes = []

    # Must create the individual controllers before the network to ensure
    # the controller constructors are called before the network constructor.

    # For an odd number of CPUs, still create the right number of
    # controllers
    TCC_bits = int(math.log(options.num_tccs, 2))

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    crossbar_bw = 16 * options.num_compute_units # Assuming a 2GHz clock
    mainCluster = Cluster(intBW = crossbar_bw)

    # Fixed: dir_bits is needed by the AddrRange interleaving below in
    # BOTH cases; previously it was only computed in the else branch,
    # raising NameError whenever --numa-high-bit was set.
    dir_bits = int(math.log(options.num_dirs, 2))
    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # if the numa_bit is not specified, set the directory bits as the
        # lowest bits above the block offset bits, and the numa_bit as the
        # highest of those directory bits
        block_size_bits = int(math.log(options.cacheline_size, 2))
        numa_bit = block_size_bits + dir_bits - 1

    for i in range(options.num_dirs):
        # Interleave every memory range across the directories.
        dir_ranges = []
        for r in system.mem_ranges:
            addr_range = m5.objects.AddrRange(r.start, size = r.size(),
                                              intlvHighBit = numa_bit,
                                              intlvBits = dir_bits,
                                              intlvMatch = i)
            dir_ranges.append(addr_range)

        dir_cntrl = DirCntrl(noTCCdir = True, TCC_select_num_bits = TCC_bits)
        dir_cntrl.create(options, dir_ranges, ruby_system, system)
        dir_cntrl.number_of_TBEs = options.num_tbes
        dir_cntrl.useL3OnWT = options.use_L3_on_WT
        dir_cntrl.inclusiveDir = not options.nonInclusiveDir

        # Connect the Directory controller to the ruby network
        dir_cntrl.requestFromCores = MessageBuffer(ordered = True)
        dir_cntrl.requestFromCores.slave = ruby_system.network.master

        dir_cntrl.responseFromCores = MessageBuffer()
        dir_cntrl.responseFromCores.slave = ruby_system.network.master

        dir_cntrl.unblockFromCores = MessageBuffer()
        dir_cntrl.unblockFromCores.slave = ruby_system.network.master

        dir_cntrl.probeToCore = MessageBuffer()
        dir_cntrl.probeToCore.master = ruby_system.network.slave

        dir_cntrl.responseToCore = MessageBuffer()
        dir_cntrl.responseToCore.master = ruby_system.network.slave

        dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
        dir_cntrl.requestToMemory = MessageBuffer()
        dir_cntrl.responseFromMemory = MessageBuffer()

        exec("system.dir_cntrl%d = dir_cntrl" % i)
        dir_cntrl_nodes.append(dir_cntrl)
        mainCluster.add(dir_cntrl)

    cpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    # One CorePair controller serves two CPUs; round up for odd counts.
    for i in range((options.num_cpus + 1) // 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)

    gpuCluster = Cluster(extBW = crossbar_bw, intBW = crossbar_bw)
    for i in range(options.num_compute_units):

        # TBEs set to max outstanding requests
        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = options.noL1

        exec("system.tcp_cntrl%d = tcp_cntrl" % i)
        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(tcp_cntrl.coalescer)
        tcp_cntrl_nodes.append(tcp_cntrl)

        # Connect the CP (TCP) controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    for i in range(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % i)
        # Add controllers and sequencers to the appropriate lists
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    # Because of wire buffers, num_tccs must equal num_tccdirs
    numa_bit = 6

    for i in range(options.num_tccs):

        tcc_cntrl = TCCCntrl()
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = options.gpu_to_dir_latency
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl_nodes.append(tcc_cntrl)
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
|
||||
@@ -1,780 +0,0 @@
|
||||
# Copyright (c) 2015 Advanced Micro Devices, Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# For use for simulation and test purposes only
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from this
|
||||
# software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import six
|
||||
import math
|
||||
import m5
|
||||
from m5.objects import *
|
||||
from m5.defines import buildEnv
|
||||
from m5.util import addToPath
|
||||
from .Ruby import send_evicts
|
||||
|
||||
addToPath('../')
|
||||
|
||||
from topologies.Cluster import Cluster
|
||||
|
||||
if six.PY3:
|
||||
long = int
|
||||
|
||||
class CntrlBase:
    """Mixin handing out monotonically increasing IDs for controllers,
    sequencers, and per-class versions.

    seqCount/cntrlCount deliberately update the attribute on CntrlBase
    itself so the count is global across all subclasses; versionCount
    updates the attribute on the calling class so each controller type
    numbers its instances independently.
    """
    _seqs = 0
    _cntrls = 0
    _version = 0

    @classmethod
    def seqCount(cls):
        # Global counter: always bump CntrlBase, never the subclass.
        nxt = CntrlBase._seqs
        CntrlBase._seqs = nxt + 1
        return nxt

    @classmethod
    def cntrlCount(cls):
        # Global counter: always bump CntrlBase, never the subclass.
        nxt = CntrlBase._cntrls
        CntrlBase._cntrls = nxt + 1
        return nxt

    @classmethod
    def versionCount(cls):
        # Per-class counter: the write shadows the inherited base value.
        nxt = cls._version
        cls._version = nxt + 1
        return nxt
|
||||
|
||||
#
|
||||
# Note: the L1 Cache latency is only used by the sequencer on fast path hits
|
||||
#
|
||||
class L1Cache(RubyCache):
    """CPU-side L1 cache (used for both the I and D caches)."""
    resourceStalls = False
    dataArrayBanks = 2
    tagArrayBanks = 2
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, size, assoc, options):
        # Size/assoc come from the command line; options is unused here
        # but kept for a uniform create() signature across cache classes.
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class L2Cache(RubyCache):
    """CPU-side L2 cache shared by a core pair."""
    resourceStalls = False
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, size, assoc, options):
        # Size/assoc come from the command line; options is unused here
        # but kept for a uniform create() signature across cache classes.
        self.size = MemorySize(size)
        self.assoc = assoc
        self.replacement_policy = TreePLRURP()
|
||||
|
||||
class CPCntrl(CorePair_Controller, CntrlBase):
    """CorePair (CPU-side) controller: two cores sharing an L2, with one
    sequencer per core."""

    def create(self, options, ruby_system, system):
        """Instantiate the L1/L2 caches and the two per-core sequencers."""
        self.version = self.versionCount()

        self.L1Icache = L1Cache()
        self.L1Icache.create(options.l1i_size, options.l1i_assoc, options)
        self.L1D0cache = L1Cache()
        self.L1D0cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L1D1cache = L1Cache()
        self.L1D1cache.create(options.l1d_size, options.l1d_assoc, options)
        self.L2cache = L2Cache()
        self.L2cache.create(options.l2_size, options.l2_assoc, options)

        # Core 0's sequencer fronts L1D0.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.dcache = self.L1D0cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.coreid = 0
        self.sequencer.is_cpu_sequencer = True

        # Core 1's sequencer fronts L1D1.
        self.sequencer1 = RubySequencer()
        self.sequencer1.version = self.seqCount()
        self.sequencer1.dcache = self.L1D1cache
        self.sequencer1.ruby_system = ruby_system
        self.sequencer1.coreid = 1
        self.sequencer1.is_cpu_sequencer = True

        # Fixed issue latency here (unlike the Baseline config, which
        # uses options.cpu_to_dir_latency).
        self.issue_latency = 1
        self.send_evictions = send_evicts(options)

        self.ruby_system = ruby_system

        # recycle_latency of 0/None means "use the controller default".
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCPCache(RubyCache):
    """GPU per-CU L1 (TCP) cache."""
    size = "16kB"
    assoc = 16
    dataArrayBanks = 16
    tagArrayBanks = 16
    dataAccessLatency = 4
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.tcp_size)
        # The bank/latency values below restate the class defaults.
        self.dataArrayBanks = 16
        self.tagArrayBanks = 16
        self.dataAccessLatency = 4
        self.tagAccessLatency = 1
        # NOTE(review): gated by the TCC (not TCP) stall flag — confirm
        # this reuse of no_tcc_resource_stalls is intentional.
        self.resourceStalls = options.no_tcc_resource_stalls
        self.replacement_policy = TreePLRURP(num_leaves = self.assoc)
|
||||
|
||||
class TCPCntrl(TCP_Controller, CntrlBase):
    """TCP (GPU L1) controller fronted by a VIPER coalescer."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        # Data-access latency is overridden from the command line here.
        self.L1cache = TCPCache(dataAccessLatency = options.TCP_latency)
        self.L1cache.create(options)
        self.issue_latency = 1

        # GPU memory requests enter through the coalescer; it handles data
        # only (support_inst_reqs = False — the SQC serves instructions).
        self.coalescer = VIPERCoalescer()
        self.coalescer.version = self.seqCount()
        self.coalescer.icache = self.L1cache
        self.coalescer.dcache = self.L1cache
        self.coalescer.ruby_system = ruby_system
        self.coalescer.support_inst_reqs = False
        self.coalescer.is_cpu_sequencer = False
        if options.tcp_deadlock_threshold:
            self.coalescer.deadlock_threshold = \
                options.tcp_deadlock_threshold
        self.coalescer.max_coalesces_per_cycle = \
            options.max_coalesces_per_cycle

        # A plain sequencer is also created; with use_seq_not_coal = False
        # the coalescer remains the active request path.
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        self.sequencer.is_cpu_sequencer = True

        self.use_seq_not_coal = False

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class SQCCache(RubyCache):
    """GPU instruction (SQC) cache."""
    dataArrayBanks = 8
    tagArrayBanks = 8
    dataAccessLatency = 1
    tagAccessLatency = 1

    def create(self, options):
        self.size = MemorySize(options.sqc_size)
        self.assoc = options.sqc_assoc
        self.replacement_policy = TreePLRURP(num_leaves = self.assoc)
|
||||
|
||||
class SQCCntrl(SQC_Controller, CntrlBase):
    """SQC (GPU instruction-fetch) controller."""

    def create(self, options, ruby_system, system):
        self.version = self.versionCount()
        self.L1cache = SQCCache()
        self.L1cache.create(options)
        self.L1cache.resourceStalls = False
        self.sequencer = RubySequencer()
        self.sequencer.version = self.seqCount()
        self.sequencer.dcache = self.L1cache
        self.sequencer.ruby_system = ruby_system
        # Instruction-only port: data requests are disabled and this does
        # not count as a CPU sequencer.
        self.sequencer.support_data_reqs = False
        self.sequencer.is_cpu_sequencer = False
        if options.sqc_deadlock_threshold:
            self.sequencer.deadlock_threshold = \
                options.sqc_deadlock_threshold

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
|
||||
|
||||
class TCC(RubyCache):
    """TCC (GPU L2) cache bank. Class defaults are overridden in create()."""
    size = MemorySize("256kB")
    assoc = 16
    dataAccessLatency = 8
    tagAccessLatency = 2
    resourceStalls = False

    def create(self, options):
        """Size one TCC bank from the options.

        The aggregate capacity and the 256 data/tag banks are split evenly
        across options.num_tccs instances; a minimum of 128 bytes per way
        is enforced.
        """
        self.assoc = options.tcc_assoc
        if hasattr(options, 'bw_scalor') and options.bw_scalor > 0:
            # Bandwidth-scaled mode: 128kB per compute unit, fixed banking.
            s = options.num_compute_units
            tcc_size = str(s * 128) + 'kB'
            self.size = MemorySize(tcc_size)
            self.dataArrayBanks = 64
            self.tagArrayBanks = 64
        else:
            self.size = MemorySize(options.tcc_size)
            # Use integer division: bank counts and sizes must be integral
            # (plain '/' yields floats under Python 3; the original code
            # relied on Python 2 truncating division).
            self.dataArrayBanks = 256 // options.num_tccs #number of data banks
            self.tagArrayBanks = 256 // options.num_tccs #number of tag banks
            self.size.value = self.size.value // options.num_tccs
        # Enforce at least 128 bytes per way. int() replaces the Python-2
        # 'long', which no longer exists in Python 3.
        if (self.size.value // int(self.assoc)) < 128:
            self.size.value = int(128 * self.assoc)
        # Set indexing above the block-offset bits plus the TCC-select
        # bits; math.log returns a float, so force an int.
        self.start_index_bit = int(math.log(options.cacheline_size, 2)) + \
            int(math.log(options.num_tccs, 2))
        self.replacement_policy = TreePLRURP(num_leaves=self.assoc)
class TCCCntrl(TCC_Controller, CntrlBase):
    """TCC (GPU L2) controller wrapping one TCC cache bank."""

    def create(self, options, ruby_system, system):
        """Instantiate and size this controller's L2 bank."""
        self.version = self.versionCount()
        self.L2cache = TCC()
        self.L2cache.create(options)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
class L3Cache(RubyCache):
    """Shared L3 slice; capacity and banking are split across directories."""
    dataArrayBanks = 16
    tagArrayBanks = 16

    def create(self, options, ruby_system, system):
        """Configure one per-directory L3 slice from the options."""
        self.size = MemorySize(options.l3_size)
        self.size.value //= options.num_dirs
        self.assoc = options.l3_assoc
        # Split the banks across directory slices exactly once — the
        # original repeated both divisions, quartering the bank counts.
        # Integer division keeps the bank-count params integral.
        self.dataArrayBanks //= options.num_dirs
        self.tagArrayBanks //= options.num_dirs
        self.dataAccessLatency = options.l3_data_latency
        self.tagAccessLatency = options.l3_tag_latency
        self.resourceStalls = False
        self.replacement_policy = TreePLRURP(num_leaves=self.assoc)
class L3Cntrl(L3Cache_Controller, CntrlBase):
    """L3 cache controller; talks to the directory over wire buffers."""

    def create(self, options, ruby_system, system):
        """Instantiate the L3 slice and derive its response latency."""
        self.version = self.versionCount()
        self.L3cache = L3Cache()
        self.L3cache.create(options, ruby_system, system)
        # The response latency is bounded by the slower of the two array
        # accesses.
        self.l3_response_latency = max(self.L3cache.dataAccessLatency,
                                       self.L3cache.tagAccessLatency)
        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        """Attach the wire buffers shared with the directory controller."""
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
# Directory controller: contains the directory memory, the L3 cache, and the
# associated state machine, which is used to accurately redirect a data
# request to the L3 cache or to memory. Permission requests do not come to
# this directory in region-based protocols, as they are handled exclusively
# by the region directory. However, the region directory controller uses
# this directory controller for sending probe requests and receiving probe
# responses.
class DirCntrl(Directory_Controller, CntrlBase):
    """Directory controller holding the directory memory and an L3 slice."""

    def create(self, options, dir_ranges, ruby_system, system):
        """Build the directory memory and L3 slice for the given ranges."""
        self.version = self.versionCount()
        # Fixed latencies: slow path toward cores, fast path toward the
        # region directory.
        self.response_latency = 25
        self.response_latency_regionDir = 1
        self.addr_ranges = dir_ranges
        self.directory = RubyDirectoryMemory()

        self.L3CacheMemory = L3Cache()
        self.L3CacheMemory.create(options, ruby_system, system)
        # An L3 hit costs the slower of the slice's two array accesses.
        self.l3_hit_latency = max(self.L3CacheMemory.dataAccessLatency,
                                  self.L3CacheMemory.tagAccessLatency)

        self.ruby_system = ruby_system
        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency

    def connectWireBuffers(self, req_to_dir, resp_to_dir, l3_unblock_to_dir,
                           req_to_l3, probe_to_l3, resp_to_l3):
        """Attach the wire buffers shared with the L3 controller."""
        self.reqToDir = req_to_dir
        self.respToDir = resp_to_dir
        self.l3UnblockToDir = l3_unblock_to_dir
        self.reqToL3 = req_to_l3
        self.probeToL3 = probe_to_l3
        self.respToL3 = resp_to_l3
# Region directory : Stores region permissions
|
||||
class RegionDir(RubyCache):
    # Region directory storage: one entry per tracked region.

    def create(self, options, ruby_system, system):
        """Size the region directory.

        A region spans options.blocks_per_region cache lines of 64 bytes;
        block_size is set to the region granularity.
        """
        self.block_size = "%dB" % (64 * options.blocks_per_region)
        # NOTE(review): block_size was just assigned a string; this size
        # arithmetic relies on the parameter system converting it to a
        # numeric value — confirm before touching this expression.
        self.size = options.region_dir_entries * \
            self.block_size * options.num_compute_units
        self.assoc = 8
        self.tagArrayBanks = 8
        self.tagAccessLatency = options.dir_tag_latency
        self.dataAccessLatency = 1
        # Negative flag: --no-resource-stalls stores False, disabling stalls.
        self.resourceStalls = options.no_resource_stalls
        # Index above the region offset: 6 block-offset bits plus
        # log2(blocks_per_region) region-offset bits.
        self.start_index_bit = 6 + int(math.log(options.blocks_per_region, 2))
        self.replacement_policy = TreePLRURP(num_leaves = self.assoc)
# Region directory controller : Contains region directory and associated state
|
||||
# machine for dealing with region coherence requests.
|
||||
class RegionCntrl(RegionDir_Controller, CntrlBase):
    """Region directory controller handling region coherence requests."""

    def create(self, options, ruby_system, system):
        """Build the region directory storage and migration policy."""
        self.version = self.versionCount()

        self.cacheMemory = RegionDir()
        self.cacheMemory.create(options, ruby_system, system)
        self.blocksPerRegion = options.blocks_per_region
        # Latency to the directory is bounded by the slower of the region
        # directory's two array accesses.
        self.toDirLatency = max(self.cacheMemory.dataAccessLatency,
                                self.cacheMemory.tagAccessLatency)
        self.ruby_system = ruby_system

        # The three migration policies are mutually exclusive.
        self.always_migrate = options.always_migrate
        self.sym_migrate = options.symmetric_migrate
        self.asym_migrate = options.asymmetric_migrate
        if self.always_migrate:
            assert(not self.asym_migrate and not self.sym_migrate)
        if self.sym_migrate:
            assert(not self.always_migrate and not self.asym_migrate)
        if self.asym_migrate:
            assert(not self.always_migrate and not self.sym_migrate)

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
# Region Buffer: A region directory cache which avoids some potential
|
||||
# long latency lookup of region directory for getting region permissions
|
||||
class RegionBuffer(RubyCache):
    """Cache of the region directory that short-circuits long-latency
    permission lookups. Geometry here is a default; RBCntrl.create()
    overrides most of it per instance."""
    assoc = 4
    dataArrayBanks = 256
    tagArrayBanks = 256
    dataAccessLatency = 1
    tagAccessLatency = 1
    resourceStalls = True
class RBCntrl(RegionBuffer_Controller, CntrlBase):
    # Region buffer controller; one sits beside each CP and TCC controller.

    def create(self, options, ruby_system, system):
        """Configure the region buffer controller and its backing cache."""
        self.version = self.versionCount()
        self.cacheMemory = RegionBuffer()
        # Negative flag: --no-tcc-resource-stalls stores False.
        self.cacheMemory.resourceStalls = options.no_tcc_resource_stalls
        self.cacheMemory.dataArrayBanks = 64
        self.cacheMemory.tagArrayBanks = 64
        self.blocksPerRegion = options.blocks_per_region
        # One cache "block" is a whole region (64B lines * blocks/region).
        self.cacheMemory.block_size = "%dB" % (64 * self.blocksPerRegion)
        self.cacheMemory.start_index_bit = \
            6 + int(math.log(self.blocksPerRegion, 2))
        # NOTE(review): block_size was assigned a string above; this size
        # arithmetic relies on the parameter system converting it — confirm.
        self.cacheMemory.size = options.region_buffer_entries * \
            self.cacheMemory.block_size * options.num_compute_units
        self.toDirLatency = options.gpu_to_dir_latency
        self.toRegionDirLatency = options.cpu_to_dir_latency
        self.noTCCdir = True
        TCC_bits = int(math.log(options.num_tccs, 2))
        self.TCC_select_num_bits = TCC_bits
        self.ruby_system = ruby_system

        if options.recycle_latency:
            self.recycle_latency = options.recycle_latency
        # Replacement policy is built last, after assoc is final.
        self.cacheMemory.replacement_policy = \
            TreePLRURP(num_leaves = self.cacheMemory.assoc)
def define_options(parser):
    """Register the GPU_VIPER_Region command-line options on *parser*.

    *parser* is an optparse-style parser; every option is added with its
    default so that parse_args([]) yields a fully populated options object.
    """
    parser.add_option("--num-subcaches", type="int", default=4)
    parser.add_option("--l3-data-latency", type="int", default=20)
    parser.add_option("--l3-tag-latency", type="int", default=15)
    parser.add_option("--cpu-to-dir-latency", type="int", default=120)
    parser.add_option("--gpu-to-dir-latency", type="int", default=60)
    # Negative flags: passing them stores False, disabling stall modeling.
    parser.add_option("--no-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--no-tcc-resource-stalls", action="store_false",
                      default=True)
    parser.add_option("--num-tbes", type="int", default=32)
    parser.add_option("--l2-latency", type="int", default=50) # load to use
    parser.add_option("--num-tccs", type="int", default=1,
                      help="number of TCC banks in the GPU")

    parser.add_option("--sqc-size", type='string', default='32kB',
                      help="SQC cache size")
    parser.add_option("--sqc-assoc", type='int', default=8,
                      help="SQC cache assoc")
    parser.add_option("--sqc-deadlock-threshold", type='int',
                      help="Set the SQC deadlock threshold to some value")

    # Help text fixed: --WB_L1 controls the L1, not the L2 (copy-paste bug).
    parser.add_option("--WB_L1", action="store_true",
                      default=False, help="L1 Writeback Cache")
    parser.add_option("--WB_L2", action="store_true",
                      default=False, help="L2 Writeback Cache")
    parser.add_option("--TCP_latency",
                      type="int", default=4, help="TCP latency")
    parser.add_option("--TCC_latency",
                      type="int", default=16, help="TCC latency")
    parser.add_option("--tcc-size", type='string', default='2MB',
                      help="aggregate tcc size")
    parser.add_option("--tcc-assoc", type='int', default=16,
                      help="tcc assoc")
    parser.add_option("--tcp-size", type='string', default='16kB',
                      help="tcp size")
    parser.add_option("--tcp-deadlock-threshold", type='int',
                      help="Set the TCP deadlock threshold to some value")
    parser.add_option("--max-coalesces-per-cycle", type="int", default=1,
                      help="Maximum insts that may coalesce in a cycle")

    parser.add_option("--dir-tag-latency", type="int", default=4)
    parser.add_option("--dir-tag-banks", type="int", default=4)
    parser.add_option("--blocks-per-region", type="int", default=16)
    parser.add_option("--dir-entries", type="int", default=8192)

    # Region buffer is a cache of the region directory, hence the region
    # directory is inclusive with respect to the region buffer. However,
    # the region directory is non-inclusive with respect to the caches in
    # the system.
    parser.add_option("--region-dir-entries", type="int", default=1024)
    parser.add_option("--region-buffer-entries", type="int", default=512)

    parser.add_option("--always-migrate",
                      action="store_true", default=False)
    parser.add_option("--symmetric-migrate",
                      action="store_true", default=False)
    parser.add_option("--asymmetric-migrate",
                      action="store_true", default=False)
    parser.add_option("--use-L3-on-WT", action="store_true", default=False)
||||
def create_system(options, full_system, system, dma_devices, bootmem,
                  ruby_system):
    """Build the GPU_VIPER_Region Ruby topology.

    Creates a CPU cluster (CP controllers plus CPU-side region buffers),
    a GPU cluster (TCPs, SQCs, TCCs plus GPU-side region buffers), and a
    main cluster holding the directory and region directory, then wires
    every controller's message buffers to the Ruby network.

    Returns (cpu_sequencers, dir_cntrl_nodes, mainCluster).
    """
    if buildEnv['PROTOCOL'] != 'GPU_VIPER_Region':
        panic("This script requires the GPU_VIPER_Region protocol to be built.")

    cpu_sequencers = []

    #
    # The ruby network creation expects the list of nodes in the system to be
    # consistent with the NetDest list. Therefore the l1 controller nodes
    # must be listed before the directory nodes and directory nodes before
    # dma nodes, etc.
    #
    dir_cntrl_nodes = []

    # Number of address bits used to select among the TCC banks.
    TCC_bits = int(math.log(options.num_tccs, 2))

    #
    # Must create the individual controllers before the network to ensure the
    # controller constructors are called before the network constructor
    #

    # For an odd number of CPUs, still create the right number of controllers
    crossbar_bw = 16 * options.num_compute_units #Assuming a 2GHz clock
    cpuCluster = Cluster(extBW = (crossbar_bw), intBW=crossbar_bw)
    # One CP controller per core pair; round up for odd CPU counts.
    for i in range((options.num_cpus + 1) // 2):

        cp_cntrl = CPCntrl()
        cp_cntrl.create(options, ruby_system, system)

        # Each CP controller gets its own CPU-side region buffer.
        rb_cntrl = RBCntrl()
        rb_cntrl.create(options, ruby_system, system)
        rb_cntrl.number_of_TBEs = 256
        rb_cntrl.isOnCPU = True

        cp_cntrl.regionBufferNum = rb_cntrl.version

        exec("system.cp_cntrl%d = cp_cntrl" % i)
        exec("system.rb_cntrl%d = rb_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.extend([cp_cntrl.sequencer, cp_cntrl.sequencer1])

        # Connect the CP controllers and the network
        cp_cntrl.requestFromCore = MessageBuffer()
        cp_cntrl.requestFromCore.master = ruby_system.network.slave

        cp_cntrl.responseFromCore = MessageBuffer()
        cp_cntrl.responseFromCore.master = ruby_system.network.slave

        cp_cntrl.unblockFromCore = MessageBuffer()
        cp_cntrl.unblockFromCore.master = ruby_system.network.slave

        cp_cntrl.probeToCore = MessageBuffer()
        cp_cntrl.probeToCore.slave = ruby_system.network.master

        cp_cntrl.responseToCore = MessageBuffer()
        cp_cntrl.responseToCore.slave = ruby_system.network.master

        cp_cntrl.mandatoryQueue = MessageBuffer()
        cp_cntrl.triggerQueue = MessageBuffer(ordered = True)

        # Connect the RB controllers to the ruby network
        rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
        rb_cntrl.requestFromCore.slave = ruby_system.network.master

        rb_cntrl.responseFromCore = MessageBuffer()
        rb_cntrl.responseFromCore.slave = ruby_system.network.master

        rb_cntrl.requestToNetwork = MessageBuffer()
        rb_cntrl.requestToNetwork.master = ruby_system.network.slave

        rb_cntrl.notifyFromRegionDir = MessageBuffer()
        rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.probeFromRegionDir = MessageBuffer()
        rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.unblockFromDir = MessageBuffer()
        rb_cntrl.unblockFromDir.slave = ruby_system.network.master

        rb_cntrl.responseToRegDir = MessageBuffer()
        rb_cntrl.responseToRegDir.master = ruby_system.network.slave

        rb_cntrl.triggerQueue = MessageBuffer(ordered = True)

        cpuCluster.add(cp_cntrl)
        cpuCluster.add(rb_cntrl)

    gpuCluster = Cluster(extBW = (crossbar_bw), intBW = crossbar_bw)
    # One TCP (GPU L1D) controller per compute unit.
    for i in range(options.num_compute_units):

        tcp_cntrl = TCPCntrl(TCC_select_num_bits = TCC_bits,
                             issue_latency = 1,
                             number_of_TBEs = 2560)
        # TBEs set to max outstanding requests
        tcp_cntrl.create(options, ruby_system, system)
        tcp_cntrl.WB = options.WB_L1
        tcp_cntrl.disableL1 = False

        exec("system.tcp_cntrl%d = tcp_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(tcp_cntrl.coalescer)

        # Connect the CP (TCP) controllers to the ruby network
        tcp_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.requestFromTCP.master = ruby_system.network.slave

        tcp_cntrl.responseFromTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseFromTCP.master = ruby_system.network.slave

        tcp_cntrl.unblockFromCore = MessageBuffer()
        tcp_cntrl.unblockFromCore.master = ruby_system.network.slave

        tcp_cntrl.probeToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.probeToTCP.slave = ruby_system.network.master

        tcp_cntrl.responseToTCP = MessageBuffer(ordered = True)
        tcp_cntrl.responseToTCP.slave = ruby_system.network.master

        tcp_cntrl.mandatoryQueue = MessageBuffer()

        gpuCluster.add(tcp_cntrl)

    # One SQC (GPU L1I) controller per SQC instance.
    for i in range(options.num_sqc):

        sqc_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
        sqc_cntrl.create(options, ruby_system, system)

        exec("system.sqc_cntrl%d = sqc_cntrl" % i)
        #
        # Add controllers and sequencers to the appropriate lists
        #
        cpu_sequencers.append(sqc_cntrl.sequencer)

        # Connect the SQC controller to the ruby network
        sqc_cntrl.requestFromSQC = MessageBuffer(ordered = True)
        sqc_cntrl.requestFromSQC.master = ruby_system.network.slave

        sqc_cntrl.probeToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.probeToSQC.slave = ruby_system.network.master

        sqc_cntrl.responseToSQC = MessageBuffer(ordered = True)
        sqc_cntrl.responseToSQC.slave = ruby_system.network.master

        sqc_cntrl.mandatoryQueue = MessageBuffer()

        # SQC also in GPU cluster
        gpuCluster.add(sqc_cntrl)

    # Default NUMA bit; may be overridden below from options.numa_high_bit.
    numa_bit = 6

    # One TCC (GPU L2) controller per TCC bank, each with its own GPU-side
    # region buffer.
    for i in range(options.num_tccs):

        tcc_cntrl = TCCCntrl()
        tcc_cntrl.create(options, ruby_system, system)
        tcc_cntrl.l2_request_latency = 1
        tcc_cntrl.l2_response_latency = options.TCC_latency
        tcc_cntrl.WB = options.WB_L2
        tcc_cntrl.number_of_TBEs = 2560 * options.num_compute_units

        # Connect the TCC controllers to the ruby network
        tcc_cntrl.requestFromTCP = MessageBuffer(ordered = True)
        tcc_cntrl.requestFromTCP.slave = ruby_system.network.master

        tcc_cntrl.responseToCore = MessageBuffer(ordered = True)
        tcc_cntrl.responseToCore.master = ruby_system.network.slave

        tcc_cntrl.probeFromNB = MessageBuffer()
        tcc_cntrl.probeFromNB.slave = ruby_system.network.master

        tcc_cntrl.responseFromNB = MessageBuffer()
        tcc_cntrl.responseFromNB.slave = ruby_system.network.master

        tcc_cntrl.requestToNB = MessageBuffer(ordered = True)
        tcc_cntrl.requestToNB.master = ruby_system.network.slave

        tcc_cntrl.responseToNB = MessageBuffer()
        tcc_cntrl.responseToNB.master = ruby_system.network.slave

        tcc_cntrl.unblockToNB = MessageBuffer()
        tcc_cntrl.unblockToNB.master = ruby_system.network.slave

        tcc_cntrl.triggerQueue = MessageBuffer(ordered = True)

        rb_cntrl = RBCntrl()
        rb_cntrl.create(options, ruby_system, system)
        rb_cntrl.number_of_TBEs = 2560 * options.num_compute_units
        rb_cntrl.isOnCPU = False

        # Connect the RB controllers to the ruby network
        rb_cntrl.requestFromCore = MessageBuffer(ordered = True)
        rb_cntrl.requestFromCore.slave = ruby_system.network.master

        rb_cntrl.responseFromCore = MessageBuffer()
        rb_cntrl.responseFromCore.slave = ruby_system.network.master

        rb_cntrl.requestToNetwork = MessageBuffer()
        rb_cntrl.requestToNetwork.master = ruby_system.network.slave

        rb_cntrl.notifyFromRegionDir = MessageBuffer()
        rb_cntrl.notifyFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.probeFromRegionDir = MessageBuffer()
        rb_cntrl.probeFromRegionDir.slave = ruby_system.network.master

        rb_cntrl.unblockFromDir = MessageBuffer()
        rb_cntrl.unblockFromDir.slave = ruby_system.network.master

        rb_cntrl.responseToRegDir = MessageBuffer()
        rb_cntrl.responseToRegDir.master = ruby_system.network.slave

        rb_cntrl.triggerQueue = MessageBuffer(ordered = True)

        tcc_cntrl.regionBufferNum = rb_cntrl.version

        exec("system.tcc_cntrl%d = tcc_cntrl" % i)
        exec("system.tcc_rb_cntrl%d = rb_cntrl" % i)

        # TCC cntrls added to the GPU cluster
        gpuCluster.add(tcc_cntrl)
        gpuCluster.add(rb_cntrl)

    # Because of wire buffers, num_l3caches must equal num_dirs
    # Region coherence only works with 1 dir
    assert(options.num_l3caches == options.num_dirs == 1)

    # This is the base crossbar that connects the L3s, Dirs, and cpu/gpu
    # Clusters
    mainCluster = Cluster(intBW = crossbar_bw)

    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # if the numa_bit is not specified, set the directory bits as the
        # lowest bits above the block offset bits, and the numa_bit as the
        # highest of those directory bits
        dir_bits = int(math.log(options.num_dirs, 2))
        block_size_bits = int(math.log(options.cacheline_size, 2))
        numa_bit = block_size_bits + dir_bits - 1

    dir_ranges = []
    for r in system.mem_ranges:
        # NOTE(review): dir_bits is only assigned in the else-branch above,
        # so this raises NameError when --numa-high-bit is given; intlvMatch
        # reuses the stale loop index 'i' from the TCC loop — benign only
        # because num_dirs == 1 is asserted above. Confirm before reuse.
        addr_range = m5.objects.AddrRange(r.start, size = r.size(),
                                          intlvHighBit = numa_bit,
                                          intlvBits = dir_bits,
                                          intlvMatch = i)
        dir_ranges.append(addr_range)

    # Single directory (num_dirs == 1 asserted above).
    dir_cntrl = DirCntrl()
    dir_cntrl.create(options, dir_ranges, ruby_system, system)
    dir_cntrl.number_of_TBEs = 2560 * options.num_compute_units
    dir_cntrl.useL3OnWT = options.use_L3_on_WT

    # Connect the Directory controller to the ruby network
    dir_cntrl.requestFromCores = MessageBuffer()
    dir_cntrl.requestFromCores.slave = ruby_system.network.master

    dir_cntrl.responseFromCores = MessageBuffer()
    dir_cntrl.responseFromCores.slave = ruby_system.network.master

    dir_cntrl.unblockFromCores = MessageBuffer()
    dir_cntrl.unblockFromCores.slave = ruby_system.network.master

    dir_cntrl.probeToCore = MessageBuffer()
    dir_cntrl.probeToCore.master = ruby_system.network.slave

    dir_cntrl.responseToCore = MessageBuffer()
    dir_cntrl.responseToCore.master = ruby_system.network.slave

    dir_cntrl.reqFromRegBuf = MessageBuffer()
    dir_cntrl.reqFromRegBuf.slave = ruby_system.network.master

    dir_cntrl.reqToRegDir = MessageBuffer(ordered = True)
    dir_cntrl.reqToRegDir.master = ruby_system.network.slave

    dir_cntrl.reqFromRegDir = MessageBuffer(ordered = True)
    dir_cntrl.reqFromRegDir.slave = ruby_system.network.master

    dir_cntrl.unblockToRegDir = MessageBuffer()
    dir_cntrl.unblockToRegDir.master = ruby_system.network.slave

    dir_cntrl.triggerQueue = MessageBuffer(ordered = True)
    dir_cntrl.L3triggerQueue = MessageBuffer(ordered = True)
    dir_cntrl.requestToMemory = MessageBuffer()
    dir_cntrl.responseFromMemory = MessageBuffer()

    # NOTE(review): 'i' here is the stale index from the TCC loop (0 when
    # num_tccs == 1); there is exactly one directory.
    exec("system.dir_cntrl%d = dir_cntrl" % i)
    dir_cntrl_nodes.append(dir_cntrl)

    mainCluster.add(dir_cntrl)

    # Single region directory controller.
    reg_cntrl = RegionCntrl(noTCCdir=True,TCC_select_num_bits = TCC_bits)
    reg_cntrl.create(options, ruby_system, system)
    reg_cntrl.number_of_TBEs = options.num_tbes
    reg_cntrl.cpuRegionBufferNum = system.rb_cntrl0.version
    reg_cntrl.gpuRegionBufferNum = system.tcc_rb_cntrl0.version

    # Connect the Region Dir controllers to the ruby network
    reg_cntrl.requestToDir = MessageBuffer(ordered = True)
    reg_cntrl.requestToDir.master = ruby_system.network.slave

    reg_cntrl.notifyToRBuffer = MessageBuffer()
    reg_cntrl.notifyToRBuffer.master = ruby_system.network.slave

    reg_cntrl.probeToRBuffer = MessageBuffer()
    reg_cntrl.probeToRBuffer.master = ruby_system.network.slave

    reg_cntrl.responseFromRBuffer = MessageBuffer()
    reg_cntrl.responseFromRBuffer.slave = ruby_system.network.master

    reg_cntrl.requestFromRegBuf = MessageBuffer()
    reg_cntrl.requestFromRegBuf.slave = ruby_system.network.master

    reg_cntrl.triggerQueue = MessageBuffer(ordered = True)

    # NOTE(review): stale 'i' again; see the dir_cntrl note above.
    exec("system.reg_cntrl%d = reg_cntrl" % i)

    mainCluster.add(reg_cntrl)

    # Assuming no DMA devices
    assert(len(dma_devices) == 0)

    # Add cpu/gpu clusters to main cluster
    mainCluster.add(cpuCluster)
    mainCluster.add(gpuCluster)

    ruby_system.network.number_of_virtual_networks = 10

    return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
||||
@@ -1,665 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
|
||||
: Sequencer* sequencer;
|
||||
CacheMemory * L1cache;
|
||||
int TCC_select_num_bits;
|
||||
Cycles issue_latency := 80; // time to send data down to TCC
|
||||
Cycles l2_hit_latency := 18;
|
||||
|
||||
MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request";
|
||||
MessageBuffer * responseFromSQC, network="To", virtual_network="3", vnet_type="response";
|
||||
MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
|
||||
|
||||
MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request";
|
||||
MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response";
|
||||
|
||||
MessageBuffer * mandatoryQueue;
|
||||
{
|
||||
state_declaration(State, desc="SQC Cache States", default="SQC_State_I") {
|
||||
I, AccessPermission:Invalid, desc="Invalid";
|
||||
S, AccessPermission:Read_Only, desc="Shared";
|
||||
|
||||
I_S, AccessPermission:Busy, desc="Invalid, issued RdBlkS, have not seen response yet";
|
||||
S_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack";
|
||||
I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCCdir for canceled WB";
|
||||
}
|
||||
|
||||
enumeration(Event, desc="SQC Events") {
|
||||
// Core initiated
|
||||
Fetch, desc="Fetch";
|
||||
|
||||
//TCC initiated
|
||||
TCC_AckS, desc="TCC Ack to Core Request";
|
||||
TCC_AckWB, desc="TCC Ack for WB";
|
||||
TCC_NackWB, desc="TCC Nack for WB";
|
||||
|
||||
// Mem sys initiated
|
||||
Repl, desc="Replacing block from cache";
|
||||
|
||||
// Probe Events
|
||||
PrbInvData, desc="probe, return M data";
|
||||
PrbInv, desc="probe, no need for data";
|
||||
PrbShrData, desc="probe downgrade, return data";
|
||||
}
|
||||
|
||||
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
|
||||
DataArrayRead, desc="Read the data array";
|
||||
DataArrayWrite, desc="Write the data array";
|
||||
TagArrayRead, desc="Read the data array";
|
||||
TagArrayWrite, desc="Write the data array";
|
||||
}
|
||||
|
||||
|
||||
structure(Entry, desc="...", interface="AbstractCacheEntry") {
|
||||
State CacheState, desc="cache state";
|
||||
bool Dirty, desc="Is the data dirty (diff than memory)?";
|
||||
DataBlock DataBlk, desc="data for the block";
|
||||
bool FromL2, default="false", desc="block just moved from L2";
|
||||
}
|
||||
|
||||
structure(TBE, desc="...") {
|
||||
State TBEState, desc="Transient state";
|
||||
DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
|
||||
bool Dirty, desc="Is the data dirty (different than memory)?";
|
||||
int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
|
||||
bool Shared, desc="Victim hit by shared probe";
|
||||
}
|
||||
|
||||
structure(TBETable, external="yes") {
|
||||
TBE lookup(Addr);
|
||||
void allocate(Addr);
|
||||
void deallocate(Addr);
|
||||
bool isPresent(Addr);
|
||||
}
|
||||
|
||||
TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs";
|
||||
int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
|
||||
|
||||
Tick clockEdge();
|
||||
Tick cyclesToTicks(Cycles c);
|
||||
|
||||
void set_cache_entry(AbstractCacheEntry b);
|
||||
void unset_cache_entry();
|
||||
void set_tbe(TBE b);
|
||||
void unset_tbe();
|
||||
void wakeUpAllBuffers();
|
||||
void wakeUpBuffers(Addr a);
|
||||
Cycles curCycle();
|
||||
|
||||
// Internal functions
|
||||
Entry getCacheEntry(Addr address), return_by_pointer="yes" {
|
||||
Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
|
||||
return cache_entry;
|
||||
}
|
||||
|
||||
DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
return tbe.DataBlk;
|
||||
} else {
|
||||
return getCacheEntry(addr).DataBlk;
|
||||
}
|
||||
}
|
||||
|
||||
State getState(TBE tbe, Entry cache_entry, Addr addr) {
|
||||
if(is_valid(tbe)) {
|
||||
return tbe.TBEState;
|
||||
} else if (is_valid(cache_entry)) {
|
||||
return cache_entry.CacheState;
|
||||
}
|
||||
return State:I;
|
||||
}
|
||||
|
||||
void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
|
||||
if (is_valid(tbe)) {
|
||||
tbe.TBEState := state;
|
||||
}
|
||||
|
||||
if (is_valid(cache_entry)) {
|
||||
cache_entry.CacheState := state;
|
||||
}
|
||||
}
|
||||
|
||||
AccessPermission getAccessPermission(Addr addr) {
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
return SQC_State_to_permission(tbe.TBEState);
|
||||
}
|
||||
|
||||
Entry cache_entry := getCacheEntry(addr);
|
||||
if(is_valid(cache_entry)) {
|
||||
return SQC_State_to_permission(cache_entry.CacheState);
|
||||
}
|
||||
|
||||
return AccessPermission:NotPresent;
|
||||
}
|
||||
|
||||
void setAccessPermission(Entry cache_entry, Addr addr, State state) {
|
||||
if (is_valid(cache_entry)) {
|
||||
cache_entry.changePermission(SQC_State_to_permission(state));
|
||||
}
|
||||
}
|
||||
|
||||
void functionalRead(Addr addr, Packet *pkt) {
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
testAndRead(addr, tbe.DataBlk, pkt);
|
||||
} else {
|
||||
functionalMemoryRead(pkt);
|
||||
}
|
||||
}
|
||||
|
||||
int functionalWrite(Addr addr, Packet *pkt) {
|
||||
int num_functional_writes := 0;
|
||||
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
num_functional_writes := num_functional_writes +
|
||||
testAndWrite(addr, tbe.DataBlk, pkt);
|
||||
}
|
||||
|
||||
num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
|
||||
return num_functional_writes;
|
||||
}
|
||||
|
||||
void recordRequestType(RequestType request_type, Addr addr) {
|
||||
if (request_type == RequestType:DataArrayRead) {
|
||||
L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
|
||||
} else if (request_type == RequestType:DataArrayWrite) {
|
||||
L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
|
||||
} else if (request_type == RequestType:TagArrayRead) {
|
||||
L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
|
||||
} else if (request_type == RequestType:TagArrayWrite) {
|
||||
L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
|
||||
}
|
||||
}
|
||||
|
||||
bool checkResourceAvailable(RequestType request_type, Addr addr) {
|
||||
if (request_type == RequestType:DataArrayRead) {
|
||||
return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
|
||||
} else if (request_type == RequestType:DataArrayWrite) {
|
||||
return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
|
||||
} else if (request_type == RequestType:TagArrayRead) {
|
||||
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
|
||||
} else if (request_type == RequestType:TagArrayWrite) {
|
||||
return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
|
||||
} else {
|
||||
error("Invalid RequestType type in checkResourceAvailable");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Out Ports
|
||||
|
||||
out_port(requestNetwork_out, CPURequestMsg, requestFromSQC);
|
||||
out_port(responseNetwork_out, ResponseMsg, responseFromSQC);
|
||||
out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
|
||||
|
||||
// In Ports
|
||||
|
||||
in_port(probeNetwork_in, TDProbeRequestMsg, probeToSQC) {
|
||||
if (probeNetwork_in.isReady(clockEdge())) {
|
||||
peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") {
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
|
||||
if (in_msg.Type == ProbeRequestType:PrbInv) {
|
||||
if (in_msg.ReturnData) {
|
||||
trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
|
||||
}
|
||||
} else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
|
||||
assert(in_msg.ReturnData);
|
||||
trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
in_port(responseToSQC_in, ResponseMsg, responseToSQC) {
|
||||
if (responseToSQC_in.isReady(clockEdge())) {
|
||||
peek(responseToSQC_in, ResponseMsg, block_on="addr") {
|
||||
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
|
||||
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
|
||||
if (in_msg.State == CoherenceState:Shared) {
|
||||
trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
error("SQC should not receive TDSysResp other than CoherenceState:Shared");
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
|
||||
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
|
||||
} else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) {
|
||||
trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
error("Unexpected Response Message to Core");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
|
||||
if (mandatoryQueue_in.isReady(clockEdge())) {
|
||||
peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
|
||||
Entry cache_entry := getCacheEntry(in_msg.LineAddress);
|
||||
TBE tbe := TBEs.lookup(in_msg.LineAddress);
|
||||
|
||||
assert(in_msg.Type == RubyRequestType:IFETCH);
|
||||
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
|
||||
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
|
||||
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Actions
|
||||
|
||||
action(ic_invCache, "ic", desc="invalidate cache") {
|
||||
if(is_valid(cache_entry)) {
|
||||
L1cache.deallocate(address);
|
||||
}
|
||||
unset_cache_entry();
|
||||
}
|
||||
|
||||
action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
|
||||
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceRequestType:RdBlkS;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.MessageSize := MessageSizeType:Request_Control;
|
||||
out_msg.InitialRequestTime := curCycle();
|
||||
}
|
||||
}
|
||||
|
||||
action(vc_victim, "vc", desc="Victimize E/S Data") {
|
||||
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.MessageSize := MessageSizeType:Request_Control;
|
||||
out_msg.Type := CoherenceRequestType:VicClean;
|
||||
out_msg.InitialRequestTime := curCycle();
|
||||
if (cache_entry.CacheState == State:S) {
|
||||
out_msg.Shared := true;
|
||||
} else {
|
||||
out_msg.Shared := false;
|
||||
}
|
||||
out_msg.InitialRequestTime := curCycle();
|
||||
}
|
||||
}
|
||||
|
||||
action(a_allocate, "a", desc="allocate block") {
|
||||
if (is_invalid(cache_entry)) {
|
||||
set_cache_entry(L1cache.allocate(address, new Entry));
|
||||
}
|
||||
}
|
||||
|
||||
action(t_allocateTBE, "t", desc="allocate TBE Entry") {
|
||||
check_allocate(TBEs);
|
||||
assert(is_valid(cache_entry));
|
||||
TBEs.allocate(address);
|
||||
set_tbe(TBEs.lookup(address));
|
||||
tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
|
||||
tbe.Dirty := cache_entry.Dirty;
|
||||
tbe.Shared := false;
|
||||
}
|
||||
|
||||
action(d_deallocateTBE, "d", desc="Deallocate TBE") {
|
||||
TBEs.deallocate(address);
|
||||
unset_tbe();
|
||||
}
|
||||
|
||||
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
|
||||
mandatoryQueue_in.dequeue(clockEdge());
|
||||
}
|
||||
|
||||
action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
|
||||
responseToSQC_in.dequeue(clockEdge());
|
||||
}
|
||||
|
||||
action(pp_popProbeQueue, "pp", desc="pop probe queue") {
|
||||
probeNetwork_in.dequeue(clockEdge());
|
||||
}
|
||||
|
||||
action(l_loadDone, "l", desc="local load done") {
|
||||
assert(is_valid(cache_entry));
|
||||
sequencer.readCallback(address, cache_entry.DataBlk,
|
||||
false, MachineType:L1Cache);
|
||||
APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
|
||||
}
|
||||
|
||||
action(xl_loadDone, "xl", desc="remote load done") {
|
||||
peek(responseToSQC_in, ResponseMsg) {
|
||||
assert(is_valid(cache_entry));
|
||||
sequencer.readCallback(address,
|
||||
cache_entry.DataBlk,
|
||||
false,
|
||||
machineIDToMachineType(in_msg.Sender),
|
||||
in_msg.InitialRequestTime,
|
||||
in_msg.ForwardRequestTime,
|
||||
in_msg.ProbeRequestStartTime);
|
||||
APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
|
||||
}
|
||||
}
|
||||
|
||||
action(w_writeCache, "w", desc="write data to cache") {
|
||||
peek(responseToSQC_in, ResponseMsg) {
|
||||
assert(is_valid(cache_entry));
|
||||
cache_entry.DataBlk := in_msg.DataBlk;
|
||||
cache_entry.Dirty := in_msg.Dirty;
|
||||
}
|
||||
}
|
||||
|
||||
action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
|
||||
peek(responseToSQC_in, ResponseMsg) {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:StaleNotif;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.MessageSize := MessageSizeType:Response_Control;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(wb_data, "wb", desc="write back data") {
|
||||
peek(responseToSQC_in, ResponseMsg) {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUData;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.DataBlk := tbe.DataBlk;
|
||||
out_msg.Dirty := tbe.Dirty;
|
||||
if (tbe.Shared) {
|
||||
out_msg.NbReqShared := true;
|
||||
} else {
|
||||
out_msg.NbReqShared := false;
|
||||
}
|
||||
out_msg.State := CoherenceState:Shared; // faux info
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Data;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
|
||||
out_msg.Sender := machineID;
|
||||
// will this always be ok? probably not for multisocket
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.Dirty := false;
|
||||
out_msg.Hit := false;
|
||||
out_msg.Ntsl := true;
|
||||
out_msg.State := CoherenceState:NA;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Control;
|
||||
}
|
||||
}
|
||||
|
||||
action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
|
||||
out_msg.Sender := machineID;
|
||||
// will this always be ok? probably not for multisocket
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.Dirty := false;
|
||||
out_msg.Ntsl := true;
|
||||
out_msg.Hit := false;
|
||||
out_msg.State := CoherenceState:NA;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Control;
|
||||
}
|
||||
}
|
||||
|
||||
action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
|
||||
out_msg.Sender := machineID;
|
||||
// will this always be ok? probably not for multisocket
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.Dirty := false; // only true if sending back data i think
|
||||
out_msg.Hit := false;
|
||||
out_msg.Ntsl := false;
|
||||
out_msg.State := CoherenceState:NA;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Control;
|
||||
}
|
||||
}
|
||||
|
||||
action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
assert(is_valid(cache_entry) || is_valid(tbe));
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUPrbResp;
|
||||
out_msg.Sender := machineID;
|
||||
// will this always be ok? probably not for multisocket
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.DataBlk := getDataBlock(address);
|
||||
if (is_valid(tbe)) {
|
||||
out_msg.Dirty := tbe.Dirty;
|
||||
} else {
|
||||
out_msg.Dirty := cache_entry.Dirty;
|
||||
}
|
||||
out_msg.Hit := true;
|
||||
out_msg.State := CoherenceState:NA;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Data;
|
||||
}
|
||||
}
|
||||
|
||||
action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
|
||||
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
|
||||
assert(is_valid(cache_entry) || is_valid(tbe));
|
||||
assert(is_valid(cache_entry));
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUPrbResp;
|
||||
out_msg.Sender := machineID;
|
||||
// will this always be ok? probably not for multisocket
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.DataBlk := getDataBlock(address);
|
||||
if (is_valid(tbe)) {
|
||||
out_msg.Dirty := tbe.Dirty;
|
||||
} else {
|
||||
out_msg.Dirty := cache_entry.Dirty;
|
||||
}
|
||||
out_msg.Hit := true;
|
||||
out_msg.State := CoherenceState:NA;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Data;
|
||||
}
|
||||
}
|
||||
|
||||
action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
|
||||
assert(is_valid(tbe));
|
||||
tbe.Shared := true;
|
||||
}
|
||||
|
||||
action(uu_sendUnblock, "uu", desc="state changed, unblock") {
|
||||
enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.MessageSize := MessageSizeType:Unblock_Control;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
|
||||
action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
|
||||
probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
|
||||
}
|
||||
|
||||
action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
|
||||
mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
|
||||
}
|
||||
|
||||
// Transitions
|
||||
|
||||
// transitions from base
|
||||
transition(I, Fetch, I_S) {TagArrayRead, TagArrayWrite} {
|
||||
a_allocate;
|
||||
nS_issueRdBlkS;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
// simple hit transitions
|
||||
transition(S, Fetch) {TagArrayRead, DataArrayRead} {
|
||||
l_loadDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
// recycles from transients
|
||||
transition({I_S, S_I, I_C}, {Fetch, Repl}) {} {
|
||||
zz_recycleMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(S, Repl, S_I) {TagArrayRead} {
|
||||
t_allocateTBE;
|
||||
vc_victim;
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
// TCC event
|
||||
transition(I_S, TCC_AckS, S) {DataArrayRead, DataArrayWrite} {
|
||||
w_writeCache;
|
||||
xl_loadDone;
|
||||
uu_sendUnblock;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition(S_I, TCC_NackWB, I){TagArrayWrite} {
|
||||
d_deallocateTBE;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition(S_I, TCC_AckWB, I) {TagArrayWrite} {
|
||||
wb_data;
|
||||
d_deallocateTBE;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition(I_C, TCC_AckWB, I){TagArrayWrite} {
|
||||
ss_sendStaleNotification;
|
||||
d_deallocateTBE;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition(I_C, TCC_NackWB, I) {TagArrayWrite} {
|
||||
d_deallocateTBE;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
// Probe transitions
|
||||
transition({S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
|
||||
pd_sendProbeResponseData;
|
||||
ic_invCache;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(I_C, PrbInvData, I_C) {
|
||||
pi_sendProbeResponseInv;
|
||||
ic_invCache;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition({S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
|
||||
pi_sendProbeResponseInv;
|
||||
ic_invCache;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition({S}, PrbShrData, S) {DataArrayRead} {
|
||||
pd_sendProbeResponseData;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition({I, I_C}, PrbShrData) {TagArrayRead} {
|
||||
prm_sendProbeResponseMiss;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(I_C, PrbInv, I_C){
|
||||
pi_sendProbeResponseInv;
|
||||
ic_invCache;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(I_S, {PrbInv, PrbInvData}) {} {
|
||||
pi_sendProbeResponseInv;
|
||||
ic_invCache;
|
||||
a_allocate; // but make sure there is room for incoming data when it arrives
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(I_S, PrbShrData) {} {
|
||||
prm_sendProbeResponseMiss;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(S_I, PrbInvData, I_C) {TagArrayWrite} {
|
||||
pi_sendProbeResponseInv;
|
||||
ic_invCache;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(S_I, PrbInv, I_C) {TagArrayWrite} {
|
||||
pi_sendProbeResponseInv;
|
||||
ic_invCache;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(S_I, PrbShrData) {DataArrayRead} {
|
||||
pd_sendProbeResponseData;
|
||||
sf_setSharedFlip;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
protocol "GPU_AMD_Base";
|
||||
include "RubySlicc_interfaces.slicc";
|
||||
include "MOESI_AMD_Base-msg.sm";
|
||||
include "MOESI_AMD_Base-dir.sm";
|
||||
include "MOESI_AMD_Base-CorePair.sm";
|
||||
include "GPU_RfO-TCP.sm";
|
||||
include "GPU_RfO-SQC.sm";
|
||||
include "GPU_RfO-TCC.sm";
|
||||
include "GPU_RfO-TCCdir.sm";
|
||||
include "MOESI_AMD_Base-L3cache.sm";
|
||||
include "MOESI_AMD_Base-RegionBuffer.sm";
|
||||
@@ -1,9 +0,0 @@
|
||||
protocol "GPU_VIPER";
|
||||
include "RubySlicc_interfaces.slicc";
|
||||
include "MOESI_AMD_Base-msg.sm";
|
||||
include "MOESI_AMD_Base-probeFilter.sm";
|
||||
include "MOESI_AMD_Base-CorePair.sm";
|
||||
include "GPU_VIPER-TCP.sm";
|
||||
include "GPU_VIPER-SQC.sm";
|
||||
include "GPU_VIPER-TCC.sm";
|
||||
include "MOESI_AMD_Base-L3cache.sm";
|
||||
@@ -1,774 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Sooraj Puthoor, Blake Hechtman
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is inherited from GPU_VIPER-TCC.sm and retains its structure.
|
||||
* There are very few modifications in this file from the original VIPER TCC
|
||||
*/
|
||||
|
||||
machine(MachineType:TCC, "TCC Cache")
|
||||
: CacheMemory * L2cache;
|
||||
bool WB; /*is this cache Writeback?*/
|
||||
int regionBufferNum;
|
||||
Cycles l2_request_latency := 50;
|
||||
Cycles l2_response_latency := 20;
|
||||
|
||||
// From the TCPs or SQCs
|
||||
MessageBuffer * requestFromTCP, network="From", virtual_network="1", ordered="true", vnet_type="request";
|
||||
// To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
|
||||
MessageBuffer * responseToCore, network="To", virtual_network="3", ordered="true", vnet_type="response";
|
||||
// From the NB
|
||||
MessageBuffer * probeFromNB, network="From", virtual_network="0", ordered="false", vnet_type="request";
|
||||
MessageBuffer * responseFromNB, network="From", virtual_network="2", ordered="false", vnet_type="response";
|
||||
// To the NB
|
||||
MessageBuffer * requestToNB, network="To", virtual_network="0", ordered="false", vnet_type="request";
|
||||
MessageBuffer * responseToNB, network="To", virtual_network="2", ordered="false", vnet_type="response";
|
||||
MessageBuffer * unblockToNB, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
|
||||
|
||||
MessageBuffer * triggerQueue, ordered="true", random="false";
|
||||
{
|
||||
// EVENTS
|
||||
enumeration(Event, desc="TCC Events") {
|
||||
// Requests coming from the Cores
|
||||
RdBlk, desc="RdBlk event";
|
||||
WrVicBlk, desc="L1 Write Through";
|
||||
WrVicBlkBack, desc="L1 Write Back(dirty cache)";
|
||||
Atomic, desc="Atomic Op";
|
||||
AtomicDone, desc="AtomicOps Complete";
|
||||
AtomicNotDone, desc="AtomicOps not Complete";
|
||||
Data, desc="data messgae";
|
||||
// Coming from this TCC
|
||||
L2_Repl, desc="L2 Replacement";
|
||||
// Probes
|
||||
PrbInv, desc="Invalidating probe";
|
||||
// Coming from Memory Controller
|
||||
WBAck, desc="writethrough ack from memory";
|
||||
}
|
||||
|
||||
// STATES
|
||||
state_declaration(State, desc="TCC State", default="TCC_State_I") {
|
||||
M, AccessPermission:Read_Write, desc="Modified(dirty cache only)";
|
||||
W, AccessPermission:Read_Write, desc="Written(dirty cache only)";
|
||||
V, AccessPermission:Read_Only, desc="Valid";
|
||||
I, AccessPermission:Invalid, desc="Invalid";
|
||||
IV, AccessPermission:Busy, desc="Waiting for Data";
|
||||
WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
|
||||
A, AccessPermission:Busy, desc="Invalid waiting on atomic Data";
|
||||
}
|
||||
|
||||
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
|
||||
DataArrayRead, desc="Read the data array";
|
||||
DataArrayWrite, desc="Write the data array";
|
||||
TagArrayRead, desc="Read the data array";
|
||||
TagArrayWrite, desc="Write the data array";
|
||||
}
|
||||
|
||||
|
||||
// STRUCTURES
|
||||
|
||||
structure(Entry, desc="...", interface="AbstractCacheEntry") {
|
||||
State CacheState, desc="cache state";
|
||||
bool Dirty, desc="Is the data dirty (diff from memory?)";
|
||||
DataBlock DataBlk, desc="Data for the block";
|
||||
WriteMask writeMask, desc="Dirty byte mask";
|
||||
}
|
||||
|
||||
structure(TBE, desc="...") {
|
||||
State TBEState, desc="Transient state";
|
||||
DataBlock DataBlk, desc="data for the block";
|
||||
bool Dirty, desc="Is the data dirty?";
|
||||
bool Shared, desc="Victim hit by shared probe";
|
||||
MachineID From, desc="Waiting for writeback from...";
|
||||
NetDest Destination, desc="Data destination";
|
||||
int numAtomics, desc="number remaining atomics";
|
||||
}
|
||||
|
||||
structure(TBETable, external="yes") {
|
||||
TBE lookup(Addr);
|
||||
void allocate(Addr);
|
||||
void deallocate(Addr);
|
||||
bool isPresent(Addr);
|
||||
}
|
||||
|
||||
TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
|
||||
|
||||
void set_cache_entry(AbstractCacheEntry b);
|
||||
void unset_cache_entry();
|
||||
void set_tbe(TBE b);
|
||||
void unset_tbe();
|
||||
void wakeUpAllBuffers();
|
||||
void wakeUpBuffers(Addr a);
|
||||
|
||||
MachineID mapAddressToMachine(Addr addr, MachineType mtype);
|
||||
|
||||
// FUNCTION DEFINITIONS
|
||||
|
||||
Tick clockEdge();
|
||||
Tick cyclesToTicks(Cycles c);
|
||||
|
||||
MachineID getPeer(MachineID mach) {
|
||||
return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum));
|
||||
}
|
||||
|
||||
Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
|
||||
return static_cast(Entry, "pointer", L2cache.lookup(addr));
|
||||
}
|
||||
|
||||
DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
|
||||
return getCacheEntry(addr).DataBlk;
|
||||
}
|
||||
|
||||
bool presentOrAvail(Addr addr) {
|
||||
return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
|
||||
}
|
||||
|
||||
State getState(TBE tbe, Entry cache_entry, Addr addr) {
|
||||
if (is_valid(tbe)) {
|
||||
return tbe.TBEState;
|
||||
} else if (is_valid(cache_entry)) {
|
||||
return cache_entry.CacheState;
|
||||
}
|
||||
return State:I;
|
||||
}
|
||||
|
||||
void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
|
||||
if (is_valid(tbe)) {
|
||||
tbe.TBEState := state;
|
||||
}
|
||||
|
||||
if (is_valid(cache_entry)) {
|
||||
cache_entry.CacheState := state;
|
||||
}
|
||||
}
|
||||
|
||||
void functionalRead(Addr addr, Packet *pkt) {
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
testAndRead(addr, tbe.DataBlk, pkt);
|
||||
} else {
|
||||
functionalMemoryRead(pkt);
|
||||
}
|
||||
}
|
||||
|
||||
int functionalWrite(Addr addr, Packet *pkt) {
|
||||
int num_functional_writes := 0;
|
||||
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
num_functional_writes := num_functional_writes +
|
||||
testAndWrite(addr, tbe.DataBlk, pkt);
|
||||
}
|
||||
|
||||
num_functional_writes := num_functional_writes +
|
||||
functionalMemoryWrite(pkt);
|
||||
return num_functional_writes;
|
||||
}
|
||||
|
||||
AccessPermission getAccessPermission(Addr addr) {
|
||||
TBE tbe := TBEs.lookup(addr);
|
||||
if(is_valid(tbe)) {
|
||||
return TCC_State_to_permission(tbe.TBEState);
|
||||
}
|
||||
|
||||
Entry cache_entry := getCacheEntry(addr);
|
||||
if(is_valid(cache_entry)) {
|
||||
return TCC_State_to_permission(cache_entry.CacheState);
|
||||
}
|
||||
|
||||
return AccessPermission:NotPresent;
|
||||
}
|
||||
|
||||
void setAccessPermission(Entry cache_entry, Addr addr, State state) {
|
||||
if (is_valid(cache_entry)) {
|
||||
cache_entry.changePermission(TCC_State_to_permission(state));
|
||||
}
|
||||
}
|
||||
|
||||
void recordRequestType(RequestType request_type, Addr addr) {
|
||||
if (request_type == RequestType:DataArrayRead) {
|
||||
L2cache.recordRequestType(CacheRequestType:DataArrayRead,addr);
|
||||
} else if (request_type == RequestType:DataArrayWrite) {
|
||||
L2cache.recordRequestType(CacheRequestType:DataArrayWrite,addr);
|
||||
} else if (request_type == RequestType:TagArrayRead) {
|
||||
L2cache.recordRequestType(CacheRequestType:TagArrayRead,addr);
|
||||
} else if (request_type == RequestType:TagArrayWrite) {
|
||||
L2cache.recordRequestType(CacheRequestType:TagArrayWrite,addr);
|
||||
}
|
||||
}
|
||||
|
||||
bool checkResourceAvailable(RequestType request_type, Addr addr) {
|
||||
if (request_type == RequestType:DataArrayRead) {
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
|
||||
} else if (request_type == RequestType:DataArrayWrite) {
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
|
||||
} else if (request_type == RequestType:TagArrayRead) {
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
|
||||
} else if (request_type == RequestType:TagArrayWrite) {
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
|
||||
} else {
|
||||
error("Invalid RequestType type in checkResourceAvailable");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ** OUT_PORTS **
|
||||
|
||||
// Three classes of ports
|
||||
// Class 1: downward facing network links to NB
|
||||
out_port(requestToNB_out, CPURequestMsg, requestToNB);
|
||||
out_port(responseToNB_out, ResponseMsg, responseToNB);
|
||||
out_port(unblockToNB_out, UnblockMsg, unblockToNB);
|
||||
|
||||
// Class 2: upward facing ports to GPU cores
|
||||
out_port(responseToCore_out, ResponseMsg, responseToCore);
|
||||
|
||||
out_port(triggerQueue_out, TriggerMsg, triggerQueue);
|
||||
//
|
||||
// request queue going to NB
|
||||
//
|
||||
|
||||
|
||||
// ** IN_PORTS **
|
||||
in_port(triggerQueue_in, TiggerMsg, triggerQueue) {
|
||||
if (triggerQueue_in.isReady(clockEdge())) {
|
||||
peek(triggerQueue_in, TriggerMsg) {
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
if (tbe.numAtomics == 0) {
|
||||
trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
in_port(responseFromNB_in, ResponseMsg, responseFromNB) {
|
||||
if (responseFromNB_in.isReady(clockEdge())) {
|
||||
peek(responseFromNB_in, ResponseMsg, block_on="addr") {
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
if (in_msg.Type == CoherenceResponseType:NBSysResp) {
|
||||
if(presentOrAvail(in_msg.addr)) {
|
||||
trigger(Event:Data, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
Addr victim := L2cache.cacheProbe(in_msg.addr);
|
||||
trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
|
||||
trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
error("Unexpected Response Message to Core");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Finally handling incoming requests (from TCP) and probes (from NB).
|
||||
|
||||
in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) {
|
||||
if (probeNetwork_in.isReady(clockEdge())) {
|
||||
peek(probeNetwork_in, NBProbeRequestMsg) {
|
||||
DPRINTF(RubySlicc, "%s\n", in_msg);
|
||||
DPRINTF(RubySlicc, "machineID: %s\n", machineID);
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
|
||||
if (coreRequestNetwork_in.isReady(clockEdge())) {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
if (in_msg.Type == CoherenceRequestType:WriteThrough) {
|
||||
if(WB) {
|
||||
if(presentOrAvail(in_msg.addr)) {
|
||||
trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
Addr victim := L2cache.cacheProbe(in_msg.addr);
|
||||
trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
|
||||
}
|
||||
} else {
|
||||
trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceRequestType:Atomic) {
|
||||
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
|
||||
} else if (in_msg.Type == CoherenceRequestType:RdBlk) {
|
||||
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
DPRINTF(RubySlicc, "%s\n", in_msg);
|
||||
error("Unexpected Response Message to Core");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// BEGIN ACTIONS
|
||||
|
||||
action(i_invL2, "i", desc="invalidate TCC cache block") {
|
||||
if (is_valid(cache_entry)) {
|
||||
L2cache.deallocate(address);
|
||||
}
|
||||
unset_cache_entry();
|
||||
}
|
||||
|
||||
// Data available at TCC. Send the DATA to TCP
|
||||
action(sd_sendData, "sd", desc="send Shared response") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysResp;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination.add(in_msg.Requestor);
|
||||
out_msg.DataBlk := cache_entry.DataBlk;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Data;
|
||||
out_msg.Dirty := false;
|
||||
out_msg.State := CoherenceState:Shared;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Data was not available at TCC. So, TCC forwarded the request to
|
||||
// directory and directory responded back with data. Now, forward the
|
||||
// DATA to TCP and send the unblock ack back to directory.
|
||||
action(sdr_sendDataResponse, "sdr", desc="send Shared response") {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysResp;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination := tbe.Destination;
|
||||
out_msg.DataBlk := cache_entry.DataBlk;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Data;
|
||||
out_msg.Dirty := false;
|
||||
out_msg.State := CoherenceState:Shared;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
enqueue(unblockToNB_out, UnblockMsg, 1) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
|
||||
out_msg.MessageSize := MessageSizeType:Unblock_Control;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
action(rd_requestData, "r", desc="Miss in L2, pass on") {
|
||||
if(tbe.Destination.count()==1){
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := in_msg.Type;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.Destination.add(getPeer(machineID));
|
||||
out_msg.Shared := false; // unneeded for this request
|
||||
out_msg.MessageSize := in_msg.MessageSize;
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(w_sendResponseWBAck, "w", desc="send WB Ack") {
|
||||
peek(responseFromNB_in, ResponseMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysWBAck;
|
||||
out_msg.Destination.clear();
|
||||
out_msg.Destination.add(in_msg.WTRequestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(swb_sendWBAck, "swb", desc="send WB Ack") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysWBAck;
|
||||
out_msg.Destination.clear();
|
||||
out_msg.Destination.add(in_msg.Requestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
|
||||
peek(responseFromNB_in, ResponseMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysResp;
|
||||
out_msg.Destination.add(in_msg.WTRequestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := in_msg.MessageSize;
|
||||
out_msg.DataBlk := in_msg.DataBlk;
|
||||
}
|
||||
}
|
||||
}
|
||||
action(sd2rb_sendDone2RegionBuffer, "sd2rb", desc="Request finished, send done ack") {
|
||||
enqueue(unblockToNB_out, UnblockMsg, 1) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Destination.add(getPeer(machineID));
|
||||
out_msg.DoneAck := true;
|
||||
out_msg.MessageSize := MessageSizeType:Unblock_Control;
|
||||
if (is_valid(tbe)) {
|
||||
out_msg.Dirty := tbe.Dirty;
|
||||
} else {
|
||||
out_msg.Dirty := false;
|
||||
}
|
||||
DPRINTF(RubySlicc, "%s\n", out_msg);
|
||||
}
|
||||
}
|
||||
|
||||
action(a_allocateBlock, "a", desc="allocate TCC block") {
|
||||
if (is_invalid(cache_entry)) {
|
||||
set_cache_entry(L2cache.allocate(address, new Entry));
|
||||
cache_entry.writeMask.clear();
|
||||
}
|
||||
}
|
||||
|
||||
action(t_allocateTBE, "t", desc="allocate TBE Entry") {
|
||||
if (is_invalid(tbe)) {
|
||||
check_allocate(TBEs);
|
||||
TBEs.allocate(address);
|
||||
set_tbe(TBEs.lookup(address));
|
||||
tbe.Destination.clear();
|
||||
tbe.numAtomics := 0;
|
||||
}
|
||||
if (coreRequestNetwork_in.isReady(clockEdge())) {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
|
||||
tbe.Destination.add(in_msg.Requestor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
|
||||
tbe.Destination.clear();
|
||||
TBEs.deallocate(address);
|
||||
unset_tbe();
|
||||
}
|
||||
|
||||
action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") {
|
||||
peek(responseFromNB_in, ResponseMsg) {
|
||||
cache_entry.DataBlk := in_msg.DataBlk;
|
||||
DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
|
||||
}
|
||||
}
|
||||
|
||||
action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
|
||||
cache_entry.writeMask.orMask(in_msg.writeMask);
|
||||
DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
|
||||
}
|
||||
}
|
||||
|
||||
action(wt_writeThrough, "wt", desc="write through data") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.WTRequestor := in_msg.Requestor;
|
||||
out_msg.Destination.add(getPeer(machineID));
|
||||
out_msg.MessageSize := MessageSizeType:Data;
|
||||
out_msg.Type := CoherenceRequestType:WriteThrough;
|
||||
out_msg.Dirty := true;
|
||||
out_msg.DataBlk := in_msg.DataBlk;
|
||||
out_msg.writeMask.orMask(in_msg.writeMask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(wb_writeBack, "wb", desc="write back data") {
|
||||
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.WTRequestor := machineID;
|
||||
out_msg.Destination.add(getPeer(machineID));
|
||||
out_msg.MessageSize := MessageSizeType:Data;
|
||||
out_msg.Type := CoherenceRequestType:WriteThrough;
|
||||
out_msg.Dirty := true;
|
||||
out_msg.DataBlk := cache_entry.DataBlk;
|
||||
out_msg.writeMask.orMask(cache_entry.writeMask);
|
||||
}
|
||||
}
|
||||
|
||||
action(at_atomicThrough, "at", desc="write back data") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.WTRequestor := in_msg.Requestor;
|
||||
out_msg.Destination.add(getPeer(machineID));
|
||||
out_msg.MessageSize := MessageSizeType:Data;
|
||||
out_msg.Type := CoherenceRequestType:Atomic;
|
||||
out_msg.Dirty := true;
|
||||
out_msg.writeMask.orMask(in_msg.writeMask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
|
||||
enqueue(responseToNB_out, ResponseMsg, 1) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
|
||||
out_msg.Dirty := false;
|
||||
out_msg.Hit := false;
|
||||
out_msg.Ntsl := true;
|
||||
out_msg.State := CoherenceState:NA;
|
||||
out_msg.MessageSize := MessageSizeType:Response_Control;
|
||||
}
|
||||
}
|
||||
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
|
||||
L2cache.setMRU(address);
|
||||
}
|
||||
|
||||
action(p_popRequestQueue, "p", desc="pop request queue") {
|
||||
coreRequestNetwork_in.dequeue(clockEdge());
|
||||
}
|
||||
|
||||
action(pr_popResponseQueue, "pr", desc="pop response queue") {
|
||||
responseFromNB_in.dequeue(clockEdge());
|
||||
}
|
||||
|
||||
action(pp_popProbeQueue, "pp", desc="pop probe queue") {
|
||||
probeNetwork_in.dequeue(clockEdge());
|
||||
}
|
||||
action(zz_recycleRequestQueue, "z", desc="stall"){
|
||||
coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
|
||||
}
|
||||
|
||||
|
||||
action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
|
||||
tbe.numAtomics := tbe.numAtomics + 1;
|
||||
}
|
||||
|
||||
|
||||
action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
|
||||
tbe.numAtomics := tbe.numAtomics - 1;
|
||||
if (tbe.numAtomics==0) {
|
||||
enqueue(triggerQueue_out, TriggerMsg, 1) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := TriggerType:AtomicDone;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
|
||||
triggerQueue_in.dequeue(clockEdge());
|
||||
}
|
||||
|
||||
// END ACTIONS
|
||||
|
||||
// BEGIN TRANSITIONS
|
||||
// transitions from base
|
||||
// Assumptions for ArrayRead/Write
|
||||
// TBE checked before tags
|
||||
// Data Read/Write requires Tag Read
|
||||
|
||||
transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} {
|
||||
zz_recycleRequestQueue;
|
||||
}
|
||||
transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) {TagArrayRead} {
|
||||
zz_recycleRequestQueue;
|
||||
}
|
||||
transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} {
|
||||
zz_recycleRequestQueue;
|
||||
}
|
||||
transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} {
|
||||
sd_sendData;
|
||||
ut_updateTag;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} {
|
||||
t_allocateTBE;
|
||||
wb_writeBack;
|
||||
}
|
||||
|
||||
transition(I, RdBlk, IV) {TagArrayRead} {
|
||||
t_allocateTBE;
|
||||
rd_requestData;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition(IV, RdBlk) {
|
||||
t_allocateTBE;
|
||||
rd_requestData;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition({V, I},Atomic, A) {TagArrayRead} {
|
||||
i_invL2;
|
||||
t_allocateTBE;
|
||||
at_atomicThrough;
|
||||
ina_incrementNumAtomics;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition(A, Atomic) {
|
||||
at_atomicThrough;
|
||||
ina_incrementNumAtomics;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition({M, W}, Atomic, WI) {TagArrayRead} {
|
||||
t_allocateTBE;
|
||||
wb_writeBack;
|
||||
}
|
||||
|
||||
// Cahceblock stays in I state which implies
|
||||
// this TCC is a write-no-allocate cache
|
||||
transition(I, WrVicBlk) {TagArrayRead} {
|
||||
wt_writeThrough;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
|
||||
ut_updateTag;
|
||||
wdb_writeDirtyBytes;
|
||||
wt_writeThrough;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
ut_updateTag;
|
||||
swb_sendWBAck;
|
||||
wdb_writeDirtyBytes;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
ut_updateTag;
|
||||
swb_sendWBAck;
|
||||
wdb_writeDirtyBytes;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
a_allocateBlock;
|
||||
ut_updateTag;
|
||||
swb_sendWBAck;
|
||||
wdb_writeDirtyBytes;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
|
||||
t_allocateTBE;
|
||||
wb_writeBack;
|
||||
i_invL2;
|
||||
}
|
||||
|
||||
transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
|
||||
i_invL2;
|
||||
}
|
||||
|
||||
transition({A, IV, WI}, L2_Repl) {
|
||||
i_invL2;
|
||||
}
|
||||
|
||||
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
|
||||
pi_sendProbeResponseInv;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
|
||||
pi_sendProbeResponseInv;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(W, PrbInv) {TagArrayRead} {
|
||||
pi_sendProbeResponseInv;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition({A, IV, WI}, PrbInv) {
|
||||
pi_sendProbeResponseInv;
|
||||
pp_popProbeQueue;
|
||||
}
|
||||
|
||||
transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
a_allocateBlock;
|
||||
ut_updateTag;
|
||||
wcb_writeCacheBlock;
|
||||
sdr_sendDataResponse;
|
||||
sd2rb_sendDone2RegionBuffer;
|
||||
pr_popResponseQueue;
|
||||
dt_deallocateTBE;
|
||||
}
|
||||
|
||||
transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
a_allocateBlock;
|
||||
ar_sendAtomicResponse;
|
||||
sd2rb_sendDone2RegionBuffer;
|
||||
dna_decrementNumAtomics;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
|
||||
dt_deallocateTBE;
|
||||
ptr_popTriggerQueue;
|
||||
}
|
||||
|
||||
transition(A, AtomicNotDone) {TagArrayRead} {
|
||||
ptr_popTriggerQueue;
|
||||
}
|
||||
|
||||
//M,W should not see WBAck as the cache is in WB mode
|
||||
//WBAcks do not need to check tags
|
||||
transition({I, V, IV, A}, WBAck) {
|
||||
w_sendResponseWBAck;
|
||||
sd2rb_sendDone2RegionBuffer;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
transition(WI, WBAck,I) {
|
||||
sd2rb_sendDone2RegionBuffer;
|
||||
dt_deallocateTBE;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
protocol "GPU_VIPER_Region";
|
||||
include "RubySlicc_interfaces.slicc";
|
||||
include "MOESI_AMD_Base-msg.sm";
|
||||
include "MOESI_AMD_Base-Region-CorePair.sm";
|
||||
include "MOESI_AMD_Base-L3cache.sm";
|
||||
include "MOESI_AMD_Base-Region-dir.sm";
|
||||
include "GPU_VIPER_Region-TCC.sm";
|
||||
include "GPU_VIPER-TCP.sm";
|
||||
include "GPU_VIPER-SQC.sm";
|
||||
include "MOESI_AMD_Base-RegionDir.sm";
|
||||
include "MOESI_AMD_Base-RegionBuffer.sm";
|
||||
Reference in New Issue
Block a user