# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.defines import buildEnv from m5.params import * from m5.proxy import * from m5.SimObject import SimObject from m5.objects.Bridge import Bridge from m5.objects.ClockedObject import ClockedObject from m5.objects.Device import DmaVirtDevice from m5.objects.LdsState import LdsState from m5.objects.Process import EmulatedDriver from m5.objects.VegaGPUTLB import VegaPagetableWalker class PrefetchType(Enum): vals = ["PF_CU", "PF_PHASE", "PF_WF", "PF_STRIDE", "PF_END"] class GfxVersion(ScopedEnum): vals = ["gfx801", "gfx803", "gfx900", "gfx902"] class PoolManager(SimObject): type = "PoolManager" abstract = True cxx_class = "gem5::PoolManager" cxx_header = "gpu-compute/pool_manager.hh" min_alloc = Param.Int(4, "min number of VGPRs allocated per WF") pool_size = Param.Int(2048, "number of vector registers per SIMD") # The simple pool manage only allows one workgroup to # be executing on a CU at any given time. class SimplePoolManager(PoolManager): type = "SimplePoolManager" cxx_class = "gem5::SimplePoolManager" cxx_header = "gpu-compute/simple_pool_manager.hh" ## This is for allowing multiple workgroups on one CU class DynPoolManager(PoolManager): type = "DynPoolManager" cxx_class = "gem5::DynPoolManager" cxx_header = "gpu-compute/dyn_pool_manager.hh" class RegisterFile(SimObject): type = "RegisterFile" cxx_class = "gem5::RegisterFile" cxx_header = "gpu-compute/register_file.hh" simd_id = Param.Int(-1, "SIMD ID associated with this Register File") num_regs = Param.Int(2048, "number of registers in this RF") wf_size = Param.Int(64, "Wavefront size (in work items)") class ScalarRegisterFile(RegisterFile): type = "ScalarRegisterFile" cxx_class = "gem5::ScalarRegisterFile" cxx_header = "gpu-compute/scalar_register_file.hh" class VectorRegisterFile(RegisterFile): type = "VectorRegisterFile" cxx_class = "gem5::VectorRegisterFile" cxx_header = "gpu-compute/vector_register_file.hh" class RegisterManager(SimObject): type = "RegisterManager" cxx_class = "gem5::RegisterManager" 
cxx_header = "gpu-compute/register_manager.hh" policy = Param.String("static", "Register Manager Policy") vrf_pool_managers = VectorParam.PoolManager("VRF Pool Managers") srf_pool_managers = VectorParam.PoolManager("SRF Pool Managers") class Wavefront(SimObject): type = "Wavefront" cxx_class = "gem5::Wavefront" cxx_header = "gpu-compute/wavefront.hh" simdId = Param.Int("SIMD id (0-ComputeUnit.num_SIMDs)") wf_slot_id = Param.Int("wavefront id (0-ComputeUnit.max_wfs)") wf_size = Param.Int(64, "Wavefront size (in work items)") max_ib_size = Param.Int( 13, "Maximum size (in number of insts) of the " "instruction buffer (IB).", ) # Most of the default values here are obtained from the # AMD Graphics Core Next (GCN) Architecture whitepaper. class ComputeUnit(ClockedObject): type = "ComputeUnit" cxx_class = "gem5::ComputeUnit" cxx_header = "gpu-compute/compute_unit.hh" wavefronts = VectorParam.Wavefront("Number of wavefronts") # Wavefront size is 64. This is configurable, however changing # this value to anything other than 64 will likely cause errors. 
wf_size = Param.Int(64, "Wavefront size (in work items)") num_barrier_slots = Param.Int(4, "Number of barrier slots in a CU") num_SIMDs = Param.Int(4, "number of SIMD units per CU") num_scalar_cores = Param.Int(1, "number of Scalar cores per CU") num_scalar_mem_pipes = Param.Int( 1, "number of Scalar memory pipelines " "per CU" ) simd_width = Param.Int(16, "width (number of lanes) per SIMD unit") operand_network_length = Param.Int( 1, "number of pipe stages of operand " "network" ) spbypass_pipe_length = Param.Int( 4, "vector ALU Single Precision bypass " "latency" ) dpbypass_pipe_length = Param.Int( 4, "vector ALU Double Precision bypass " "latency" ) scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU") issue_period = Param.Int(4, "number of cycles per issue period") vrf_gm_bus_latency = Param.Int( 1, "number of cycles per use of VRF to " "GM bus" ) srf_scm_bus_latency = Param.Int( 1, "number of cycles per use of SRF " "to Scalar Mem bus" ) vrf_lm_bus_latency = Param.Int( 1, "number of cycles per use of VRF to " "LM bus" ) num_global_mem_pipes = Param.Int(1, "number of global memory pipes per CU") num_shared_mem_pipes = Param.Int(1, "number of shared memory pipes per CU") n_wf = Param.Int(10, "Number of wavefront slots per SIMD") mem_req_latency = Param.Int( 50, "Latency for request from the cu to ruby. " "Represents the pipeline to reach the TCP " "and specified in GPU clock cycles", ) mem_resp_latency = Param.Int( 50, "Latency for responses from ruby to the " "cu. Represents the pipeline between the " "TCP and cu as well as TCP data array " "access. 
Specified in GPU clock cycles", ) system = Param.System(Parent.any, "system object") cu_id = Param.Int("CU id") vrf_to_coalescer_bus_width = Param.Int( 64, "VRF->Coalescer data bus " "width in bytes" ) coalescer_to_vrf_bus_width = Param.Int( 64, "Coalescer->VRF data bus " "width in bytes" ) memory_port = VectorRequestPort("Port to the memory system") translation_port = VectorRequestPort("Port to the TLB hierarchy") sqc_port = RequestPort("Port to the SQC (I-cache") sqc_tlb_port = RequestPort("Port to the TLB for the SQC (I-cache)") scalar_port = RequestPort("Port to the scalar data cache") scalar_tlb_port = RequestPort("Port to the TLB for the scalar data cache") gmTokenPort = RequestPort("Port to the GPU coalesecer for sharing tokens") perLaneTLB = Param.Bool(False, "enable per-lane TLB") prefetch_depth = Param.Int( 0, "Number of prefetches triggered at a time" "(0 turns off prefetching)", ) prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") prefetch_prev_type = Param.PrefetchType( "PF_PHASE", "Prefetch the stride " "from last mem req in lane of " "CU|Phase|Wavefront", ) execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy") debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") functionalTLB = Param.Bool(False, "Assume TLB causes no delay") localMemBarrier = Param.Bool( False, "Assume Barriers do not wait on " "kernel end" ) countPages = Param.Bool( False, "Generate per-CU file of all pages " "touched and how many times", ) scalar_mem_queue_size = Param.Int( 32, "Number of entries in scalar " "memory pipeline's queues" ) global_mem_queue_size = Param.Int( 256, "Number of entries in the global " "memory pipeline's queues" ) local_mem_queue_size = Param.Int( 256, "Number of entries in the local " "memory pipeline's queues" ) max_wave_requests = Param.Int( 64, "number of pending vector memory " "requests per wavefront" ) max_cu_tokens = Param.Int( 4, "Maximum number of tokens, i.e., the number" " of 
instructions that can be sent to coalescer", ) ldsBus = Bridge() # the bridge between the CU and its LDS ldsPort = RequestPort("The port that goes to the LDS") localDataStore = Param.LdsState("the LDS for this CU") vector_register_file = VectorParam.VectorRegisterFile( "Vector register " "file" ) scalar_register_file = VectorParam.ScalarRegisterFile( "Scalar register " "file" ) out_of_order_data_delivery = Param.Bool( False, "enable OoO data delivery" " in the GM pipeline" ) register_manager = Param.RegisterManager("Register Manager") fetch_depth = Param.Int( 2, "number of i-cache lines that may be " "buffered in the fetch unit." ) class Shader(ClockedObject): type = "Shader" cxx_class = "gem5::Shader" cxx_header = "gpu-compute/shader.hh" CUs = VectorParam.ComputeUnit("Number of compute units") gpu_cmd_proc = Param.GPUCommandProcessor("Command processor for GPU") dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher") system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)") n_wf = Param.Int(10, "Number of wavefront slots per SIMD") impl_kern_launch_acq = Param.Bool( True, """Insert acq packet into ruby at kernel launch""", ) impl_kern_end_rel = Param.Bool( False, """Insert rel packet into ruby at kernel end""", ) globalmem = Param.MemorySize("64kB", "Memory size") timing = Param.Bool(False, "timing memory accesses") cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") translation = Param.Bool(False, "address translation") timer_period = Param.Clock("10us", "system timer period") idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold") max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting") class GPUComputeDriver(EmulatedDriver): type = "GPUComputeDriver" cxx_class = "gem5::GPUComputeDriver" cxx_header = "gpu-compute/gpu_compute_driver.hh" device = Param.GPUCommandProcessor("GPU controlled by this driver") isdGPU = Param.Bool(False, "Driver is for a dGPU") gfxVersion = Param.GfxVersion("gfx801", "ISA of gpu to model") 
dGPUPoolID = Param.Int(0, "Pool ID for dGPU.") # Default Mtype for caches # -- 1 1 1 C_RW_S (Cached-ReadWrite-Shared) # -- 1 1 0 C_RW_US (Cached-ReadWrite-Unshared) # -- 1 0 1 C_RO_S (Cached-ReadOnly-Shared) # -- 1 0 0 C_RO_US (Cached-ReadOnly-Unshared) # -- 0 1 x UC_L2 (Uncached_GL2) # -- 0 0 x UC_All (Uncached_All_Load) # default value: 5/C_RO_S (only allow caching in GL2 for read. Shared) m_type = Param.Int("Default MTYPE for cache. Valid values between 0-7") class GPURenderDriver(EmulatedDriver): type = "GPURenderDriver" cxx_class = "gem5::GPURenderDriver" cxx_header = "gpu-compute/gpu_render_driver.hh" class GPUDispatcher(SimObject): type = "GPUDispatcher" cxx_class = "gem5::GPUDispatcher" cxx_header = "gpu-compute/dispatcher.hh" class GPUCommandProcessor(DmaVirtDevice): type = "GPUCommandProcessor" cxx_class = "gem5::GPUCommandProcessor" cxx_header = "gpu-compute/gpu_command_processor.hh" dispatcher = Param.GPUDispatcher("workgroup dispatcher for the GPU") hsapp = Param.HSAPacketProcessor("PP attached to this device") walker = Param.VegaPagetableWalker( VegaPagetableWalker(), "Page table walker" ) class StorageClassType(Enum): vals = [ "SC_SPILL", "SC_GLOBAL", "SC_GROUP", "SC_PRIVATE", "SC_READONLY", "SC_KERNARG", "SC_ARG", "SC_NONE", ]