/* * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. * All rights reserved. * * For use for simulation and test purposes only * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Authors: Steve Reinhardt */ #ifndef __SHADER_HH__ #define __SHADER_HH__ #include #include #include "arch/isa.hh" #include "arch/isa_traits.hh" #include "base/types.hh" #include "cpu/simple/atomic.hh" #include "cpu/simple/timing.hh" #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_tlb.hh" #include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" #include "mem/page_table.hh" #include "mem/port.hh" #include "mem/request.hh" #include "params/Shader.hh" #include "sim/faults.hh" #include "sim/process.hh" #include "sim/sim_object.hh" class BaseTLB; class GPUCommandProcessor; class GPUDispatcher; namespace TheISA { class GpuTLB; } static const int LDS_SIZE = 65536; // aperture (APE) registers define the base/limit // pair for the ATC mapped memory space. currently // the only APEs we consider are for GPUVM/LDS/scratch. // the APEs are registered with unique values based // on a per-device basis struct ApertureRegister { Addr base; Addr limit; }; // Class Shader: This describes a single shader instance. Most // configurations will only have a single shader. class Shader : public ClockedObject { private: ApertureRegister _gpuVmApe; ApertureRegister _ldsApe; ApertureRegister _scratchApe; Addr shHiddenPrivateBaseVmid; // Number of active Cus attached to this shader int _activeCus; // Last tick that all CUs attached to this shader were inactive Tick _lastInactiveTick; // some stats for measuring latency Stats::Distribution allLatencyDist; Stats::Distribution loadLatencyDist; Stats::Distribution storeLatencyDist; // average ticks from vmem inst initiateAcc to coalescer issue, // average ticks from coalescer issue to coalescer hit callback, // average ticks from coalescer hit callback to GM pipe enqueue, // and average ticks spent in GM pipe's ordered resp buffer. Stats::Distribution initToCoalesceLatency; Stats::Distribution rubyNetworkLatency; Stats::Distribution gmEnqueueLatency; Stats::Distribution gmToCompleteLatency; // average number of cache blocks requested by vmem inst, and // average ticks for cache blocks to main memory for the Nth // cache block generated by a vmem inst. Stats::Distribution coalsrLineAddresses; Stats::Distribution *cacheBlockRoundTrip; public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; GPUDispatcher &dispatcher(); void sampleLoad(const Tick accessTime); void sampleStore(const Tick accessTime); void sampleInstRoundTrip(std::vector roundTripTime); void sampleLineRoundTrip(const std::map> &roundTripTime); SimpleThread *cpuThread; ThreadContext *gpuTc; BaseCPU *cpuPointer; const ApertureRegister& gpuVmApe() const { return _gpuVmApe; } const ApertureRegister& ldsApe() const { return _ldsApe; } const ApertureRegister& scratchApe() const { return _scratchApe; } bool isGpuVmApe(Addr addr) const { bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit; return is_gpu_vm; } bool isLdsApe(Addr addr) const { bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit; return is_lds; } bool isScratchApe(Addr addr) const { bool is_scratch = addr >= _scratchApe.base && addr <= _scratchApe.limit; return is_scratch; } Addr getScratchBase() { return _scratchApe.base; } Addr getHiddenPrivateBase() { return shHiddenPrivateBaseVmid; } void initShHiddenPrivateBase(Addr queueBase, uint32_t offset) { Addr sh_hidden_base_new = queueBase - offset; // We are initializing sh_hidden_private_base_vmid from the // amd queue descriptor from the first queue. // The sh_hidden_private_base_vmid is supposed to be same for // all the queues from the same process if (shHiddenPrivateBaseVmid != sh_hidden_base_new) { // Do not panic if shHiddenPrivateBaseVmid == 0, // that is if it is uninitialized. Panic only // if the value is initilized and we get // a differnt base later. panic_if(shHiddenPrivateBaseVmid != 0, "Currently we support only single process\n"); } shHiddenPrivateBaseVmid = sh_hidden_base_new; } EventFunctionWrapper tickEvent; // is this simulation going to be timing mode in the memory? bool timingSim; hsail_mode_e hsail_mode; // If set, issue acq packet @ kernel launch int impl_kern_launch_acq; // If set, issue rel packet @ kernel end int impl_kern_end_rel; // If set, fetch returns may be coissued with instructions int coissue_return; // If set, always dump all 64 gprs to trace int trace_vgpr_all; // Number of cu units in the shader int n_cu; // Number of wavefront slots per SIMD per CU int n_wf; // The size of global memory int globalMemSize; // Tracks CU that rr dispatcher should attempt scheduling int nextSchedCu; // Size of scheduled add queue uint32_t sa_n; // Pointer to value to be increments std::vector sa_val; // When to do the increment std::vector sa_when; // Amount to increment by std::vector sa_x; // List of Compute Units (CU's) std::vector cuList; GPUCommandProcessor &gpuCmdProc; GPUDispatcher &_dispatcher; /** * Statistics */ Stats::Scalar shaderActiveTicks; Stats::Vector vectorInstSrcOperand; Stats::Vector vectorInstDstOperand; void regStats(); int max_valu_insts; int total_valu_insts; Shader(const Params *p); ~Shader(); virtual void init(); // Run shader scheduled adds void execScheduledAdds(); // Schedule a 32-bit value to be incremented some time in the future void ScheduleAdd(int *val, Tick when, int x); bool processTimingPacket(PacketPtr pkt); void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors); void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, bool suppress_func_errors); void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, bool suppress_func_errors); void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id); void registerCU(int cu_id, ComputeUnit *compute_unit) { cuList[cu_id] = compute_unit; } void prepareInvalidate(HSAQueueEntry *task); void prepareFlush(GPUDynInstPtr gpuDynInst); bool dispatchWorkgroups(HSAQueueEntry *task); Addr mmap(int length); void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); void updateContext(int cid); void notifyCuSleep(); }; #endif // __SHADER_HH__