Change-Id: Ib4415a7c5918da03bbd16fe9adb4dd593dcaa95c Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29929 Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com> Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
308 lines
8.9 KiB
C++
308 lines
8.9 KiB
C++
/*
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* Authors: Steve Reinhardt
|
|
*/
|
|
|
|
#ifndef __SHADER_HH__
|
|
#define __SHADER_HH__
|
|
|
|
#include <functional>
|
|
#include <string>
|
|
|
|
#include "arch/isa.hh"
|
|
#include "arch/isa_traits.hh"
|
|
#include "base/types.hh"
|
|
#include "cpu/simple/atomic.hh"
|
|
#include "cpu/simple/timing.hh"
|
|
#include "cpu/simple_thread.hh"
|
|
#include "cpu/thread_context.hh"
|
|
#include "cpu/thread_state.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/gpu_tlb.hh"
|
|
#include "gpu-compute/hsa_queue_entry.hh"
|
|
#include "gpu-compute/lds_state.hh"
|
|
#include "mem/page_table.hh"
|
|
#include "mem/port.hh"
|
|
#include "mem/request.hh"
|
|
#include "params/Shader.hh"
|
|
#include "sim/faults.hh"
|
|
#include "sim/process.hh"
|
|
#include "sim/sim_object.hh"
|
|
|
|
// Forward declarations.
class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;

namespace TheISA
{
    class GpuTLB;
} // namespace TheISA
|
|
|
|
// LDS size constant: 64 Ki (65536). NOTE(review): presumably a size in
// bytes -- confirm against the LDS model before relying on the unit.
static const int LDS_SIZE = 64 * 1024;
|
|
|
|
// aperture (APE) registers define the base/limit
|
|
// pair for the ATC mapped memory space. currently
|
|
// the only APEs we consider are for GPUVM/LDS/scratch.
|
|
// the APEs are registered with unique values based
|
|
// on a per-device basis
|
|
struct ApertureRegister
|
|
{
|
|
Addr base;
|
|
Addr limit;
|
|
};
|
|
|
|
// Class Shader: This describes a single shader instance. Most
|
|
// configurations will only have a single shader.
|
|
|
|
class Shader : public ClockedObject
|
|
{
|
|
private:
|
|
ApertureRegister _gpuVmApe;
|
|
ApertureRegister _ldsApe;
|
|
ApertureRegister _scratchApe;
|
|
Addr shHiddenPrivateBaseVmid;
|
|
|
|
// Number of active Cus attached to this shader
|
|
int _activeCus;
|
|
|
|
// Last tick that all CUs attached to this shader were inactive
|
|
Tick _lastInactiveTick;
|
|
|
|
// some stats for measuring latency
|
|
Stats::Distribution allLatencyDist;
|
|
Stats::Distribution loadLatencyDist;
|
|
Stats::Distribution storeLatencyDist;
|
|
|
|
// average ticks from vmem inst initiateAcc to coalescer issue,
|
|
// average ticks from coalescer issue to coalescer hit callback,
|
|
// average ticks from coalescer hit callback to GM pipe enqueue,
|
|
// and average ticks spent in GM pipe's ordered resp buffer.
|
|
Stats::Distribution initToCoalesceLatency;
|
|
Stats::Distribution rubyNetworkLatency;
|
|
Stats::Distribution gmEnqueueLatency;
|
|
Stats::Distribution gmToCompleteLatency;
|
|
|
|
// average number of cache blocks requested by vmem inst, and
|
|
// average ticks for cache blocks to main memory for the Nth
|
|
// cache block generated by a vmem inst.
|
|
Stats::Distribution coalsrLineAddresses;
|
|
Stats::Distribution *cacheBlockRoundTrip;
|
|
|
|
public:
|
|
typedef ShaderParams Params;
|
|
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
|
|
|
|
GPUDispatcher &dispatcher();
|
|
void sampleLoad(const Tick accessTime);
|
|
void sampleStore(const Tick accessTime);
|
|
void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
|
|
void sampleLineRoundTrip(const std::map<Addr,
|
|
std::vector<Tick>> &roundTripTime);
|
|
|
|
SimpleThread *cpuThread;
|
|
ThreadContext *gpuTc;
|
|
BaseCPU *cpuPointer;
|
|
|
|
const ApertureRegister&
|
|
gpuVmApe() const
|
|
{
|
|
return _gpuVmApe;
|
|
}
|
|
|
|
const ApertureRegister&
|
|
ldsApe() const
|
|
{
|
|
return _ldsApe;
|
|
}
|
|
|
|
const ApertureRegister&
|
|
scratchApe() const
|
|
{
|
|
return _scratchApe;
|
|
}
|
|
|
|
bool
|
|
isGpuVmApe(Addr addr) const
|
|
{
|
|
bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
|
|
|
|
return is_gpu_vm;
|
|
}
|
|
|
|
bool
|
|
isLdsApe(Addr addr) const
|
|
{
|
|
bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
|
|
|
|
return is_lds;
|
|
}
|
|
|
|
bool
|
|
isScratchApe(Addr addr) const
|
|
{
|
|
bool is_scratch
|
|
= addr >= _scratchApe.base && addr <= _scratchApe.limit;
|
|
|
|
return is_scratch;
|
|
}
|
|
|
|
Addr
|
|
getScratchBase()
|
|
{
|
|
return _scratchApe.base;
|
|
}
|
|
|
|
Addr
|
|
getHiddenPrivateBase()
|
|
{
|
|
return shHiddenPrivateBaseVmid;
|
|
}
|
|
|
|
void
|
|
initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
|
|
{
|
|
Addr sh_hidden_base_new = queueBase - offset;
|
|
|
|
// We are initializing sh_hidden_private_base_vmid from the
|
|
// amd queue descriptor from the first queue.
|
|
// The sh_hidden_private_base_vmid is supposed to be same for
|
|
// all the queues from the same process
|
|
if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
|
|
// Do not panic if shHiddenPrivateBaseVmid == 0,
|
|
// that is if it is uninitialized. Panic only
|
|
// if the value is initilized and we get
|
|
// a differnt base later.
|
|
panic_if(shHiddenPrivateBaseVmid != 0,
|
|
"Currently we support only single process\n");
|
|
}
|
|
shHiddenPrivateBaseVmid = sh_hidden_base_new;
|
|
}
|
|
|
|
EventFunctionWrapper tickEvent;
|
|
|
|
// is this simulation going to be timing mode in the memory?
|
|
bool timingSim;
|
|
hsail_mode_e hsail_mode;
|
|
|
|
// If set, issue acq packet @ kernel launch
|
|
int impl_kern_launch_acq;
|
|
// If set, issue rel packet @ kernel end
|
|
int impl_kern_end_rel;
|
|
// If set, fetch returns may be coissued with instructions
|
|
int coissue_return;
|
|
// If set, always dump all 64 gprs to trace
|
|
int trace_vgpr_all;
|
|
// Number of cu units in the shader
|
|
int n_cu;
|
|
// Number of wavefront slots per SIMD per CU
|
|
int n_wf;
|
|
|
|
// The size of global memory
|
|
int globalMemSize;
|
|
|
|
// Tracks CU that rr dispatcher should attempt scheduling
|
|
int nextSchedCu;
|
|
|
|
// Size of scheduled add queue
|
|
uint32_t sa_n;
|
|
|
|
// Pointer to value to be increments
|
|
std::vector<int*> sa_val;
|
|
// When to do the increment
|
|
std::vector<uint64_t> sa_when;
|
|
// Amount to increment by
|
|
std::vector<int32_t> sa_x;
|
|
|
|
// List of Compute Units (CU's)
|
|
std::vector<ComputeUnit*> cuList;
|
|
|
|
GPUCommandProcessor &gpuCmdProc;
|
|
GPUDispatcher &_dispatcher;
|
|
|
|
/**
|
|
* Statistics
|
|
*/
|
|
Stats::Scalar shaderActiveTicks;
|
|
Stats::Vector vectorInstSrcOperand;
|
|
Stats::Vector vectorInstDstOperand;
|
|
void regStats();
|
|
|
|
int max_valu_insts;
|
|
int total_valu_insts;
|
|
|
|
Shader(const Params *p);
|
|
~Shader();
|
|
virtual void init();
|
|
|
|
// Run shader scheduled adds
|
|
void execScheduledAdds();
|
|
|
|
// Schedule a 32-bit value to be incremented some time in the future
|
|
void ScheduleAdd(int *val, Tick when, int x);
|
|
bool processTimingPacket(PacketPtr pkt);
|
|
|
|
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
|
|
MemCmd cmd, bool suppress_func_errors);
|
|
|
|
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
|
|
|
|
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
|
|
bool suppress_func_errors);
|
|
|
|
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
|
|
|
|
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
|
|
bool suppress_func_errors);
|
|
|
|
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
|
|
bool suppress_func_errors, int cu_id);
|
|
|
|
void
|
|
registerCU(int cu_id, ComputeUnit *compute_unit)
|
|
{
|
|
cuList[cu_id] = compute_unit;
|
|
}
|
|
|
|
void prepareInvalidate(HSAQueueEntry *task);
|
|
void prepareFlush(GPUDynInstPtr gpuDynInst);
|
|
|
|
bool dispatchWorkgroups(HSAQueueEntry *task);
|
|
Addr mmap(int length);
|
|
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
|
|
void updateContext(int cid);
|
|
void notifyCuSleep();
|
|
};
|
|
|
|
#endif // __SHADER_HH__
|