Files
gem5/src/gpu-compute/shader.hh
Daniel R. Carvalho 98ac080ec4 base-stats,misc: Rename Stats namespace as statistics
As part of recent decisions regarding namespace
naming conventions, all namespaces will be changed
to snake case.

::Stats became ::statistics.

"statistics" was chosen over "stats" to avoid generating
conflicts with the already existing variables (there are
way too many "stats" in the codebase), which would make
this patch even more disturbing for the users.

Change-Id: If877b12d7dac356f86e3b3d941bf7558a4fd8719
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/45421
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2021-05-29 11:13:49 +00:00

319 lines
9.3 KiB
C++

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __SHADER_HH__
#define __SHADER_HH__
#include <functional>
#include <string>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;
// Named size constant: 64 * 1024 == 65536. Judging by the name this is the
// LDS capacity -- NOTE(review): confirm the unit (bytes?) against its users.
static constexpr int LDS_SIZE = 64 * 1024;
/**
 * An aperture (APE) register: the base/limit pair that delimits a region
 * of the ATC mapped memory space. At present the only apertures considered
 * are GPUVM, LDS and scratch. Apertures are registered with unique values
 * on a per-device basis.
 */
struct ApertureRegister
{
    // First address of the aperture.
    Addr base;
    // Last address of the aperture; the Shader::is*Ape() range checks
    // compare with <=, i.e. they treat the limit as inclusive.
    Addr limit;
};
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
private:
ApertureRegister _gpuVmApe;
ApertureRegister _ldsApe;
ApertureRegister _scratchApe;
Addr shHiddenPrivateBaseVmid;
// Number of active Cus attached to this shader
int _activeCus;
// Last tick that all CUs attached to this shader were inactive
Tick _lastInactiveTick;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
GPUDispatcher &dispatcher();
void sampleLoad(const Tick accessTime);
void sampleStore(const Tick accessTime);
void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
void sampleLineRoundTrip(const std::map<Addr,
std::vector<Tick>> &roundTripTime);
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
const ApertureRegister&
gpuVmApe() const
{
return _gpuVmApe;
}
const ApertureRegister&
ldsApe() const
{
return _ldsApe;
}
const ApertureRegister&
scratchApe() const
{
return _scratchApe;
}
bool
isGpuVmApe(Addr addr) const
{
bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
return is_gpu_vm;
}
bool
isLdsApe(Addr addr) const
{
bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
return is_lds;
}
bool
isScratchApe(Addr addr) const
{
bool is_scratch
= addr >= _scratchApe.base && addr <= _scratchApe.limit;
return is_scratch;
}
Addr
getScratchBase()
{
return _scratchApe.base;
}
Addr
getHiddenPrivateBase()
{
return shHiddenPrivateBaseVmid;
}
void
initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
{
Addr sh_hidden_base_new = queueBase - offset;
// We are initializing sh_hidden_private_base_vmid from the
// amd queue descriptor from the first queue.
// The sh_hidden_private_base_vmid is supposed to be same for
// all the queues from the same process
if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
// Do not panic if shHiddenPrivateBaseVmid == 0,
// that is if it is uninitialized. Panic only
// if the value is initilized and we get
// a differnt base later.
panic_if(shHiddenPrivateBaseVmid != 0,
"Currently we support only single process\n");
}
shHiddenPrivateBaseVmid = sh_hidden_base_new;
}
EventFunctionWrapper tickEvent;
// is this simulation going to be timing mode in the memory?
bool timingSim;
hsail_mode_e hsail_mode;
// If set, issue acq packet @ kernel launch
int impl_kern_launch_acq;
// If set, issue rel packet @ kernel end
int impl_kern_end_rel;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace
int trace_vgpr_all;
// Number of cu units in the shader
int n_cu;
// Number of wavefront slots per SIMD per CU
int n_wf;
// The size of global memory
int globalMemSize;
// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;
// Size of scheduled add queue
uint32_t sa_n;
// Pointer to value to be increments
std::vector<int*> sa_val;
// When to do the increment
std::vector<uint64_t> sa_when;
// Amount to increment by
std::vector<int32_t> sa_x;
// List of Compute Units (CU's)
std::vector<ComputeUnit*> cuList;
GPUCommandProcessor &gpuCmdProc;
GPUDispatcher &_dispatcher;
int64_t max_valu_insts;
int64_t total_valu_insts;
Shader(const Params &p);
~Shader();
virtual void init();
// Run shader scheduled adds
void execScheduledAdds();
// Schedule a 32-bit value to be incremented some time in the future
void ScheduleAdd(int *val, Tick when, int x);
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
MemCmd cmd, bool suppress_func_errors);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
bool suppress_func_errors);
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
bool suppress_func_errors, int cu_id);
void
registerCU(int cu_id, ComputeUnit *compute_unit)
{
cuList[cu_id] = compute_unit;
}
void prepareInvalidate(HSAQueueEntry *task);
void prepareFlush(GPUDynInstPtr gpuDynInst);
bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void notifyCuSleep();
void
incVectorInstSrcOperand(int num_operands)
{
stats.vectorInstSrcOperand[num_operands]++;
}
void
incVectorInstDstOperand(int num_operands)
{
stats.vectorInstDstOperand[num_operands]++;
}
protected:
struct ShaderStats : public statistics::Group
{
ShaderStats(statistics::Group *parent, int wf_size);
// some stats for measuring latency
statistics::Distribution allLatencyDist;
statistics::Distribution loadLatencyDist;
statistics::Distribution storeLatencyDist;
// average ticks from vmem inst initiateAcc to coalescer issue,
statistics::Distribution initToCoalesceLatency;
// average ticks from coalescer issue to coalescer hit callback,
statistics::Distribution rubyNetworkLatency;
// average ticks from coalescer hit callback to GM pipe enqueue,
statistics::Distribution gmEnqueueLatency;
// average ticks spent in GM pipe's ordered resp buffer.
statistics::Distribution gmToCompleteLatency;
// average number of cache blocks requested by vmem inst
statistics::Distribution coalsrLineAddresses;
// average ticks for cache blocks to main memory for the Nth
// cache block generated by a vmem inst.
statistics::Distribution *cacheBlockRoundTrip;
statistics::Scalar shaderActiveTicks;
statistics::Vector vectorInstSrcOperand;
statistics::Vector vectorInstDstOperand;
} stats;
};
#endif // __SHADER_HH__