gem5/src/gpu-compute/shader.hh
misc: Clean up usage of arch/isa_traits.hh (Gabe Black, commit 1d755b4ba1)
isa_traits.hh used to have much more in it, but now it only has
PageShift, PageBytes, and (for now) the guest endianness. Generally
speaking, these values should only be retrieved from the System class,
so only the System class should include arch/isa_traits.hh.

Some GPU compute related files need PageBytes or PageShift. Even though
those files don't advertise their ISA dependence, they are tied to x86,
so they can include arch/x86/isa_traits.hh directly.

The only other file which legitimately needs arch/isa_traits.hh is the
decoder cache, since it uses PageBytes to size an array.
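
As a sketch of the pattern the change describes (illustrative only: the
roundDownToPage helper is hypothetical and not part of the patch), a
gpu-compute source file that needs the page size can pull
X86ISA::PageBytes from the x86 header directly:

    #include "arch/x86/isa_traits.hh"
    #include "base/types.hh" // for Addr

    // Hypothetical helper: round an address down to its page boundary.
    // X86ISA::PageBytes is a power of two, so masking with
    // ~(PageBytes - 1) clears the page-offset bits.
    static inline Addr
    roundDownToPage(Addr addr)
    {
        return addr & ~(X86ISA::PageBytes - 1);
    }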

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Steve Reinhardt
*/
#ifndef __SHADER_HH__
#define __SHADER_HH__

#include <functional>
#include <string>
#include "arch/isa.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"
class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;
namespace TheISA
{
class GpuTLB;
}
static const int LDS_SIZE = 65536;

// Aperture (APE) registers define the base/limit pair for an
// ATC-mapped memory space. Currently the only APEs we consider are for
// GPUVM, LDS, and scratch. The APEs are registered with unique values
// on a per-device basis.
struct ApertureRegister
{
    Addr base;
    Addr limit;
};
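
// An address addr lies within an aperture exactly when
// base <= addr && addr <= limit; the is*Ape() helpers in Shader below
// implement this inclusive range test.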

// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
  private:
    ApertureRegister _gpuVmApe;
    ApertureRegister _ldsApe;
    ApertureRegister _scratchApe;

    Addr shHiddenPrivateBaseVmid;

    // Number of active CUs attached to this shader
    int _activeCus;

    // Last tick at which all CUs attached to this shader were inactive
    Tick _lastInactiveTick;
    // Some stats for measuring latency
    Stats::Distribution allLatencyDist;
    Stats::Distribution loadLatencyDist;
    Stats::Distribution storeLatencyDist;

    // Average ticks from vmem inst initiateAcc to coalescer issue,
    // average ticks from coalescer issue to coalescer hit callback,
    // average ticks from coalescer hit callback to GM pipe enqueue,
    // and average ticks spent in the GM pipe's ordered response buffer.
    Stats::Distribution initToCoalesceLatency;
    Stats::Distribution rubyNetworkLatency;
    Stats::Distribution gmEnqueueLatency;
    Stats::Distribution gmToCompleteLatency;

    // Average number of cache blocks requested by a vmem inst, and
    // average round-trip ticks to main memory for the Nth cache block
    // generated by a vmem inst.
    Stats::Distribution coalsrLineAddresses;
    Stats::Distribution *cacheBlockRoundTrip;
  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT, VECTOR_SCALAR};

    GPUDispatcher &dispatcher();

    void sampleLoad(const Tick accessTime);
    void sampleStore(const Tick accessTime);
    void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
    void sampleLineRoundTrip(const std::map<Addr,
        std::vector<Tick>> &roundTripTime);

    SimpleThread *cpuThread;
    ThreadContext *gpuTc;
    BaseCPU *cpuPointer;
    const ApertureRegister&
    gpuVmApe() const
    {
        return _gpuVmApe;
    }

    const ApertureRegister&
    ldsApe() const
    {
        return _ldsApe;
    }

    const ApertureRegister&
    scratchApe() const
    {
        return _scratchApe;
    }

    bool
    isGpuVmApe(Addr addr) const
    {
        bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;

        return is_gpu_vm;
    }

    bool
    isLdsApe(Addr addr) const
    {
        bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;

        return is_lds;
    }

    bool
    isScratchApe(Addr addr) const
    {
        bool is_scratch
            = addr >= _scratchApe.base && addr <= _scratchApe.limit;

        return is_scratch;
    }
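
    // A minimal usage sketch (hypothetical caller, not part of this
    // header), assuming a Shader &shader and a virtual address vaddr:
    //
    //     if (shader.isLdsApe(vaddr)) {
    //         // the access falls in the LDS aperture
    //     } else if (shader.isScratchApe(vaddr)) {
    //         // the access falls in the scratch aperture
    //     } else if (shader.isGpuVmApe(vaddr)) {
    //         // the access falls in the GPUVM aperture
    //     }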
    Addr
    getScratchBase()
    {
        return _scratchApe.base;
    }

    Addr
    getHiddenPrivateBase()
    {
        return shHiddenPrivateBaseVmid;
    }
    void
    initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
    {
        Addr sh_hidden_base_new = queueBase - offset;

        // We initialize sh_hidden_private_base_vmid from the amd queue
        // descriptor of the first queue. It is supposed to be the same
        // for all queues belonging to the same process.
        if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
            // Do not panic if shHiddenPrivateBaseVmid == 0, i.e. if it
            // is uninitialized. Panic only if the value has been
            // initialized and we get a different base later.
            panic_if(shHiddenPrivateBaseVmid != 0,
                     "Currently we support only a single process\n");
        }

        shHiddenPrivateBaseVmid = sh_hidden_base_new;
    }
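
    // Worked example (hypothetical values): a first call such as
    // initShHiddenPrivateBase(0x2000, 0x100) records a base of 0x1f00.
    // A later call that yields the same base is a no-op; one that
    // yields a different base triggers the panic above, since only a
    // single process is currently supported.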
    EventFunctionWrapper tickEvent;

    // Is this simulation going to be timing mode in the memory?
    bool timingSim;
    hsail_mode_e hsail_mode;

    // If set, issue an acquire packet at kernel launch
    int impl_kern_launch_acq;

    // If set, issue a release packet at kernel end
    int impl_kern_end_rel;

    // If set, fetch returns may be coissued with instructions
    int coissue_return;

    // If set, always dump all 64 gprs to trace
    int trace_vgpr_all;

    // Number of compute units (CUs) in the shader
    int n_cu;

    // Number of wavefront slots per SIMD per CU
    int n_wf;

    // The size of global memory
    int globalMemSize;

    // Tracks the CU that the round-robin dispatcher should attempt to
    // schedule next
    int nextSchedCu;

    // Size of the scheduled-add queue
    uint32_t sa_n;

    // Pointers to the values to be incremented
    std::vector<int*> sa_val;
    // When to do each increment
    std::vector<uint64_t> sa_when;
    // Amount to increment by
    std::vector<int32_t> sa_x;

    // List of compute units (CUs)
    std::vector<ComputeUnit*> cuList;
    GPUCommandProcessor &gpuCmdProc;
    GPUDispatcher &_dispatcher;

    /**
     * Statistics
     */
    Stats::Scalar shaderActiveTicks;
    Stats::Vector vectorInstSrcOperand;
    Stats::Vector vectorInstDstOperand;
    void regStats();

    int64_t max_valu_insts;
    int64_t total_valu_insts;

    Shader(const Params *p);
    ~Shader();

    virtual void init();
    // Run the shader's scheduled adds
    void execScheduledAdds();

    // Schedule a 32-bit value to be incremented some time in the future
    void ScheduleAdd(int *val, Tick when, int x);
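
    // A minimal usage sketch (hypothetical caller, not part of this
    // header): ask the shader to bump a counter by 1 once the scheduled
    // time associated with `when` arrives; execScheduledAdds() later
    // performs any increments that have come due.
    //
    //     int counter = 0;
    //     shader->ScheduleAdd(&counter, when, 1);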
    bool processTimingPacket(PacketPtr pkt);

    void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                   MemCmd cmd, bool suppress_func_errors);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                 bool suppress_func_errors);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                  bool suppress_func_errors);

    void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                            bool suppress_func_errors, int cu_id);
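
    // Illustrative only (hypothetical caller): functionally read 64
    // bytes at vaddr on behalf of CU 0 into a local buffer, using the
    // ReadMem overload declared above:
    //
    //     uint8_t buf[64];
    //     shader->ReadMem(vaddr, buf, sizeof(buf), 0);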
    void
    registerCU(int cu_id, ComputeUnit *compute_unit)
    {
        cuList[cu_id] = compute_unit;
    }

    void prepareInvalidate(HSAQueueEntry *task);
    void prepareFlush(GPUDynInstPtr gpuDynInst);

    bool dispatchWorkgroups(HSAQueueEntry *task);
    Addr mmap(int length);
    void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
    void updateContext(int cid);
    void notifyCuSleep();
};

#endif // __SHADER_HH__