In SE mode there is no GPU device to read the version from, and in FS mode there is no GPU driver to read it from, so a mode-dependent conditional is needed to obtain the gfx version. Change-Id: I33fdafb60d351ebc5148e2248244537fb5bebd31 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/71078 Tested-by: kokoro <noreply+kokoro@google.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
257 lines
9.3 KiB
C++
/*
|
|
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
|
|
* agent. Other GPU devices, or other HSA agents, should not derive
|
|
* from this class. Instead device-specific implementations of an
|
|
* HSADriver should be provided for each unique device.
|
|
*/
|
|
|
|
#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
|
|
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
|
|
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <set>
|
|
#include <unordered_map>
|
|
|
|
#include "base/addr_range_map.hh"
|
|
#include "base/types.hh"
|
|
#include "enums/GfxVersion.hh"
|
|
#include "mem/request.hh"
|
|
#include "sim/emul_driver.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
struct GPUComputeDriverParams;
|
|
class GPUCommandProcessor;
|
|
class PortProxy;
|
|
class ThreadContext;
|
|
|
|
/**
 * Emulated driver for an AMD GPU HSA agent. Implements the ioctl/mmap/open
 * interface the ROCm user-space stack expects from the KFD, and tracks the
 * signal events used to sleep/wake CPU threads that wait on GPU completion.
 */
class GPUComputeDriver final : public EmulatedDriver
{
  public:
    typedef GPUComputeDriverParams Params;
    GPUComputeDriver(const Params &p);

    /** Dispatch a KFD ioctl request issued by the guest ROCm runtime. */
    int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;

    int open(ThreadContext *tc, int mode, int flags) override;
    Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
              int prot, int tgt_flags, int tgt_fd, off_t offset) override;

    /**
     * Wake up the thread context (if any) sleeping on the given event ID,
     * or record the signal in ETable so a future waiter does not block.
     */
    virtual void signalWakeupEvent(uint32_t event_id);

    /**
     * Put the calling CPU thread to sleep until a signal event fires or the
     * timeout (in milliseconds) expires.
     */
    void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);

    /**
     * Called by the compute units right before a request is issued to ruby.
     * This uses our VMAs to correctly set the MTYPE on a per-request basis.
     * In real hardware, this is actually done through PTE bits in GPUVM.
     * Since we are running a single VM (x86 PT) system, the MTYPE bits aren't
     * available. Adding GPUVM specific bits to x86 page tables probably
     * isn't the best way to proceed. For now we just have the driver set
     * these until we implement a proper dual PT system.
     */
    void setMtype(RequestPtr req);

    /**
     * Size of a doorbell for the configured gfx version (presumably in
     * bytes, i.e., 32b vs. 64b doorbells -- TODO confirm at callers).
     * Fatals on an unrecognized gfx version.
     */
    int
    doorbellSize()
    {
        switch (gfxVersion) {
          case GfxVersion::gfx801:
          case GfxVersion::gfx803:
          case GfxVersion::gfx902:
            return 4;
          case GfxVersion::gfx900:
            // gfx900 supports large BAR, so it has a larger doorbell
            return 8;
          default:
            fatal("Invalid GPU type\n");
        }
        // Unreachable: every case above returns or fatals. Kept to satisfy
        // compilers that cannot see fatal() never returns.
        return 4;
    }

    /**
     * Event used by sleepCPU() to wake a sleeping thread context after its
     * requested timeout elapses.
     */
    class DriverWakeupEvent : public Event
    {
      public:
        DriverWakeupEvent(GPUComputeDriver *gpu_driver,
                          ThreadContext *thrd_cntxt)
            : driver(gpu_driver), tc(thrd_cntxt) {}
        void process() override;
        const char *description() const override;
        void scheduleWakeup(Tick wakeup_delay);
      private:
        // Owning driver; used to access the event table on wakeup.
        GPUComputeDriver *driver;
        // Thread context this timer will wake.
        ThreadContext *tc;
    };

    /** Per-event bookkeeping record, stored in ETable keyed by event ID. */
    class EventTableEntry
    {
      public:
        EventTableEntry() :
            mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
        {}
        // Mail box pointer for this address. Current implementation does not
        // use this mailBoxPtr to notify events but directly calls
        // signalWakeupEvent from dispatcher (GPU) to notify events. So,
        // currently this mailBoxPtr is not used. But a future implementation
        // may communicate to the driver using mailBoxPtr.
        Addr mailBoxPtr;
        // Thread context waiting on this event. We do not support multiple
        // threads waiting on an event currently.
        ThreadContext *tc;
        // threadWaiting = true, if some thread context is waiting on this
        // event. A thread context waiting on this event is put to sleep.
        bool threadWaiting;
        // setEvent = true, if this event is triggered but when this event
        // triggered, no thread context was waiting on it. In the future, some
        // thread context will try to wait on this event but since event has
        // already happened, we will not allow that thread context to go to
        // sleep. The above mentioned scenario can happen when the waiting
        // thread and wakeup thread race on this event and the wakeup thread
        // beat the waiting thread at the driver.
        bool setEvent;
    };
    typedef class EventTableEntry ETEntry;

    /** gfx version of the GPU agent this driver controls. */
    GfxVersion getGfxVersion() const { return gfxVersion; }

  private:
    /**
     * GPU that is controlled by this driver.
     */
    GPUCommandProcessor *device;
    // Next queue ID to hand out when the runtime creates an HSA queue
    // (presumably monotonically increasing -- see allocateQueue).
    uint32_t queueId;
    // True if the device is a discrete GPU (dGPU) rather than an APU --
    // affects aperture layout and memory pool reporting.
    bool isdGPU;
    // gfx version of the controlled device; selects doorbell size and
    // aperture layout.
    GfxVersion gfxVersion;
    // Memory pool ID reported for dGPU local memory (NOTE(review): exact
    // KFD semantics to be confirmed against the ioctl handlers).
    int dGPUPoolID;
    // Guest address of the page backing event mailboxes/slots.
    Addr eventPage;
    // Next free slot index in the event page; also the upper bound on valid
    // event IDs (see the assert in EventList::clearEvents).
    uint32_t eventSlotIndex;
    // Event table that keeps track of events. It is indexed with event ID.
    std::unordered_map<uint32_t, ETEntry> ETable;

    /**
     * VMA structures for GPUVM memory.
     */
    AddrRangeMap<Request::CacheCoherenceFlags, 1> gpuVmas;

    /**
     * Mtype bits {Cached, Read Write, Shared} for caches
     */
    enum MtypeFlags
    {
        SHARED = 0,
        READ_WRITE = 1,
        CACHED = 2,
        NUM_MTYPE_BITS
    };

    // MTYPE applied to requests whose address is not covered by gpuVmas.
    Request::CacheCoherenceFlags defaultMtype;

    // TCEvents map keeps track of the events that can wakeup this thread.
    // When multiple events can wake up this thread, this data structure helps
    // to reset all events when one of those events wake up this thread. The
    // signal events that can wake up this thread are stored in signalEvents
    // whereas the timer wakeup event is stored in timerEvent.
    class EventList
    {
      public:
        EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
        EventList(GPUComputeDriver *gpu_driver, ThreadContext *thrd_cntxt)
            : driver(gpu_driver), timerEvent(gpu_driver, thrd_cntxt)
        { }
        // Detach every signal event from its waiting thread and cancel any
        // pending timer wakeup. Called once the thread has been woken so a
        // single wakeup clears all of its pending wait sources.
        void clearEvents() {
            assert(driver);
            for (auto event : signalEvents) {
                // Event IDs are allocated sequentially from eventSlotIndex,
                // so anything at or above it was never handed out.
                assert(event < driver->eventSlotIndex);
                driver->ETable[event].tc = nullptr;
                driver->ETable[event].threadWaiting = false;
            }
            signalEvents.clear();
            if (timerEvent.scheduled()) {
                driver->deschedule(timerEvent);
            }
        }
        GPUComputeDriver *driver;
        DriverWakeupEvent timerEvent;
        // The set of events that can wake up the same thread.
        std::set<uint32_t> signalEvents;
    };
    // Per-thread-context list of wait sources (signal events + timer).
    std::unordered_map<ThreadContext *, EventList> TCEvents;

    /**
     * Register a region of host memory as uncacheable from the perspective
     * of the dGPU.
     */
    void registerUncacheableMemory(Addr start, Addr length);

    /**
     * The aperture (APE) base/limit pairs are set
     * statically at startup by the real KFD. AMD
     * x86_64 CPUs only use the areas in the 64b
     * address space where VA[63:47] == 0x1ffff or
     * VA[63:47] = 0. These methods generate the APE
     * base/limit pairs in exactly the same way as
     * the real KFD does, which ensures these APEs do
     * not fall into the CPU's address space
     *
     * see the macros in the KFD driver in the ROCm
     * Linux kernel source:
     *
     * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
     */
    Addr gpuVmApeBase(int gpuNum) const;
    Addr gpuVmApeLimit(Addr apeBase) const;
    Addr scratchApeBase(int gpuNum) const;
    Addr scratchApeBaseV9() const;
    Addr scratchApeLimit(Addr apeBase) const;
    Addr ldsApeBase(int gpuNum) const;
    Addr ldsApeBaseV9() const;
    Addr ldsApeLimit(Addr apeBase) const;

    /**
     * Allocate/deallocate GPUVM VMAs for tracking virtual address allocations
     * and properties on DGPUs. For now, we use these to track MTYPE and to
     * be able to select which pages to unmap when the user provides us with
     * a handle during the free ioctl.
     */
    void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
                        Addr length);
    Addr deallocateGpuVma(Addr start);

    // Set up an HSA queue on the device from the user's ioctl arguments.
    void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr);

};
|
|
|
|
} // namespace gem5
|
|
|
|
#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
|