gpu-compute, dev-hsa: Remove HSADriver, HSADevice
HSADriver/HSADevice were primarily used with GPUCommandProcessor/ GPUComputeDriver. This change merges the classes together to simplify the inheritance hierarchy, as well as removing any casting. Change-Id: I670eb9b49a16c8aba17e13fd1d1287d0621c9f48 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42219 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
committed by
Matthew Poremba
parent
d019912efa
commit
ec6b325382
@@ -34,12 +34,6 @@ from m5.params import *
|
||||
from m5.proxy import *
|
||||
from m5.objects.Device import DmaDevice
|
||||
|
||||
class HSADevice(DmaDevice):
|
||||
type = 'HSADevice'
|
||||
abstract = True
|
||||
cxx_header = "dev/hsa/hsa_device.hh"
|
||||
hsapp = Param.HSAPacketProcessor("PP attached to this device")
|
||||
|
||||
class HSAPacketProcessor(DmaDevice):
|
||||
type = 'HSAPacketProcessor'
|
||||
cxx_header = 'dev/hsa/hsa_packet_processor.hh'
|
||||
|
||||
@@ -37,12 +37,8 @@ if not env['BUILD_GPU']:
|
||||
Return()
|
||||
|
||||
SimObject('HSADevice.py')
|
||||
SimObject('HSADriver.py')
|
||||
|
||||
Source('hsa_device.cc')
|
||||
Source('hsa_driver.cc')
|
||||
Source('hsa_packet_processor.cc')
|
||||
Source('hw_scheduler.cc')
|
||||
|
||||
DebugFlag('HSADriver')
|
||||
DebugFlag('HSAPacketProcessor')
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "dev/hsa/hsa_device.hh"
|
||||
|
||||
#include "base/chunk_generator.hh"
|
||||
#include "sim/process.hh"
|
||||
|
||||
/**
 * Accessor for the HSA packet processor attached to this device.
 *
 * @return reference to the object the hsaPP member points at.
 */
HSAPacketProcessor &
HSADevice::hsaPacketProc()
{
    HSAPacketProcessor &pkt_proc = *hsaPP;
    return pkt_proc;
}
|
||||
|
||||
void
|
||||
HSADevice::dmaReadVirt(Addr host_addr, unsigned size,
|
||||
DmaCallback *cb, void *data, Tick delay)
|
||||
{
|
||||
dmaVirt(&DmaDevice::dmaRead, host_addr, size, cb, data, delay);
|
||||
}
|
||||
|
||||
void
|
||||
HSADevice::dmaWriteVirt(Addr host_addr, unsigned size,
|
||||
DmaCallback *cb, void *data, Tick delay)
|
||||
{
|
||||
dmaVirt(&DmaDevice::dmaWrite, host_addr, size, cb, data, delay);
|
||||
}
|
||||
|
||||
void
|
||||
HSADevice::dmaVirt(DmaFnPtr dmaFn, Addr addr, unsigned size,
|
||||
DmaCallback *cb, void *data, Tick delay)
|
||||
{
|
||||
if (size == 0) {
|
||||
if (cb)
|
||||
schedule(cb->getChunkEvent(), curTick() + delay);
|
||||
return;
|
||||
}
|
||||
|
||||
// move the buffer data pointer with the chunks
|
||||
uint8_t *loc_data = (uint8_t*)data;
|
||||
|
||||
for (ChunkGenerator gen(addr, size, PAGE_SIZE); !gen.done(); gen.next()) {
|
||||
Addr phys;
|
||||
|
||||
// translate pages into their corresponding frames
|
||||
translateOrDie(gen.addr(), phys);
|
||||
|
||||
Event *event = cb ? cb->getChunkEvent() : nullptr;
|
||||
|
||||
(this->*dmaFn)(phys, gen.size(), event, loc_data, delay);
|
||||
|
||||
loc_data += gen.size();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* HSADevices will perform DMA operations on VAs, and because
|
||||
* page faults are not currently supported for HSADevices, we
|
||||
* must be able to find the pages mapped for the process.
|
||||
*/
|
||||
void
|
||||
HSADevice::translateOrDie(Addr vaddr, Addr &paddr)
|
||||
{
|
||||
/**
|
||||
* Grab the process and try to translate the virtual address with it;
|
||||
* with new extensions, it will likely be wrong to just arbitrarily
|
||||
* grab context zero.
|
||||
*/
|
||||
auto process = sys->threads[0]->getProcessPtr();
|
||||
|
||||
if (!process->pTable->translate(vaddr, paddr)) {
|
||||
fatal("failed translation: vaddr 0x%x\n", vaddr);
|
||||
}
|
||||
}
|
||||
@@ -1,128 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __DEV_HSA_HSA_DEVICE_HH__
|
||||
#define __DEV_HSA_HSA_DEVICE_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
|
||||
#include "base/logging.hh"
|
||||
#include "base/types.hh"
|
||||
#include "dev/dma_device.hh"
|
||||
#include "dev/hsa/hsa_packet_processor.hh"
|
||||
#include "params/HSADevice.hh"
|
||||
|
||||
class HSADriver;
|
||||
|
||||
class HSADevice : public DmaDevice
|
||||
{
|
||||
public:
|
||||
typedef HSADeviceParams Params;
|
||||
typedef std::function<void(const uint64_t &)> HsaSignalCallbackFunction;
|
||||
|
||||
HSADevice(const Params &p) : DmaDevice(p), hsaPP(p.hsapp)
|
||||
{
|
||||
assert(hsaPP);
|
||||
hsaPP->setDevice(this);
|
||||
};
|
||||
|
||||
HSAPacketProcessor& hsaPacketProc();
|
||||
|
||||
/**
|
||||
* submitAgentDispatchPkt() accepts AQL dispatch packets from the HSA
|
||||
* packet processor. Not all devices will accept AQL dispatch packets,
|
||||
* so the default implementation will fatal.
|
||||
* Implementation added to steal kernel signals.
|
||||
*/
|
||||
virtual void
|
||||
submitAgentDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
|
||||
{
|
||||
fatal("%s does not accept dispatch packets\n", name());
|
||||
}
|
||||
|
||||
/**
|
||||
* submitDispatchPkt() accepts AQL dispatch packets from the HSA packet
|
||||
* processor. Not all devices will accept AQL dispatch packets, so the
|
||||
* default implementation will fatal.
|
||||
*/
|
||||
virtual void
|
||||
submitDispatchPkt(void *raw_pkt, uint32_t qID, Addr host_pkt_addr)
|
||||
{
|
||||
fatal("%s does not accept dispatch packets\n", name());
|
||||
}
|
||||
|
||||
/**
|
||||
* submitVendorPkt() accepts vendor specific packets from the HSA
|
||||
* packet processor. This method should be overridden in any HSADevice
|
||||
* that accepts vendor specific packets, and should interpret the
|
||||
* packet according to the vendor's specifications. Not all HSA
|
||||
* devices will accept vendor specific packets, so the default
|
||||
* implementation will fatal.
|
||||
*/
|
||||
virtual void
|
||||
submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
|
||||
{
|
||||
fatal("%s does not accept vendor specific packets\n", name());
|
||||
}
|
||||
virtual void
|
||||
attachDriver(HSADriver *driver)
|
||||
{
|
||||
fatal("%s does not need HSA driver\n", name());
|
||||
}
|
||||
virtual void
|
||||
updateHsaSignal(Addr signal_handle, uint64_t signal_value,
|
||||
HsaSignalCallbackFunction function = [] (const uint64_t &) { })
|
||||
{
|
||||
fatal("%s does not have HSA signal update functionality.\n", name());
|
||||
}
|
||||
virtual uint64_t
|
||||
functionalReadHsaSignal(Addr signal_handle)
|
||||
{
|
||||
fatal("%s does not have HSA signal read functionality.\n", name());
|
||||
}
|
||||
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb,
|
||||
void *data, Tick delay = 0);
|
||||
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *cb,
|
||||
void *data, Tick delay = 0);
|
||||
|
||||
protected:
|
||||
// Typedefing dmaRead and dmaWrite function pointer
|
||||
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
|
||||
HSAPacketProcessor *hsaPP;
|
||||
void dmaVirt(DmaFnPtr, Addr host_addr, unsigned size, DmaCallback *cb,
|
||||
void *data, Tick delay = 0);
|
||||
void translateOrDie(Addr vaddr, Addr &paddr);
|
||||
};
|
||||
|
||||
#endif // __DEV_HSA_HSA_DEVICE_HH__
|
||||
@@ -1,188 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "dev/hsa/hsa_driver.hh"
|
||||
|
||||
#include "base/trace.hh"
|
||||
#include "debug/HSADriver.hh"
|
||||
#include "dev/hsa/hsa_device.hh"
|
||||
#include "dev/hsa/hsa_packet_processor.hh"
|
||||
#include "dev/hsa/kfd_event_defines.h"
|
||||
#include "dev/hsa/kfd_ioctl.h"
|
||||
#include "params/HSADriver.hh"
|
||||
#include "sim/process.hh"
|
||||
#include "sim/proxy_ptr.hh"
|
||||
#include "sim/syscall_emul_buf.hh"
|
||||
|
||||
/**
 * Construct the driver and bind it to the HSA device named in the params.
 *
 * eventPage and eventSlotIndex are explicitly zeroed: mmap() tests
 * eventPage to decide whether the event page still has to be reserved,
 * and signalWakeupEvent() bounds-checks event IDs against eventSlotIndex,
 * so neither may start with an indeterminate value. The initializers are
 * listed in member declaration order.
 *
 * @param p SimObject parameters; p.device is the controlled HSA device
 */
HSADriver::HSADriver(const HSADriverParams &p)
    : EmulatedDriver(p), eventPage(0), eventSlotIndex(0),
      device(p.device), queueId(0)
{
}
|
||||
|
||||
/**
|
||||
* Create an FD entry for the KFD inside of the owning process.
|
||||
*/
|
||||
int
|
||||
HSADriver::open(ThreadContext *tc, int mode, int flags)
|
||||
{
|
||||
DPRINTF(HSADriver, "Opened %s\n", filename);
|
||||
auto process = tc->getProcessPtr();
|
||||
auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
|
||||
int tgt_fd = process->fds->allocFD(device_fd_entry);
|
||||
return tgt_fd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Currently, mmap() will simply setup a mapping for the associated
|
||||
* device's packet processor's doorbells and creates the event page.
|
||||
*/
|
||||
Addr
|
||||
HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot,
|
||||
int tgt_flags, int tgt_fd, off_t offset)
|
||||
{
|
||||
auto process = tc->getProcessPtr();
|
||||
auto mem_state = process->memState;
|
||||
|
||||
Addr pg_off = offset >> PAGE_SHIFT;
|
||||
Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
|
||||
DPRINTF(HSADriver, "amdkfd mmap (start: %p, length: 0x%x,"
|
||||
"offset: 0x%x)\n", start, length, offset);
|
||||
|
||||
switch (mmap_type) {
|
||||
case KFD_MMAP_TYPE_DOORBELL:
|
||||
DPRINTF(HSADriver, "amdkfd mmap type DOORBELL offset\n");
|
||||
start = mem_state->extendMmap(length);
|
||||
process->pTable->map(start, device->hsaPacketProc().pioAddr,
|
||||
length, false);
|
||||
break;
|
||||
case KFD_MMAP_TYPE_EVENTS:
|
||||
DPRINTF(HSADriver, "amdkfd mmap type EVENTS offset\n");
|
||||
panic_if(start != 0,
|
||||
"Start address should be provided by KFD\n");
|
||||
panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
|
||||
"Requested length %d, expected length %d; length "
|
||||
"mismatch\n", length, 8 * KFD_SIGNAL_EVENT_LIMIT);
|
||||
/**
|
||||
* We don't actually access these pages. We just need to reserve
|
||||
* some VA space. See commit id 5ce8abce for details on how
|
||||
* events are currently implemented.
|
||||
*/
|
||||
if (!eventPage) {
|
||||
eventPage = mem_state->extendMmap(length);
|
||||
start = eventPage;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
|
||||
break;
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Forward relevant parameters to packet processor; queueID
|
||||
* is used to link doorbell. The queueIDs are not re-used
|
||||
* in current implementation, and we allocate only one page
|
||||
* (4096 bytes) for doorbells, so check if this queue ID can
|
||||
* be mapped into that page.
|
||||
*/
|
||||
void
|
||||
HSADriver::allocateQueue(ThreadContext *tc, Addr ioc_buf)
|
||||
{
|
||||
VPtr<kfd_ioctl_create_queue_args> args(ioc_buf, tc);
|
||||
|
||||
if (queueId >= 0x1000) {
|
||||
fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
|
||||
}
|
||||
|
||||
args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
|
||||
KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;
|
||||
|
||||
args->queue_id = queueId++;
|
||||
auto &hsa_pp = device->hsaPacketProc();
|
||||
hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
|
||||
args->ring_base_address, args->queue_id,
|
||||
args->ring_size);
|
||||
}
|
||||
|
||||
const char*
|
||||
HSADriver::DriverWakeupEvent::description() const
|
||||
{
|
||||
return "DriverWakeupEvent";
|
||||
}
|
||||
|
||||
void
|
||||
HSADriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
|
||||
{
|
||||
assert(driver);
|
||||
driver->schedule(this, curTick() + wakeup_delay);
|
||||
}
|
||||
|
||||
void
|
||||
HSADriver::signalWakeupEvent(uint32_t event_id)
|
||||
{
|
||||
panic_if(event_id >= eventSlotIndex,
|
||||
"Trying wakeup on an event that is not yet created\n");
|
||||
if (ETable[event_id].threadWaiting) {
|
||||
panic_if(!ETable[event_id].tc,
|
||||
"No thread context to wake up\n");
|
||||
ThreadContext *tc = ETable[event_id].tc;
|
||||
DPRINTF(HSADriver,
|
||||
"Signal event: Waking up CPU %d\n", tc->cpuId());
|
||||
// Wake up this thread
|
||||
tc->activate();
|
||||
// Remove events that can wake up this thread
|
||||
TCEvents[tc].clearEvents();
|
||||
} else {
|
||||
// This may be a race condition between an ioctl call asking to wait on
|
||||
// this event and this signalWakeupEvent. Taking care of this race
|
||||
// condition here by setting the event here. The ioctl call should take
|
||||
// the necessary action when waiting on an already set event. However,
|
||||
// this may be a genuine instance in which the runtime has decided not
|
||||
// to wait on this event. But since we cannot distinguish this case with
|
||||
// the race condition, we are any way setting the event.
|
||||
ETable[event_id].setEvent = true;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
HSADriver::DriverWakeupEvent::process()
|
||||
{
|
||||
DPRINTF(HSADriver,
|
||||
"Timer event: Waking up CPU %d\n", tc->cpuId());
|
||||
// Wake up this thread
|
||||
tc->activate();
|
||||
// Remove events that can wake up this thread
|
||||
driver->TCEvents[tc].clearEvents();
|
||||
}
|
||||
@@ -1,163 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* For use for simulation and test purposes only
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from this
|
||||
* software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* An HSADriver is an emulated driver that controls an HSA agent,
|
||||
* or more simply put, an HSA device. An HSA device is a device
|
||||
* that has an associated HSA packet processor.
|
||||
*
|
||||
* In the base HSADriver class the open() method is implemented, as
|
||||
* well as the mmap() call, which maps the HSA packet processor's
|
||||
* doorbells. Drivers for other HSA devices should derive from this
|
||||
* class and implement the necessary methods; typically this is an
|
||||
* ioctl() method that satisfies the ioctl requests needed to manage
|
||||
* and control the device.
|
||||
*/
|
||||
|
||||
#ifndef __DEV_HSA_HSA_DRIVER_HH__
|
||||
#define __DEV_HSA_HSA_DRIVER_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "base/logging.hh"
|
||||
#include "base/types.hh"
|
||||
#include "cpu/thread_context.hh"
|
||||
#include "sim/emul_driver.hh"
|
||||
|
||||
struct HSADriverParams;
|
||||
class HSADevice;
|
||||
|
||||
class HSADriver : public EmulatedDriver
|
||||
{
|
||||
public:
|
||||
HSADriver(const HSADriverParams &p);
|
||||
|
||||
int open(ThreadContext *tc, int mode, int flags);
|
||||
Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
|
||||
int prot, int tgt_flags, int tgt_fd, off_t offset);
|
||||
virtual void signalWakeupEvent(uint32_t event_id);
|
||||
class DriverWakeupEvent : public Event
|
||||
{
|
||||
public:
|
||||
DriverWakeupEvent(HSADriver *hsa_driver, ThreadContext *thrd_cntxt)
|
||||
: driver(hsa_driver), tc(thrd_cntxt) {}
|
||||
void process() override;
|
||||
const char *description() const override;
|
||||
void scheduleWakeup(Tick wakeup_delay);
|
||||
private:
|
||||
HSADriver *driver;
|
||||
ThreadContext *tc;
|
||||
};
|
||||
class EventTableEntry
|
||||
{
|
||||
public:
|
||||
EventTableEntry() :
|
||||
mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
|
||||
{}
|
||||
// Mail box pointer for this address. Current implementation does not
|
||||
// use this mailBoxPtr to notify events but directly calls
|
||||
// signalWakeupEvent from dispatcher (GPU) to notify event. So,
|
||||
// currently this mailBoxPtr is not used. But a future implementation
|
||||
// may communicate to the driver using mailBoxPtr.
|
||||
Addr mailBoxPtr;
|
||||
// Thread context waiting on this event. We do not support multiple
|
||||
// threads waiting on an event currently.
|
||||
ThreadContext *tc;
|
||||
// threadWaiting = true, if some thread context is waiting on this
|
||||
// event. A thread context waiting on this event is put to sleep.
|
||||
bool threadWaiting;
|
||||
// setEvent = true, if this event is triggered but when this event
|
||||
// triggered, no thread context was waiting on it. In the future, some
|
||||
// thread context will try to wait on this event but since event has
|
||||
// already happened, we will not allow that thread context to go to
|
||||
// sleep. The above mentioned scenario can happen when the waiting
|
||||
// thread and wakeup thread race on this event and the wakeup thread
|
||||
// beat the waiting thread at the driver.
|
||||
bool setEvent;
|
||||
};
|
||||
typedef class EventTableEntry ETEntry;
|
||||
|
||||
protected:
|
||||
Addr eventPage;
|
||||
uint32_t eventSlotIndex;
|
||||
// Event table that keeps track of events. It is indexed with event ID.
|
||||
std::unordered_map<uint32_t, ETEntry> ETable;
|
||||
|
||||
// TCEvents map keeps track of the events that can wakeup this thread. When
|
||||
// multiple events can wake up this thread, this data structure helps to
|
||||
// reset all events when one of those events wake up this thread. The
|
||||
// signal events that can wake up this thread are stored in signalEvents
|
||||
// whereas the timer wakeup event is stored in timerEvent.
|
||||
class EventList
|
||||
{
|
||||
public:
|
||||
EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
|
||||
EventList(HSADriver *hsa_driver, ThreadContext *thrd_cntxt)
|
||||
: driver(hsa_driver), timerEvent(hsa_driver, thrd_cntxt)
|
||||
{ }
|
||||
void clearEvents() {
|
||||
assert(driver);
|
||||
for (auto event : signalEvents) {
|
||||
assert(event < driver->eventSlotIndex);
|
||||
panic_if(driver->ETable[event].tc->status() == \
|
||||
ThreadContext::Suspended,
|
||||
"Thread should not be suspended\n");
|
||||
driver->ETable[event].tc = nullptr;
|
||||
driver->ETable[event].threadWaiting = false;
|
||||
}
|
||||
signalEvents.clear();
|
||||
if (timerEvent.scheduled()) {
|
||||
driver->deschedule(timerEvent);
|
||||
}
|
||||
}
|
||||
HSADriver *driver;
|
||||
DriverWakeupEvent timerEvent;
|
||||
// The set of events that can wake up the same thread.
|
||||
std::set<uint32_t> signalEvents;
|
||||
};
|
||||
std::unordered_map<ThreadContext *, EventList> TCEvents;
|
||||
|
||||
/**
|
||||
* HSA agent (device) that is controlled by this driver.
|
||||
*/
|
||||
HSADevice *device;
|
||||
uint32_t queueId;
|
||||
|
||||
void allocateQueue(ThreadContext *tc, Addr ioc_buf);
|
||||
};
|
||||
|
||||
#endif // __DEV_HSA_HSA_DRIVER_HH__
|
||||
@@ -42,9 +42,9 @@
|
||||
#include "base/trace.hh"
|
||||
#include "debug/HSAPacketProcessor.hh"
|
||||
#include "dev/dma_device.hh"
|
||||
#include "dev/hsa/hsa_device.hh"
|
||||
#include "dev/hsa/hsa_packet.hh"
|
||||
#include "dev/hsa/hw_scheduler.hh"
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
#include "mem/packet_access.hh"
|
||||
#include "mem/page_table.hh"
|
||||
#include "sim/process.hh"
|
||||
@@ -330,14 +330,24 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
|
||||
DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
|
||||
" active list ID = %d\n", __FUNCTION__, rl_idx);
|
||||
// Submit packet to HSA device (dispatcher)
|
||||
hsa_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
|
||||
gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
|
||||
is_submitted = UNBLOCKED;
|
||||
} else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
|
||||
" active list ID = %d\n", __FUNCTION__, rl_idx);
|
||||
// Submit packet to HSA device (dispatcher)
|
||||
hsa_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
|
||||
gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
|
||||
is_submitted = UNBLOCKED;
|
||||
/*
|
||||
If this packet is using the "barrier bit" to enforce ordering with
|
||||
subsequent kernels, set the bit for this queue now, after
|
||||
dispatching.
|
||||
*/
|
||||
if (IS_BARRIER(disp_pkt)) {
|
||||
DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
|
||||
" list ID = %d\n", __FUNCTION__, rl_idx);
|
||||
regdQList[rl_idx]->setBarrierBit(true);
|
||||
}
|
||||
} else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
|
||||
DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
|
||||
" active list ID = %d\n", __FUNCTION__, rl_idx);
|
||||
@@ -404,14 +414,14 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
|
||||
// I'm going to cheat here and read out
|
||||
// the value from main memory using functional
|
||||
// access, and then just DMA the decremented value.
|
||||
uint64_t signal_value = hsa_device->functionalReadHsaSignal(\
|
||||
uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
|
||||
bar_and_pkt->completion_signal);
|
||||
|
||||
DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
|
||||
" completion signal! Addr: %x\n",
|
||||
bar_and_pkt->completion_signal);
|
||||
|
||||
hsa_device->updateHsaSignal(bar_and_pkt->completion_signal,
|
||||
gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
|
||||
signal_value - 1);
|
||||
}
|
||||
}
|
||||
@@ -428,7 +438,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
|
||||
DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
|
||||
" active list ID = %d\n", __FUNCTION__, rl_idx);
|
||||
// Submit packet to HSA device (dispatcher)
|
||||
hsa_device->submitAgentDispatchPkt(
|
||||
gpu_device->submitAgentDispatchPkt(
|
||||
(void *)disp_pkt, rl_idx, host_pkt_addr);
|
||||
is_submitted = UNBLOCKED;
|
||||
sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
|
||||
@@ -633,9 +643,9 @@ AQLRingBuffer::freeEntry(void *pkt)
|
||||
}
|
||||
|
||||
void
|
||||
HSAPacketProcessor::setDevice(HSADevice *dev)
|
||||
HSAPacketProcessor::setDevice(GPUCommandProcessor *dev)
|
||||
{
|
||||
this->hsa_device = dev;
|
||||
this->gpu_device = dev;
|
||||
}
|
||||
|
||||
int
|
||||
@@ -670,15 +680,13 @@ HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
|
||||
DPRINTF(HSAPacketProcessor,
|
||||
"Unset barrier bit for active list ID %d\n", rl_idx);
|
||||
regdQList[rl_idx]->setBarrierBit(false);
|
||||
panic_if(!regdQList[rl_idx]->dispPending(),
|
||||
"There should be pending kernels in this queue\n");
|
||||
DPRINTF(HSAPacketProcessor,
|
||||
"Rescheduling active list ID %d after unsetting barrier "
|
||||
"bit\n", rl_idx);
|
||||
// Try to schedule wakeup in the next cycle. There is a minimum
|
||||
// pktProcessDelay for queue wake up. If that processing delay is
|
||||
// elapsed, schedAQLProcessing will wakeup next tick.
|
||||
schedAQLProcessing(rl_idx, 1);
|
||||
// if pending kernels in the queue after this kernel, reschedule
|
||||
if (regdQList[rl_idx]->dispPending()) {
|
||||
DPRINTF(HSAPacketProcessor,
|
||||
"Rescheduling active list ID %d after unsetting barrier "
|
||||
"bit\n", rl_idx);
|
||||
schedAQLProcessing(rl_idx);
|
||||
}
|
||||
}
|
||||
|
||||
// If set, then blocked schedule, so need to reschedule
|
||||
|
||||
@@ -66,7 +66,7 @@ typedef enum
|
||||
// barrier packet completes.
|
||||
} Q_STATE;
|
||||
|
||||
class HSADevice;
|
||||
class GPUCommandProcessor;
|
||||
class HWScheduler;
|
||||
|
||||
// Our internal representation of an HSA queue
|
||||
@@ -120,7 +120,7 @@ class HSAQueueDescriptor
|
||||
* FREE: Entry is empty
|
||||
* ALLOCATED: Entry has been allocated for a packet, but the DMA has not
|
||||
* yet completed
|
||||
* SUBMITTED: Packet has been submitted to the HSADevice, but has not
|
||||
* SUBMITTED: Packet has been submitted to the GPUCommandProcessor, but has not
|
||||
* yet completed
|
||||
*/
|
||||
class AQLRingBuffer
|
||||
@@ -224,7 +224,7 @@ class HSAPacketProcessor: public DmaDevice
|
||||
friend class HWScheduler;
|
||||
protected:
|
||||
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
|
||||
HSADevice *hsa_device;
|
||||
GPUCommandProcessor *gpu_device;
|
||||
HWScheduler *hwSchdlr;
|
||||
|
||||
// Structure to store the read values of dependency signals
|
||||
@@ -333,7 +333,7 @@ class HSAPacketProcessor: public DmaDevice
|
||||
uint64_t queue_id,
|
||||
uint32_t size);
|
||||
void unsetDeviceQueueDesc(uint64_t queue_id);
|
||||
void setDevice(HSADevice * dev);
|
||||
void setDevice(GPUCommandProcessor * dev);
|
||||
void updateReadIndex(int, uint32_t);
|
||||
void getCommandsFromHost(int pid, uint32_t rl_idx);
|
||||
|
||||
|
||||
@@ -37,8 +37,6 @@ from m5.SimObject import SimObject
|
||||
from m5.objects.Bridge import Bridge
|
||||
from m5.objects.ClockedObject import ClockedObject
|
||||
from m5.objects.Device import DmaDevice
|
||||
from m5.objects.HSADevice import HSADevice
|
||||
from m5.objects.HSADriver import HSADriver
|
||||
from m5.objects.LdsState import LdsState
|
||||
from m5.objects.Process import EmulatedDriver
|
||||
|
||||
@@ -239,9 +237,10 @@ class Shader(ClockedObject):
|
||||
idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
|
||||
max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
|
||||
|
||||
class GPUComputeDriver(HSADriver):
|
||||
class GPUComputeDriver(EmulatedDriver):
|
||||
type = 'GPUComputeDriver'
|
||||
cxx_header = 'gpu-compute/gpu_compute_driver.hh'
|
||||
device = Param.GPUCommandProcessor('GPU controlled by this driver')
|
||||
isdGPU = Param.Bool(False, 'Driver is for a dGPU')
|
||||
gfxVersion = Param.GfxVersion('gfx801', 'ISA of gpu to model')
|
||||
dGPUPoolID = Param.Int(False, 'Pool ID for dGPU.')
|
||||
@@ -259,11 +258,13 @@ class GPUDispatcher(SimObject):
|
||||
type = 'GPUDispatcher'
|
||||
cxx_header = 'gpu-compute/dispatcher.hh'
|
||||
|
||||
class GPUCommandProcessor(HSADevice):
|
||||
class GPUCommandProcessor(DmaDevice):
|
||||
type = 'GPUCommandProcessor'
|
||||
cxx_header = 'gpu-compute/gpu_command_processor.hh'
|
||||
dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')
|
||||
|
||||
hsapp = Param.HSAPacketProcessor('PP attached to this device')
|
||||
|
||||
class StorageClassType(Enum): vals = [
|
||||
'SC_SPILL',
|
||||
'SC_GLOBAL',
|
||||
|
||||
@@ -33,6 +33,9 @@
|
||||
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "base/chunk_generator.hh"
|
||||
#include "debug/GPUCommandProc.hh"
|
||||
#include "debug/GPUKernelInfo.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
@@ -42,11 +45,75 @@
|
||||
#include "sim/syscall_emul_buf.hh"
|
||||
|
||||
GPUCommandProcessor::GPUCommandProcessor(const Params &p)
|
||||
: HSADevice(p), dispatcher(*p.dispatcher), _driver(nullptr)
|
||||
: DmaDevice(p), dispatcher(*p.dispatcher), _driver(nullptr), hsaPP(p.hsapp)
|
||||
{
|
||||
assert(hsaPP);
|
||||
hsaPP->setDevice(this);
|
||||
dispatcher.setCommandProcessor(this);
|
||||
}
|
||||
|
||||
/**
 * Accessor for the HSA packet processor attached to this command
 * processor.
 *
 * @return a reference to the object that hsaPP points to.
 */
HSAPacketProcessor&
GPUCommandProcessor::hsaPacketProc()
{
    HSAPacketProcessor &packet_proc = *hsaPP;
    return packet_proc;
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::dmaReadVirt(Addr host_addr, unsigned size,
|
||||
DmaCallback *cb, void *data, Tick delay)
|
||||
{
|
||||
dmaVirt(&DmaDevice::dmaRead, host_addr, size, cb, data, delay);
|
||||
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::dmaWriteVirt(Addr host_addr, unsigned size,
|
||||
DmaCallback *cb, void *data, Tick delay)
|
||||
{
|
||||
dmaVirt(&DmaDevice::dmaWrite, host_addr, size, cb, data, delay);
|
||||
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::dmaVirt(DmaFnPtr dmaFn, Addr addr, unsigned size,
|
||||
DmaCallback *cb, void *data, Tick delay)
|
||||
{
|
||||
if (size == 0) {
|
||||
if (cb)
|
||||
schedule(cb->getChunkEvent(), curTick() + delay);
|
||||
return;
|
||||
}
|
||||
|
||||
// move the buffer data pointer with the chunks
|
||||
uint8_t *loc_data = (uint8_t*)data;
|
||||
|
||||
for (ChunkGenerator gen(addr, size, PAGE_SIZE); !gen.done(); gen.next()) {
|
||||
Addr phys;
|
||||
|
||||
// translate pages into their corresponding frames
|
||||
translateOrDie(gen.addr(), phys);
|
||||
|
||||
Event *event = cb ? cb->getChunkEvent() : nullptr;
|
||||
|
||||
(this->*dmaFn)(phys, gen.size(), event, loc_data, delay);
|
||||
|
||||
loc_data += gen.size();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::translateOrDie(Addr vaddr, Addr &paddr)
|
||||
{
|
||||
/**
|
||||
* Grab the process and try to translate the virtual address with it;
|
||||
* with new extensions, it will likely be wrong to just arbitrarily
|
||||
* grab context zero.
|
||||
*/
|
||||
auto process = sys->threads[0]->getProcessPtr();
|
||||
|
||||
if (!process->pTable->translate(vaddr, paddr)) {
|
||||
fatal("failed translation: vaddr 0x%x\n", vaddr);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* submitDispatchPkt() is the entry point into the CP from the HSAPP
|
||||
* and is only meant to be used with AQL kernel dispatch packets.
|
||||
@@ -192,12 +259,12 @@ GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
|
||||
}
|
||||
|
||||
void
|
||||
GPUCommandProcessor::attachDriver(HSADriver *hsa_driver)
|
||||
GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
|
||||
{
|
||||
fatal_if(_driver, "Should not overwrite driver.");
|
||||
// TODO: GPU Driver inheritance hierarchy doesn't really make sense.
|
||||
// Should get rid of the base class.
|
||||
_driver = dynamic_cast<GPUComputeDriver *>(hsa_driver);
|
||||
_driver = gpu_driver;
|
||||
assert(_driver);
|
||||
}
|
||||
|
||||
|
||||
@@ -45,17 +45,27 @@
|
||||
#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
|
||||
#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
#include "base/logging.hh"
|
||||
#include "base/trace.hh"
|
||||
#include "base/types.hh"
|
||||
#include "debug/GPUCommandProc.hh"
|
||||
#include "dev/hsa/hsa_device.hh"
|
||||
#include "dev/dma_device.hh"
|
||||
#include "dev/hsa/hsa_packet_processor.hh"
|
||||
#include "dev/hsa/hsa_signal.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
#include "gpu-compute/gpu_compute_driver.hh"
|
||||
#include "gpu-compute/hsa_queue_entry.hh"
|
||||
#include "params/GPUCommandProcessor.hh"
|
||||
|
||||
struct GPUCommandProcessorParams;
|
||||
class GPUComputeDriver;
|
||||
class GPUDispatcher;
|
||||
class Shader;
|
||||
|
||||
class GPUCommandProcessor : public HSADevice
|
||||
class GPUCommandProcessor : public DmaDevice
|
||||
{
|
||||
public:
|
||||
typedef GPUCommandProcessorParams Params;
|
||||
@@ -64,6 +74,13 @@ class GPUCommandProcessor : public HSADevice
|
||||
GPUCommandProcessor() = delete;
|
||||
GPUCommandProcessor(const Params &p);
|
||||
|
||||
HSAPacketProcessor& hsaPacketProc();
|
||||
|
||||
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb,
|
||||
void *data, Tick delay = 0);
|
||||
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b,
|
||||
void *data, Tick delay = 0);
|
||||
|
||||
void setShader(Shader *shader);
|
||||
Shader* shader();
|
||||
GPUComputeDriver* driver();
|
||||
@@ -75,12 +92,13 @@ class GPUCommandProcessor : public HSADevice
|
||||
};
|
||||
|
||||
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr) override;
|
||||
Addr host_pkt_addr);
|
||||
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr) override;
|
||||
Addr host_pkt_addr);
|
||||
void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr) override;
|
||||
void attachDriver(HSADriver *driver) override;
|
||||
Addr host_pkt_addr);
|
||||
void attachDriver(GPUComputeDriver *driver);
|
||||
|
||||
void dispatchPkt(HSAQueueEntry *task);
|
||||
void signalWakeupEvent(uint32_t event_id);
|
||||
|
||||
@@ -91,9 +109,9 @@ class GPUCommandProcessor : public HSADevice
|
||||
|
||||
void updateHsaSignal(Addr signal_handle, uint64_t signal_value,
|
||||
HsaSignalCallbackFunction function =
|
||||
[] (const uint64_t &) { }) override;
|
||||
[] (const uint64_t &) { });
|
||||
|
||||
uint64_t functionalReadHsaSignal(Addr signal_handle) override;
|
||||
uint64_t functionalReadHsaSignal(Addr signal_handle);
|
||||
|
||||
Addr getHsaSignalValueAddr(Addr signal_handle)
|
||||
{
|
||||
@@ -115,8 +133,13 @@ class GPUCommandProcessor : public HSADevice
|
||||
GPUDispatcher &dispatcher;
|
||||
GPUComputeDriver *_driver;
|
||||
|
||||
// Typedefing dmaRead and dmaWrite function pointer
|
||||
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
|
||||
void initABI(HSAQueueEntry *task);
|
||||
|
||||
HSAPacketProcessor *hsaPP;
|
||||
void dmaVirt(DmaFnPtr, Addr host_addr, unsigned size, DmaCallback *cb,
|
||||
void *data, Tick delay = 0);
|
||||
void translateOrDie(Addr vaddr, Addr &paddr);
|
||||
|
||||
/**
|
||||
* Wraps a std::function object in a DmaCallback. Much cleaner than
|
||||
|
||||
@@ -33,22 +33,26 @@
|
||||
|
||||
#include "gpu-compute/gpu_compute_driver.hh"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "base/logging.hh"
|
||||
#include "base/trace.hh"
|
||||
#include "cpu/thread_context.hh"
|
||||
#include "debug/GPUDriver.hh"
|
||||
#include "debug/GPUShader.hh"
|
||||
#include "dev/hsa/hsa_device.hh"
|
||||
#include "dev/hsa/hsa_packet_processor.hh"
|
||||
#include "dev/hsa/kfd_event_defines.h"
|
||||
#include "dev/hsa/kfd_ioctl.h"
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "mem/port_proxy.hh"
|
||||
#include "params/GPUComputeDriver.hh"
|
||||
#include "sim/process.hh"
|
||||
#include "sim/syscall_emul_buf.hh"
|
||||
|
||||
GPUComputeDriver::GPUComputeDriver(const Params &p)
|
||||
: HSADriver(p), isdGPU(p.isdGPU), gfxVersion(p.gfxVersion),
|
||||
dGPUPoolID(p.dGPUPoolID)
|
||||
: EmulatedDriver(p), device(p.device), queueId(0),
|
||||
isdGPU(p.isdGPU), gfxVersion(p.gfxVersion), dGPUPoolID(p.dGPUPoolID)
|
||||
{
|
||||
device->attachDriver(this);
|
||||
DPRINTF(GPUDriver, "Constructing KFD: device\n");
|
||||
@@ -65,6 +69,146 @@ GPUComputeDriver::GPUComputeDriver(const Params &p)
|
||||
defaultMtype.set(Request::CACHED);
|
||||
}
|
||||
|
||||
const char*
|
||||
GPUComputeDriver::DriverWakeupEvent::description() const
|
||||
{
|
||||
return "DriverWakeupEvent";
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an FD entry for the KFD inside of the owning process.
|
||||
*/
|
||||
int
|
||||
GPUComputeDriver::open(ThreadContext *tc, int mode, int flags)
|
||||
{
|
||||
DPRINTF(GPUDriver, "Opened %s\n", filename);
|
||||
auto process = tc->getProcessPtr();
|
||||
auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
|
||||
int tgt_fd = process->fds->allocFD(device_fd_entry);
|
||||
return tgt_fd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Currently, mmap() will simply setup a mapping for the associated
|
||||
* device's packet processor's doorbells and creates the event page.
|
||||
*/
|
||||
Addr
|
||||
GPUComputeDriver::mmap(ThreadContext *tc, Addr start, uint64_t length,
|
||||
int prot, int tgt_flags, int tgt_fd, off_t offset)
|
||||
{
|
||||
auto process = tc->getProcessPtr();
|
||||
auto mem_state = process->memState;
|
||||
|
||||
Addr pg_off = offset >> PAGE_SHIFT;
|
||||
Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
|
||||
DPRINTF(GPUDriver, "amdkfd mmap (start: %p, length: 0x%x,"
|
||||
"offset: 0x%x)\n", start, length, offset);
|
||||
|
||||
switch(mmap_type) {
|
||||
case KFD_MMAP_TYPE_DOORBELL:
|
||||
DPRINTF(GPUDriver, "amdkfd mmap type DOORBELL offset\n");
|
||||
start = mem_state->extendMmap(length);
|
||||
process->pTable->map(start, device->hsaPacketProc().pioAddr,
|
||||
length, false);
|
||||
break;
|
||||
case KFD_MMAP_TYPE_EVENTS:
|
||||
DPRINTF(GPUDriver, "amdkfd mmap type EVENTS offset\n");
|
||||
panic_if(start != 0,
|
||||
"Start address should be provided by KFD\n");
|
||||
panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
|
||||
"Requested length %d, expected length %d; length "
|
||||
"mismatch\n", length, 8* KFD_SIGNAL_EVENT_LIMIT);
|
||||
/**
|
||||
* We don't actually access these pages. We just need to reserve
|
||||
* some VA space. See commit id 5ce8abce for details on how
|
||||
* events are currently implemented.
|
||||
*/
|
||||
if (!eventPage) {
|
||||
eventPage = mem_state->extendMmap(length);
|
||||
start = eventPage;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
|
||||
break;
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Forward relevant parameters to packet processor; queueId
|
||||
* is used to link doorbell. The queueIDs are not re-used
|
||||
* in current implementation, and we allocate only one page
|
||||
* (4096 bytes) for doorbells, so check if this queueID can
|
||||
* be mapped into that page.
|
||||
*/
|
||||
void
|
||||
GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
|
||||
{
|
||||
TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
|
||||
args.copyIn(mem_proxy);
|
||||
|
||||
if ((sizeof(uint32_t) * queueId) > 4096) {
|
||||
fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
|
||||
}
|
||||
|
||||
args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
|
||||
KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;
|
||||
|
||||
args->queue_id = queueId++;
|
||||
auto &hsa_pp = device->hsaPacketProc();
|
||||
hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
|
||||
args->ring_base_address, args->queue_id,
|
||||
args->ring_size);
|
||||
args.copyOut(mem_proxy);
|
||||
}
|
||||
|
||||
void
|
||||
GPUComputeDriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
|
||||
{
|
||||
assert(driver);
|
||||
driver->schedule(this, curTick() + wakeup_delay);
|
||||
}
|
||||
|
||||
void
|
||||
GPUComputeDriver::signalWakeupEvent(uint32_t event_id)
|
||||
{
|
||||
panic_if(event_id >= eventSlotIndex,
|
||||
"Trying wakeup on an event that is not yet created\n");
|
||||
if (ETable[event_id].threadWaiting) {
|
||||
panic_if(!ETable[event_id].tc,
|
||||
"No thread context to wake up\n");
|
||||
ThreadContext *tc = ETable[event_id].tc;
|
||||
DPRINTF(GPUDriver,
|
||||
"Signal event: Waking up CPU %d\n", tc->cpuId());
|
||||
// Remove events that can wakeup this thread
|
||||
TCEvents[tc].clearEvents();
|
||||
// Now wakeup this thread
|
||||
tc->activate();
|
||||
} else {
|
||||
// This may be a race condition between an ioctl call asking to wait on
|
||||
// this event and this signalWakeupEvent. Taking care of this race
|
||||
// condition here by setting the event here. The ioctl call should take
|
||||
// the necessary action when waiting on an already set event. However,
|
||||
// this may be a genuine instance in which the runtime has decided not
|
||||
// to wait on this event. But since we cannot distinguish this case with
|
||||
// the race condition, we are any way setting the event.
|
||||
ETable[event_id].setEvent = true;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GPUComputeDriver::DriverWakeupEvent::process()
|
||||
{
|
||||
DPRINTF(GPUDriver,
|
||||
"Timer event: Waking up CPU %d\n", tc->cpuId());
|
||||
// Remove events that can wakeup this thread
|
||||
driver->TCEvents[tc].clearEvents();
|
||||
// Now wakeup this thread
|
||||
tc->activate();
|
||||
}
|
||||
|
||||
int
|
||||
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
|
||||
{
|
||||
@@ -88,7 +232,7 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
|
||||
{
|
||||
DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");
|
||||
|
||||
allocateQueue(tc, ioc_buf);
|
||||
allocateQueue(virt_proxy, ioc_buf);
|
||||
|
||||
DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
|
||||
}
|
||||
|
||||
@@ -42,19 +42,33 @@
|
||||
#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
|
||||
#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "base/addr_range_map.hh"
|
||||
#include "dev/hsa/hsa_driver.hh"
|
||||
#include "base/types.hh"
|
||||
#include "enums/GfxVersion.hh"
|
||||
#include "mem/request.hh"
|
||||
#include "sim/emul_driver.hh"
|
||||
|
||||
struct GPUComputeDriverParams;
|
||||
class GPUCommandProcessor;
|
||||
class PortProxy;
|
||||
class ThreadContext;
|
||||
|
||||
class GPUComputeDriver final : public HSADriver
|
||||
class GPUComputeDriver final : public EmulatedDriver
|
||||
{
|
||||
public:
|
||||
typedef GPUComputeDriverParams Params;
|
||||
GPUComputeDriver(const Params &p);
|
||||
int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;
|
||||
|
||||
int open(ThreadContext *tc, int mode, int flags);
|
||||
Addr mmap(ThreadContext *tc, Addr start, uint64_t length,
|
||||
int prot, int tgt_flags, int tgt_fd, off_t offset);
|
||||
virtual void signalWakeupEvent(uint32_t event_id);
|
||||
void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout);
|
||||
/**
|
||||
* Called by the compute units right before a request is issued to ruby.
|
||||
@@ -67,10 +81,62 @@ class GPUComputeDriver final : public HSADriver
|
||||
*/
|
||||
void setMtype(RequestPtr req);
|
||||
|
||||
class DriverWakeupEvent : public Event
|
||||
{
|
||||
public:
|
||||
DriverWakeupEvent(GPUComputeDriver *gpu_driver,
|
||||
ThreadContext *thrd_cntxt)
|
||||
: driver(gpu_driver), tc(thrd_cntxt) {}
|
||||
void process() override;
|
||||
const char *description() const override;
|
||||
void scheduleWakeup(Tick wakeup_delay);
|
||||
private:
|
||||
GPUComputeDriver *driver;
|
||||
ThreadContext *tc;
|
||||
};
|
||||
|
||||
class EventTableEntry
|
||||
{
|
||||
public:
|
||||
EventTableEntry() :
|
||||
mailBoxPtr(0), tc(nullptr), threadWaiting(false), setEvent(false)
|
||||
{}
|
||||
// Mail box pointer for this address. Current implementation does not
|
||||
// use this mailBoxPtr to notify events but directly calls
|
||||
// signalWakeupEvent from dispatcher (GPU) to notifiy events. So,
|
||||
// currently this mailBoxPtr is not used. But a future implementation
|
||||
// may communicate to the driver using mailBoxPtr.
|
||||
Addr mailBoxPtr;
|
||||
// Thread context waiting on this even. We do not support multiple
|
||||
// threads waiting on an event currently.
|
||||
ThreadContext *tc;
|
||||
// threadWaiting = true, if some thread context is waiting on this
|
||||
// event. A thread context waiting on this event is put to sleep.
|
||||
bool threadWaiting;
|
||||
// setEvent = true, if this event is triggered but when this event
|
||||
// triggered, no thread context was waiting on it. In the future, some
|
||||
// thread context will try to wait on this event but since event has
|
||||
// already happened, we will not allow that thread context to go to
|
||||
// sleep. The above mentioned scneario can happen when the waiting
|
||||
// thread and wakeup thread race on this event and the wakeup thread
|
||||
// beat the waiting thread at the driver.
|
||||
bool setEvent;
|
||||
};
|
||||
typedef class EventTableEntry ETEntry;
|
||||
|
||||
private:
|
||||
/**
|
||||
* GPU that is controlled by this driver.
|
||||
*/
|
||||
GPUCommandProcessor *device;
|
||||
uint32_t queueId;
|
||||
bool isdGPU;
|
||||
GfxVersion gfxVersion;
|
||||
int dGPUPoolID;
|
||||
Addr eventPage;
|
||||
uint32_t eventSlotIndex;
|
||||
//Event table that keeps track of events. It is indexed with event ID.
|
||||
std::unordered_map<uint32_t, ETEntry> ETable;
|
||||
|
||||
/**
|
||||
* VMA structures for GPUVM memory.
|
||||
@@ -89,6 +155,37 @@ class GPUComputeDriver final : public HSADriver
|
||||
|
||||
Request::CacheCoherenceFlags defaultMtype;
|
||||
|
||||
// TCEvents map keeps trak of the events that can wakeup this thread. When
|
||||
// multiple events can wake up this thread, this data structure helps to
|
||||
// reset all events when one of those events wake up this thread. the
|
||||
// signal events that can wake up this thread are stored in signalEvents
|
||||
// whereas the timer wakeup event is stored in timerEvent.
|
||||
class EventList
|
||||
{
|
||||
public:
|
||||
EventList() : driver(nullptr), timerEvent(nullptr, nullptr) {}
|
||||
EventList(GPUComputeDriver *gpu_driver, ThreadContext *thrd_cntxt)
|
||||
: driver(gpu_driver), timerEvent(gpu_driver, thrd_cntxt)
|
||||
{ }
|
||||
void clearEvents() {
|
||||
assert(driver);
|
||||
for (auto event : signalEvents) {
|
||||
assert(event < driver->eventSlotIndex);
|
||||
driver->ETable[event].tc = nullptr;
|
||||
driver->ETable[event].threadWaiting = false;
|
||||
}
|
||||
signalEvents.clear();
|
||||
if (timerEvent.scheduled()) {
|
||||
driver->deschedule(timerEvent);
|
||||
}
|
||||
}
|
||||
GPUComputeDriver *driver;
|
||||
DriverWakeupEvent timerEvent;
|
||||
// The set of events that can wake up the same thread.
|
||||
std::set<uint32_t> signalEvents;
|
||||
};
|
||||
std::unordered_map<ThreadContext *, EventList> TCEvents;
|
||||
|
||||
/**
|
||||
* Register a region of host memory as uncacheable from the perspective
|
||||
* of the dGPU.
|
||||
@@ -126,6 +223,9 @@ class GPUComputeDriver final : public HSADriver
|
||||
void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start,
|
||||
Addr length);
|
||||
Addr deallocateGpuVma(Addr start);
|
||||
|
||||
void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr);
|
||||
|
||||
};
|
||||
|
||||
#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
|
||||
|
||||
Reference in New Issue
Block a user