Files
gem5/src/dev/hsa/hsa_packet_processor.cc
Matthew Poremba 897c0c11ed dev,dev-hsa,gpu-compute: Refactor dmaVirt calls
Remove the duplicate dmaVirt calls from HSA packet processor and GPU
command processor and move them into their own class. This removes some
duplicate code and allows a DmaVirtDevice to be created which will be
useful for upcoming full system GPU commits.

The DmaVirtDevice is an abstraction of the base DmaDevice but iterates
using ChunkGenerator over virtual addresses. Classes which inherit from
DmaVirtDevice must provide a translation function to translate from
virtual address to physical address. Once translated, the physical
address is passed to DmaDevice to do the work.

Change-Id: Idd59ccb4d9ba21c0b1150ee328ededf5a88d824e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47179
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2021-07-09 22:40:18 +00:00

725 lines
28 KiB
C++

/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "dev/hsa/hsa_packet_processor.hh"
#include <cassert>
#include <cstring>
#include "base/chunk_generator.hh"
#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_device.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hw_scheduler.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "mem/packet_access.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/system.hh"
// Expands to the out-of-line definition of
// HSAPacketProcessor::XEVENT::description() const, which returns the
// stringified name of the event class (XEVENT) as a C string.
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
const char* \
HSAPacketProcessor::XEVENT::description() const \
{ \
return #XEVENT; \
}
// Extracts the packet-type field from the header of the AQL packet that PKT
// points to: the header is shifted right by HSA_PACKET_HEADER_TYPE, masked,
// and the result is cast to hsa_packet_type_t. PKT must be a pointer to a
// packet struct with a `header` member.
// NOTE(review): the mask used is (HSA_PACKET_HEADER_WIDTH_TYPE - 1). If the
// WIDTH constant is a bit count, a full-field mask would be
// ((1 << WIDTH) - 1) -- confirm against the HSA header definitions.
#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
HSA_PACKET_HEADER_TYPE) & (HSA_PACKET_HEADER_WIDTH_TYPE - 1)))
// Checks whether the barrier bit is set in the header of the AQL packet that
// PKT points to: shifts the barrier bit down to the LSB, then bitwise-ANDs
// to mask off all other bits. The result is non-zero when the bit is set.
// NOTE(review): the mask is HSA_PACKET_HEADER_WIDTH_BARRIER itself, which is
// only a valid single-bit mask if that constant equals 1 -- confirm against
// the HSA header definitions.
#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
HSA_PACKET_HEADER_BARRIER) & HSA_PACKET_HEADER_WIDTH_BARRIER))
namespace gem5
{
// Defines HSAPacketProcessor::QueueProcessEvent::description().
HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
/**
 * Construct the packet processor from its parameters.
 *
 * Initializes the PIO window (base address from the params, size of one
 * page) and the processing delays, creates the hardware scheduler, and
 * allocates one registered-queue-list entry per hardware queue.
 *
 * @param p Configuration parameters for this device.
 */
HSAPacketProcessor::HSAPacketProcessor(const Params &p)
    : DmaVirtDevice(p), numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
      pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p.pktProcessDelay)
{
    DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);

    hwSchdlr = new HWScheduler(this, p.wakeupDelay);

    // Populate every slot with an entry that knows its own index.
    regdQList.resize(numHWQueues);
    int slot = 0;
    for (auto &entry : regdQList) {
        entry = new RQLEntry(this, slot++);
    }
}
/**
 * Release everything the constructor allocated: the per-hardware-queue
 * entries in regdQList and the hardware scheduler.
 */
HSAPacketProcessor::~HSAPacketProcessor()
{
    for (auto &queue : regdQList) {
        delete queue;
    }
    // The scheduler is created with `new` in the constructor and owned by
    // this object, so it has to be freed here as well.
    delete hwSchdlr;
}
/**
 * Unregister a queue by forwarding the request to the hardware scheduler.
 *
 * @param queue_id     Identifier of the queue to unregister.
 * @param doorbellSize Doorbell size, passed through to the scheduler.
 */
void
HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id,
                                         int doorbellSize)
{
    hwSchdlr->unregisterQueue(queue_id, doorbellSize);
}
/**
 * Register a new queue by forwarding its description to the hardware
 * scheduler.
 *
 * @param hostReadIndexPointer Address of the host-visible read index.
 * @param basePointer          Base address of the queue.
 * @param queue_id             Identifier of the queue.
 * @param size                 Queue size, passed through to the scheduler.
 * @param doorbellSize         Doorbell size, passed through to the
 *                             scheduler.
 */
void
HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                                       uint64_t basePointer,
                                       uint64_t queue_id,
                                       uint32_t size, int doorbellSize)
{
    // The trace label for the last field used to read "ze"; it prints the
    // queue size.
    DPRINTF(HSAPacketProcessor,
            "%s:base = %p, qID = %d, size = %d\n", __FUNCTION__,
            (void *)basePointer, queue_id, size);
    hwSchdlr->registerNewQueue(hostReadIndexPointer,
                               basePointer, queue_id, size, doorbellSize);
}
/**
 * Report the address ranges this device responds to.
 *
 * @return A list holding the single PIO range [pioAddr, pioAddr + pioSize).
 */
AddrRangeList
HSAPacketProcessor::getAddrRanges() const
{
    assert(pioSize != 0);

    AddrRangeList pio_ranges;
    pio_ranges.push_back(RangeSize(pioAddr, pioSize));
    return pio_ranges;
}
/**
 * Handle a PIO write. In practice this only processes writes to the queue
 * doorbell registers: the written value is decoded and handed to the
 * hardware scheduler together with the register offset.
 *
 * @param pkt The write packet; its address must lie inside the PIO window
 *            and its size must equal the driver's doorbell size.
 * @return The PIO access latency.
 */
Tick
HSAPacketProcessor::write(Packet *pkt)
{
    assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);

    // TODO: How to get pid??
    GEM5_VAR_USED Addr daddr = pkt->getAddr() - pioAddr;

    DPRINTF(HSAPacketProcessor,
            "%s: write of size %d to reg-offset %d (0x%x)\n",
            __FUNCTION__, pkt->getSize(), daddr, daddr);

    assert(gpu_device->driver()->doorbellSize() == pkt->getSize());

    uint64_t doorbell_reg(0);
    switch (pkt->getSize()) {
      case 8:
        // For 8-byte doorbells the value forwarded to the scheduler is
        // the written value plus one.
        doorbell_reg = pkt->getLE<uint64_t>() + 1;
        break;
      case 4:
        doorbell_reg = pkt->getLE<uint32_t>();
        break;
      default:
        fatal("invalid db size");
    }

    DPRINTF(HSAPacketProcessor,
            "%s: write data 0x%x to offset %d (0x%x)\n",
            __FUNCTION__, doorbell_reg, daddr, daddr);

    hwSchdlr->write(daddr, doorbell_reg);
    pkt->makeAtomicResponse();
    return pioDelay;
}
/**
 * Handle a PIO read. No register of this device is readable through this
 * path: the packet is turned into a response and flagged as a bad address.
 *
 * @param pkt The read packet.
 * @return The PIO access latency.
 */
Tick
HSAPacketProcessor::read(Packet *pkt)
{
    pkt->makeAtomicResponse();
    pkt->setBadAddress();

    return pioDelay;
}
/**
 * Translate a virtual address to a physical address using the page table
 * of the process attached to thread context zero; terminates the
 * simulation with fatal() when the translation fails.
 *
 * @param vaddr Virtual address to translate.
 * @param paddr Output: the resulting physical address.
 */
void
HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr)
{
    // Context zero is picked unconditionally. With new extensions it will
    // likely be wrong to just arbitrarily grab that context.
    auto process = sys->threads[0]->getProcessPtr();

    const bool translated = process->pTable->translate(vaddr, paddr);
    if (!translated) {
        fatal("failed translation: vaddr 0x%x\n", vaddr);
    }
}
/**
 * Completion hook for the DMA that updates the read_disp_id field (the
 * read pointer) of the MQD, which is how the host code learns the status
 * of the HQD's read pointer. It currently only emits a trace message.
 */
void
HSAPacketProcessor::updateReadDispIdDma()
{
    DPRINTF(HSAPacketProcessor, "updateReaddispId\n");
}
void
HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
{
AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint32_t &dma_data) { this->updateReadDispIdDma(); }, 0);
DPRINTF(HSAPacketProcessor,
"%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
dmaWriteVirt((Addr)qDesc->hostReadIndexPtr, sizeof(aqlbuf->rdIdx()),
cb, aqlbuf->rdIdxPtr());
DPRINTF(HSAPacketProcessor,
"%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
" q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
if (qDesc->writeIndex != aqlbuf->wrIdx()) {
getCommandsFromHost(pid, rl_idx);
}
}
/**
 * Completion handler for one DMA chunk of a series that copies AQL packets
 * from the host queue into the local ring buffer.
 *
 * For read DMAs, the number of packets still outstanding in the series is
 * reduced by num_pkts. When it reaches zero the queue's DMA is marked as
 * finished, queue processing is scheduled, and the series context is
 * deleted.
 *
 * @param hsaPP       Packet processor that owns the queue.
 * @param pid         Process id (not used here).
 * @param isRead      True when the completed DMA was a read.
 * @param ix_start    Ring-buffer index of the first packet of this chunk.
 * @param num_pkts    Number of packets moved by this chunk.
 * @param series_ctx  Shared bookkeeping for the whole DMA series.
 * @param dest_4debug Destination buffer of the chunk (not used here).
 */
void
HSAPacketProcessor::cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid,
    bool isRead, uint32_t ix_start, unsigned num_pkts,
    dma_series_ctx *series_ctx, void *dest_4debug)
{
    const uint32_t rl_idx = series_ctx->rl_idx;
    auto *queue = hsaPP->regdQList[rl_idx];
    GEM5_VAR_USED AQLRingBuffer *aqlRingBuffer = queue->qCntxt.aqlBuf;
    HSAQueueDescriptor *qDesc = queue->qCntxt.qDesc;

    DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d,"
            " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
            ix_start, num_pkts, series_ctx->pkts_2_go,
            rl_idx);

    // Only read completions carry bookkeeping.
    if (!isRead) {
        return;
    }

    series_ctx->pkts_2_go -= num_pkts;
    if (series_ctx->pkts_2_go != 0) {
        // More chunks of this series are still in flight.
        return;
    }

    // Mark DMA as completed
    qDesc->dmaInProgress = false;
    DPRINTF(HSAPacketProcessor,
            "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d,"
            " dispIdx %d, active list ID = %d\n",
            __FUNCTION__, aqlRingBuffer->rdIdx(),
            aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);

    // schedule queue wakeup
    hsaPP->schedAQLProcessing(rl_idx);

    // Last chunk of the series: the shared context is no longer needed.
    delete series_ctx;
}
void
HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx, Tick delay)
{
RQLEntry *queue = regdQList[rl_idx];
if (!queue->aqlProcessEvent.scheduled()) {
Tick processingTick = curTick() + delay;
schedule(queue->aqlProcessEvent, processingTick);
DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
processingTick);
} else {
DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
}
}
void
HSAPacketProcessor::schedAQLProcessing(uint32_t rl_idx)
{
schedAQLProcessing(rl_idx, pktProcessDelay);
}
/**
 * Try to process one AQL packet of a queue.
 *
 * Depending on the packet type the packet is submitted to the GPU command
 * processor (vendor-specific, kernel dispatch, agent dispatch) or, for a
 * barrier-AND packet, its dependency signals are read via DMA and the
 * packet is completed once every connected signal has been read as zero.
 * Barrier-OR, invalid and unknown packet types are fatal.
 *
 * @param pkt           Pointer to the packet in the local AQL ring buffer.
 * @param rl_idx        Index of the queue in the registered queue list.
 * @param host_pkt_addr Host address the packet was fetched from.
 * @return UNBLOCKED when the packet was consumed, BLOCKED_BBIT when the
 *         queue is blocked on the barrier bit, BLOCKED_BPKT when the
 *         queue is blocked on a barrier packet (or on pending signal
 *         reads).
 */
Q_STATE
HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
{
    Q_STATE is_submitted = BLOCKED_BPKT;
    SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);

    // Dependency signals are not read yet. And this can only be a retry.
    // The retry logic will schedule the packet processor wakeup
    if (dep_sgnl_rd_st->pendingReads != 0) {
        return BLOCKED_BPKT;
    }

    // `pkt` can be typecasted to any type of AQL packet since they all
    // have header information at offset zero
    auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
    hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);

    if (IS_BARRIER(disp_pkt) &&
        regdQList[rl_idx]->compltnPending() > 0) {
        // If this packet is using the "barrier bit" to enforce ordering with
        // previous packets, and if there are outstanding packets, set the
        // barrier bit for this queue and block the queue.
        DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active"
                " list ID = %d\n", __FUNCTION__, rl_idx);
        regdQList[rl_idx]->setBarrierBit(true);
        return BLOCKED_BBIT;
    }

    if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
        DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt"
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
        gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = UNBLOCKED;
    } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
        DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt"
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
        gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx,
                                      host_pkt_addr);
        is_submitted = UNBLOCKED;
        // If this packet is using the "barrier bit" to enforce ordering
        // with subsequent kernels, set the bit for this queue now, after
        // dispatching.
        if (IS_BARRIER(disp_pkt)) {
            DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active"
                    " list ID = %d\n", __FUNCTION__, rl_idx);
            regdQList[rl_idx]->setBarrierBit(true);
        }
    } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
        DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet"
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
        bool isReady = true;
        // Loop through all the dependency signals to see if this barrier
        // packet is ready.
        for (int i = 0; i < NumSignalsPerBarrier; i++) {
            // dep_signal = zero implies no signal connected
            if (!bar_and_pkt->dep_signal[i]) {
                continue;
            }
            // The signal value is aligned 8 bytes from
            // the actual handle in the runtime
            uint64_t signal_addr =
                (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
            hsa_signal_value_t *signal_val =
                &(dep_sgnl_rd_st->values[i]);
            DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]"
                    " , sig addr %x, value %d active list ID = %d\n",
                    __FUNCTION__, i, signal_addr,
                    *signal_val, rl_idx);
            // A DMA read of the signal is issued in two situations:
            //  - the very first time this barrier packet is encountered
            //    (allRead is false, so nothing has been read yet), and
            //  - on later visits, for every signal whose last-read value
            //    is still non-zero, i.e. the signal is not yet ready.
            if (!dep_sgnl_rd_st->allRead || *signal_val != 0) {
                isReady = false;
                // The callback parameter type matches the
                // DmaVirtCallback<int64_t> payload type; the value itself
                // lands in *signal_val and is not needed here.
                auto cb = new DmaVirtCallback<int64_t>(
                    [dep_sgnl_rd_st](const int64_t &)
                        { dep_sgnl_rd_st->handleReadDMA(); }, 0);
                dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
                            cb, signal_val);
                dep_sgnl_rd_st->pendingReads++;
                DPRINTF(HSAPacketProcessor, "%s: Pending reads %d,"
                        " active list %d\n", __FUNCTION__,
                        dep_sgnl_rd_st->pendingReads, rl_idx);
            }
        }
        if (isReady) {
            assert(dep_sgnl_rd_st->pendingReads == 0);
            DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed"
                    " active list ID = %d\n", __FUNCTION__, rl_idx);
            // TODO: Completion signal of barrier packet to be
            // atomically decremented here
            finishPkt((void*)bar_and_pkt, rl_idx);
            is_submitted = UNBLOCKED;
            // Reset signal values
            dep_sgnl_rd_st->resetSigVals();
            // The completion signal is connected
            if (bar_and_pkt->completion_signal != 0) {
                // HACK: The semantics of the HSA signal is to
                // decrement the current signal value
                // I'm going to cheat here and read out
                // the value from main memory using functional
                // access, and then just DMA the decremented value.
                uint64_t signal_value = gpu_device->functionalReadHsaSignal(
                    bar_and_pkt->completion_signal);
                DPRINTF(HSAPacketProcessor, "Triggering barrier packet"
                        " completion signal! Addr: %x\n",
                        bar_and_pkt->completion_signal);
                gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
                                            signal_value - 1);
            }
        }
        if (dep_sgnl_rd_st->pendingReads > 0) {
            // At least one dependency-signal read DMA was issued this cycle
            dep_sgnl_rd_st->allRead = false;
            dep_sgnl_rd_st->discardRead = false;
        }
    } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
        fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
    } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
        fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
    } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
        DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt"
                " active list ID = %d\n", __FUNCTION__, rl_idx);
        // Submit packet to HSA device (dispatcher)
        gpu_device->submitAgentDispatchPkt(
            (void *)disp_pkt, rl_idx, host_pkt_addr);
        is_submitted = UNBLOCKED;
        sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
    } else {
        fatal("Unsupported packet type %d\n", pkt_type);
    }
    return is_submitted;
}
// Wakes up every fixed time interval (pktProcessDelay) and processes a single
// packet from the queue that scheduled this wakeup. If there are more
// packets in that queue, the next wakeup is scheduled.
void
HSAPacketProcessor::QueueProcessEvent::process()
{
AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
DPRINTF(HSAPacketProcessor,
"%s: Qwakeup , rdIdx %d, wrIdx %d," \
" dispIdx %d, active list ID = %d\n",
__FUNCTION__, aqlRingBuffer->rdIdx(),
aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
// If barrier bit is set, then this wakeup is a dummy wakeup
// just to model the processing time. Do nothing.
if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
DPRINTF(HSAPacketProcessor,
"Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
return;
}
// In the future, we may support batch processing of packets.
// Then, we can just remove the break statements and the code
// will support batch processing. That is why we are using a
// "while loop" here instead on an "if" condition.
while (hsaPP->regdQList[rqIdx]->dispPending()) {
void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
__FUNCTION__, aqlRingBuffer->dispIdx());
Addr host_addr = aqlRingBuffer->hostDispAddr();
Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
if (q_state == UNBLOCKED) {
aqlRingBuffer->incDispIdx(1);
DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
__FUNCTION__, aqlRingBuffer->dispIdx());
if (hsaPP->regdQList[rqIdx]->dispPending()) {
hsaPP->schedAQLProcessing(rqIdx);
}
break;
} else if (q_state == BLOCKED_BPKT) {
// This queue is blocked by barrier packet,
// schedule a processing event
hsaPP->schedAQLProcessing(rqIdx);
break;
} else if (q_state == BLOCKED_BBIT) {
// This queue is blocked by barrier bit, and processing event
// should be scheduled from finishPkt(). However, to elapse
// "pktProcessDelay" processing time, let us schedule a dummy
// wakeup once which will just wakeup and will do nothing.
hsaPP->schedAQLProcessing(rqIdx);
break;
} else {
panic("Unknown queue state\n");
}
}
}
/**
 * Completion handler for one dependency-signal read DMA. Decrements the
 * count of pending reads; when the last one completes, allRead is set and,
 * if the reads were flagged to be discarded, the signal values are reset.
 */
void
HSAPacketProcessor::SignalState::handleReadDMA()
{
    assert(pendingReads > 0);

    pendingReads--;
    if (pendingReads != 0) {
        // Other reads of this batch are still outstanding.
        return;
    }

    allRead = true;
    if (discardRead) {
        resetSigVals();
    }
}
void
HSAPacketProcessor::getCommandsFromHost(int pid, uint32_t rl_idx)
{
HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
DPRINTF(HSAPacketProcessor,
"%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
" doorbell(%d)[0x%x] \n",
__FUNCTION__, qDesc->readIndex,
qDesc->writeIndex, pid, qDesc->doorbellPointer);
if (qDesc->dmaInProgress) {
// we'll try again when this dma transfer completes in updateReadIndex
return;
}
uint32_t num_umq = qDesc->spaceUsed();
if (num_umq == 0)
return; // nothing to be gotten
uint32_t umq_nxt = qDesc->readIndex;
// Total AQL buffer size
uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
// Available AQL buffer size. If the available buffer is less than
// demanded, number of available buffer is returned
uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
qDesc->readIndex += got_aql_buf;
uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
ttl_aql_buf;
dma_series_ctx *series_ctx = NULL;
DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
"dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
ttl_aql_buf, dma_start_ix, num_umq);
if (got_aql_buf == 0) {
// we'll try again when some dma bufs are freed in freeEntry
qDesc->stalledOnDmaBufAvailability = true;
return;
} else {
qDesc->stalledOnDmaBufAvailability = false;
}
uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
while (got_aql_buf != 0 && num_umq != 0) {
uint32_t umq_b4_wrap = qDesc->numObjs() -
(umq_nxt % qDesc->objSize());
uint32_t num_2_xfer
= std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
if (!series_ctx) {
qDesc->dmaInProgress = true;
series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
dma_start_ix, rl_idx);
}
void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint32_t &dma_data)
{ this->cmdQueueCmdDma(this, pid, true, dma_start_ix,
num_2_xfer, series_ctx, aql_buf); }, 0);
dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
cb, aql_buf);
aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
dma_start_ix);
DPRINTF(HSAPacketProcessor,
"%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
__FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
num_umq -= num_2_xfer;
got_aql_buf -= num_2_xfer;
dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
if (got_aql_buf == 0 && num_umq != 0) {
// There are more packets in the queue but
// not enough DMA buffers. Set the stalledOnDmaBufAvailability,
// we will try again in freeEntry
qDesc->stalledOnDmaBufAvailability = true;
}
}
}
/**
 * Emit a trace message with the main fields of a queue descriptor: base
 * pointer, doorbell pointer, write index, read index and numElts.
 *
 * @param pid    Process id, printed as part of the message.
 * @param rl_idx Index of the queue in the registered queue list.
 */
void
HSAPacketProcessor::displayQueueDescriptor(int pid, uint32_t rl_idx)
{
    // Only referenced by the trace statement below.
    GEM5_VAR_USED HSAQueueDescriptor *qDesc =
        regdQList[rl_idx]->qCntxt.qDesc;

    DPRINTF(HSAPacketProcessor,
            "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
            "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
            __FUNCTION__, pid, qDesc->basePointer,
            qDesc->doorbellPointer, qDesc->writeIndex,
            qDesc->readIndex, qDesc->numElts);
}
/**
 * Create a ring buffer with `size` packet slots. All indices start at
 * zero and every slot starts out invalid and not completed.
 *
 * @param size Number of packet slots.
 * @param name Name of this ring buffer.
 */
AQLRingBuffer::AQLRingBuffer(uint32_t size,
                             const std::string name)
    : _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
{
    _aqlBuf.resize(size);
    // `size` completion flags, all cleared.
    _aqlComplete.assign(size, false);
    _hostDispAddresses.resize(size);

    // Mark all packets as invalid
    for (auto &slot : _aqlBuf) {
        slot.header = HSA_PACKET_TYPE_INVALID;
    }
}
/**
 * Mark a packet of the ring buffer as completed and retire every completed
 * packet at the head of the buffer.
 *
 * Packets can complete out-of-order. They are "retired" in-order by
 * advancing the read index over the contiguous run of completed packets;
 * retired slots are marked invalid and their completion flag is cleared.
 *
 * @param pkt Pointer to a packet slot inside this ring buffer.
 * @return True when the read index advanced.
 */
bool
AQLRingBuffer::freeEntry(void *pkt)
{
    // Slot index of the packet, derived from its position in the buffer.
    const auto pkt_ix =
        (hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data();
    // A pointer from outside the buffer would index the completion flags
    // out of bounds.
    assert(pkt_ix >= 0 &&
           static_cast<std::size_t>(pkt_ix) < _aqlBuf.size());

    _aqlComplete[pkt_ix] = true;
    DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "
            " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
            pkt_ix, nFree(), wrIdx(), rdIdx());

    // Keep the full width of the read index for the comparison below.
    const auto old_rdIdx = rdIdx();
    while (_aqlComplete[rdIdx() % numObjs()]) {
        _aqlComplete[rdIdx() % numObjs()] = false;
        _aqlBuf[rdIdx() % numObjs()].header = HSA_PACKET_TYPE_INVALID;
        incRdIdx(1);
    }
    return (old_rdIdx != rdIdx());
}
/**
 * Attach the GPU command processor that packets are submitted to.
 *
 * @param dev The GPU command processor.
 */
void
HSAPacketProcessor::setDevice(GPUCommandProcessor *dev)
{
    gpu_device = dev;
}
/**
 * Reserve up to nBufReq slots of the ring buffer by advancing the write
 * index.
 *
 * @param nBufReq Number of slots requested.
 * @return Number of slots actually reserved: the request clamped to the
 *         number of free slots, or 0 when the buffer has no free slot.
 */
int
AQLRingBuffer::allocEntry(uint32_t nBufReq)
{
    DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);

    const auto free_slots = nFree();
    if (free_slots == 0) {
        DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
        return 0;
    }

    // Never hand out more than what is free.
    if (nBufReq > free_slots) {
        nBufReq = free_slots;
    }

    DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__,
            wrIdx());
    incWrIdx(nBufReq);
    DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
            __FUNCTION__, nBufReq, wrIdx());
    return nBufReq;
}
/**
 * Bookkeeping to run when a packet of a queue has finished.
 *
 * Clears the queue's barrier bit when this was the last outstanding
 * packet (rescheduling queue processing if packets are waiting), frees
 * the packet's ring-buffer slot, publishes the read index to the host
 * when it advanced, and resumes fetching from the host when the queue was
 * stalled on ring-buffer space.
 *
 * @param pvPkt  Pointer to the finished packet in the AQL ring buffer.
 * @param rl_idx Index of the queue in the registered queue list.
 */
void
HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
{
    auto *queue = regdQList[rl_idx];
    HSAQueueDescriptor *qDesc = queue->qCntxt.qDesc;

    // The barrier bit is dropped once the last outstanding packet of the
    // queue is done.
    const bool clear_barrier =
        queue->getBarrierBit() && queue->isLastOutstandingPkt();
    if (clear_barrier) {
        DPRINTF(HSAPacketProcessor,
                "Unset barrier bit for active list ID %d\n", rl_idx);
        queue->setBarrierBit(false);
        // if pending kernels in the queue after this kernel, reschedule
        if (queue->dispPending()) {
            DPRINTF(HSAPacketProcessor,
                    "Rescheduling active list ID %d after unsetting barrier "
                    "bit\n", rl_idx);
            schedAQLProcessing(rl_idx);
        }
    }

    // Free the slot; when that moved the ring buffer's read index, let
    // the host know.
    const bool rd_idx_moved = queue->qCntxt.aqlBuf->freeEntry(pvPkt);
    if (rd_idx_moved) {
        updateReadIndex(0, rl_idx);
    }

    DPRINTF(HSAPacketProcessor,
            "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d,"
            " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
            __FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
            qDesc->spaceUsed(), qDesc->numElts,
            qDesc->stalledOnDmaBufAvailability? "true" : "false",
            qDesc->isEmpty()? "true" : "false", rl_idx);

    // A DMA buffer was freed: check whether fetching from the host was
    // blocked because no DMA buffer was available.
    if (qDesc->stalledOnDmaBufAvailability) {
        assert(!qDesc->isEmpty());
        // TODO: assign correct pid when implementing multi-process support
        getCommandsFromHost(0, rl_idx);
    }
}
void
HSAPacketProcessor::sendAgentDispatchCompletionSignal(
void *pkt, hsa_signal_value_t signal)
{
auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
uint64_t signal_addr =
(uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
" completion signal: %x!\n", signal_addr);
/**
* HACK: The semantics of the HSA signal is to
* decrement the current signal value.
* I'm going to cheat here and read out
* the value from main memory using functional
* access, and then just DMA the decremented value.
* The reason for this is that the DMASequencer does
* not support atomic operations.
*/
VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
(uint64_t)sys->threads[0]->cpuId());
hsa_signal_value_t *new_signal = new hsa_signal_value_t;
*new_signal = (hsa_signal_value_t) *prev_signal - 1;
dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
}
/**
 * Trigger a completion signal by writing the decremented signal value
 * back to memory via DMA.
 *
 * @param signal The signal handle; the signal value is read from and
 *               written to the 64-bit word that follows it.
 */
void
HSAPacketProcessor::sendCompletionSignal(hsa_signal_value_t signal)
{
    // The signal value sits one 64-bit word past the signal handle.
    uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
    DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
            signal_addr);
    /**
     * HACK: The semantics of the HSA signal is to
     * decrement the current signal value.
     * I'm going to cheat here and read out
     * the value from main memory using functional
     * access, and then just DMA the decremented value.
     * The reason for this is that the DMASequencer does
     * not support atomic operations.
     */
    VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);

    hsa_signal_value_t *new_signal = new hsa_signal_value_t;
    *new_signal = (hsa_signal_value_t) *prev_signal - 1;

    // The DMA source buffer has to stay alive until the write is done, so
    // it is released from the DMA completion callback. Issuing the DMA
    // without a callback would leak the buffer.
    auto cb = new DmaVirtCallback<uint64_t>(
        [new_signal](const uint64_t &) { delete new_signal; }, 0);
    dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), cb, new_signal,
                 0);
}
} // namespace gem5