Files
gem5/src/dev/hsa/hsa_packet_processor.hh
Matthew Poremba 63caa780c2 misc: Remove all references to GCN3
Replace instances of "GCN3" with Vega. Remove gfx801 and gfx803. Rename
FIJI to Vega and Carrizo to Raven.

Using misc since there is not enough room to fit all the tags.

Change-Id: Ibafc939d49a69be9068107a906e878408c7a5891
2024-01-17 11:11:06 -06:00

399 lines
14 KiB
C++

/*
* Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__
#define __DEV_HSA_HSA_PACKET_PROCESSOR__
#include <algorithm>
#include <cstdint>
#include <vector>
#include "base/types.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_virt_device.hh"
#include "dev/hsa/hsa.h"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "params/HSAPacketProcessor.hh"
#include "sim/eventq.hh"
#define AQL_PACKET_SIZE 64
#define PAGE_SIZE 4096
#define NUM_DMA_BUFS 16
#define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS)
// HSA runtime supports only 5 signals per barrier packet
#define NumSignalsPerBarrier 5
namespace gem5
{
class AMDGPUDevice;
// Ideally, each queue should store this status and
// the processPkt() should make decisions based on that
// status variable.
enum Q_STATE
{
    // Queue is unblocked and may submit packets freely.
    UNBLOCKED = 0,
    // Queue is blocked by a packet's barrier bit; it may submit
    // packets again once the previous packet completes.
    BLOCKED_BBIT = 1,
    // Queue is blocked by a barrier packet; it may submit
    // packets again once the barrier packet completes.
    BLOCKED_BPKT = 2,
};
class GPUCommandProcessor;
class HWScheduler;
// Our internal representation of an HSA queue: mirrors the queue's
// pointers/indices so the packet processor can fetch AQL packets from
// guest memory. Indices are absolute (monotonically increasing) and are
// reduced modulo numElts when converted to addresses.
class HSAQueueDescriptor
{
  public:
    uint64_t basePointer;       // guest VA of the in-memory AQL ring
    uint64_t doorbellPointer;   // guest VA of this queue's doorbell
    uint64_t writeIndex;        // absolute producer index
    uint64_t readIndex;         // absolute consumer index
    uint32_t numElts;           // queue capacity in AQL packets
    uint64_t hostReadIndexPtr;  // host VA where the read index is published
    // A packet fetch stalled because no DMA buffer was available
    bool stalledOnDmaBufAvailability;
    bool dmaInProgress;         // a DMA for this queue is outstanding
    GfxVersion gfxVersion;

    HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
                       uint64_t hri_ptr, uint32_t size,
                       GfxVersion gfxVersion)
        : basePointer(base_ptr), doorbellPointer(db_ptr),
          writeIndex(0), readIndex(0),
          numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
          stalledOnDmaBufAvailability(false),
          dmaInProgress(false), gfxVersion(gfxVersion)
    { }

    // Observers are const-qualified: none of them mutate queue state.
    uint64_t spaceRemaining() const
    {
        return numElts - (writeIndex - readIndex);
    }
    uint64_t spaceUsed() const { return writeIndex - readIndex; }
    uint32_t objSize() const { return AQL_PACKET_SIZE; }
    uint32_t numObjs() const { return numElts; }
    bool isFull() const { return spaceRemaining() == 0; }
    bool isEmpty() const { return spaceRemaining() == numElts; }

    /**
     * Translate an absolute packet index into the guest VA of that
     * packet's slot in the ring buffer (the index wraps modulo numElts).
     *
     * Based on ROCm Documentation:
     * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
     10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
     rocr/src/core/runtime/amd_aql_queue.cpp#L99
     * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
     10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
     rocr/src/core/runtime/amd_aql_queue.cpp#L624
     */
    uint64_t
    ptr(uint64_t ix) const
    {
        uint64_t retAddr = basePointer + ((ix % numElts) * objSize());
        DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
                "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
                "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
                retAddr);
        return retAddr;
    }
};
/**
 * Internal ring buffer which is used to prefetch/store copies of the
 * in-memory HSA ring buffer. Each packet in the queue has three implicit
 * states tracked by a packet's relative location to the write, read, and
 * dispatch pointers.
 *
 * FREE: Entry is empty
 * ALLOCATED: Entry has been allocated for a packet, but the DMA has not
 *            yet completed
 * SUBMITTED: Packet has been submitted to the GPUCommandProcessor, but
 *            has not yet completed
 */
class AQLRingBuffer
{
  private:
    std::vector<hsa_kernel_dispatch_packet_t> _aqlBuf;
    std::string _name;
    std::vector<Addr> _hostDispAddresses;
    std::vector<bool> _aqlComplete;
    uint64_t _wrIdx;    // Points to next write location
    uint64_t _rdIdx;    // Read pointer of AQL buffer
    uint64_t _dispIdx;  // Dispatch pointer of AQL buffer

  public:
    std::string name() { return _name; }
    AQLRingBuffer(uint32_t size, const std::string name);
    int allocEntry(uint32_t nBufReq);
    bool freeEntry(void *pkt);

    /**
     * the kernel may try to read from the dispatch packet,
     * so we need to keep the host address that corresponds
     * to each of the dispatch packets this AQL buffer is
     * storing. when we call submitPkt(), we send along the
     * corresponding host address for the packet so the
     * wavefront can properly initialize its SGPRs - which
     * may include a pointer to the dispatch packet
     */
    void
    saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
    {
        for (int i = 0; i < num_pkts; ++i) {
            _hostDispAddresses[ix % numObjs()] =
                host_pkt_addr + i * objSize();
            ++ix;
        }
    }

    // Host address of the packet currently at the dispatch pointer.
    Addr
    hostDispAddr() const
    {
        return _hostDispAddresses[dispIdx() % numObjs()];
    }

    // True if a valid (non-INVALID type) packet is waiting between the
    // dispatch and write pointers.
    bool
    dispPending() const
    {
        int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header
                           >> HSA_PACKET_HEADER_TYPE) &
                          ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
        return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
    }

    /**
     * Packets aren't guaranteed to be completed in-order, and we need
     * to know when the last packet is finished in order to un-set
     * the barrier bit. In order to confirm if the packet at _rdIdx
     * is the last packet, we check if the packets ahead of _rdIdx
     * are finished. If they are, _rdIdx is the last packet. If not,
     * there are other outstanding packets.
     */
    bool
    isLastOutstandingPkt() const
    {
        // Indices are absolute uint64_t values; iterate with the same
        // type to avoid a signed/unsigned comparison and overflow of a
        // narrower loop counter.
        for (uint64_t i = _rdIdx + 1; i < _dispIdx; i++) {
            if (!_aqlComplete[i % _aqlBuf.size()]) {
                return false;
            }
        }
        return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
    }

    uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
    void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
    uint32_t numObjs() const { return _aqlBuf.size(); }
    uint32_t objSize() const { return AQL_PACKET_SIZE; }
    uint64_t dispIdx() const { return _dispIdx; }
    uint64_t wrIdx() const { return _wrIdx; }
    uint64_t rdIdx() const { return _rdIdx; }
    uint64_t* rdIdxPtr() { return &_rdIdx; }
    void incRdIdx(uint64_t value) { _rdIdx += value; }
    void incWrIdx(uint64_t value) { _wrIdx += value; }
    void incDispIdx(uint64_t value) { _dispIdx += value; }
    // Number of packets dispatched but not yet retired.
    uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
    void setRdIdx(uint64_t value);
    void setWrIdx(uint64_t value);
    void setDispIdx(uint64_t value);
};
// Per-queue context bundling the queue descriptor, its prefetch AQL
// ring buffer, and the barrier-bit synchronization state.
class HSAQueueDescriptor;
class AQLRingBuffer;

struct QCntxt
{
    HSAQueueDescriptor* qDesc;
    AQLRingBuffer* aqlBuf;
    // used for HSA packets that enforce synchronization with barrier bit
    bool barrierBit;

    QCntxt(HSAQueueDescriptor* q_desc, AQLRingBuffer* aql_buf)
        : qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
    {}
    // Default-constructed contexts hold no queue (nullptr, not NULL).
    QCntxt() : qDesc(nullptr), aqlBuf(nullptr), barrierBit(false) {}
};
class HSAPacketProcessor: public DmaVirtDevice
{
friend class HWScheduler;
protected:
typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
GPUCommandProcessor *gpu_device;
HWScheduler *hwSchdlr;
AMDGPUDevice *gpuDevice;
VegaISA::Walker *walker;
// Structure to store the read values of dependency signals
// from shared memory. Also used for tracking the status of
// those reads while they are in progress
class SignalState
{
public:
SignalState()
: pendingReads(0), allRead(false), discardRead(false)
{
values.resize(NumSignalsPerBarrier);
}
void handleReadDMA();
int pendingReads;
bool allRead;
// If this queue is unmapped when there are pending reads, then
// the pending reads has to be discarded.
bool discardRead;
// values stores the value of already read dependency signal
std::vector<hsa_signal_value_t> values;
void
resetSigVals()
{
std::fill(values.begin(), values.end(), 1);
}
};
class QueueProcessEvent : public Event
{
private:
HSAPacketProcessor *hsaPP;
uint32_t rqIdx;
public:
QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
: Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx)
{}
virtual void process();
virtual const char *description() const;
};
// Registered queue list entry; each entry has one queueDescriptor and
// associated AQL buffer
class RQLEntry
{
public:
RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
: aqlProcessEvent(hsaPP, rqIdx) {}
QCntxt qCntxt;
bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
SignalState depSignalRdState;
QueueProcessEvent aqlProcessEvent;
void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
bool getBarrierBit() const { return qCntxt.barrierBit; }
bool isLastOutstandingPkt() const
{
return qCntxt.aqlBuf->isLastOutstandingPkt();
}
};
// Keeps track of queueDescriptors of registered queues
std::vector<class RQLEntry *> regdQList;
Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
void displayQueueDescriptor(int pid, uint32_t rl_idx);
public:
HSAQueueDescriptor*
getQueueDesc(uint32_t queId)
{
return regdQList.at(queId)->qCntxt.qDesc;
}
class RQLEntry*
getRegdListEntry(uint32_t queId)
{
return regdQList.at(queId);
}
uint64_t
inFlightPkts(uint32_t queId)
{
auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf;
return aqlBuf->dispIdx() - aqlBuf->rdIdx();
}
int numHWQueues;
Addr pioAddr;
Addr pioSize;
Tick pioDelay;
const Tick pktProcessDelay;
typedef HSAPacketProcessorParams Params;
HSAPacketProcessor(const Params &p);
~HSAPacketProcessor();
TranslationGenPtr translate(Addr vaddr, Addr size) override;
void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
uint64_t basePointer,
uint64_t queue_id,
uint32_t size, int doorbellSize,
GfxVersion gfxVersion,
Addr offset = 0, uint64_t rd_idx = 0);
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
void setDevice(GPUCommandProcessor * dev);
void setGPUDevice(AMDGPUDevice *gpu_device);
void updateReadIndex(int, uint32_t);
void getCommandsFromHost(int pid, uint32_t rl_idx);
HWScheduler *hwScheduler() { return hwSchdlr; }
// PIO interface
virtual Tick read(Packet*) override;
virtual Tick write(Packet*) override;
virtual AddrRangeList getAddrRanges() const override;
void finishPkt(void *pkt, uint32_t rl_idx);
void finishPkt(void *pkt) { finishPkt(pkt, 0); }
void schedAQLProcessing(uint32_t rl_idx);
void schedAQLProcessing(uint32_t rl_idx, Tick delay);
void sendAgentDispatchCompletionSignal(void *pkt,
hsa_signal_value_t signal);
void sendCompletionSignal(hsa_signal_value_t signal);
/**
* Calls getCurrentEntry once the queueEntry has been dmaRead.
*/
struct dma_series_ctx
{
// deal with the fact dma ops can complete out of issue order
uint32_t pkts_ttl;
uint32_t pkts_2_go;
uint32_t start_ix;
uint32_t rl_idx;
dma_series_ctx(uint32_t _pkts_ttl,
uint32_t _pkts_2_go,
uint32_t _start_ix,
uint32_t _rl_idx)
: pkts_ttl(_pkts_2_go), pkts_2_go(_pkts_2_go),
start_ix(_start_ix), rl_idx(_rl_idx)
{};
~dma_series_ctx() {};
};
void updateReadDispIdDma();
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead,
uint32_t ix_start, unsigned num_pkts,
dma_series_ctx *series_ctx, void *dest_4debug);
void handleReadDMA();
};
} // namespace gem5
#endif // __DEV_HSA_HSA_PACKET_PROCESSOR__