Files
gem5/src/gpu-compute/fetch_unit.cc
Matthew Poremba 539a2e2bcd arch-vega: Add VEGA page tables and TLB
Add the page table walker, page table format, TLB, TLB coalescer, and
associated support in the AMDGPUDevice. This page table format uses the
hardware format for dGPU and is very different from APU/GCN3, which uses
the X86 page table format.

In order to support either format for the GPU model, a common
TranslationState called GpuTranslationState is created, which holds the
combined fields of both the APU and Vega translation states. Similarly
the TlbEntry is cast at runtime by the corresponding arch files as they
are the only files which touch the internals of the TlbEntry. The GPU
model only checks if a TlbEntry is non-null and thus does not need to
cast to peek inside the data structure.

Change-Id: I4484c66239b48df5224d61caa6e968e56eea38a5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/51848
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2022-03-17 00:11:14 +00:00

647 lines
22 KiB
C++

/*
* Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/fetch_unit.hh"
#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "arch/amdgpu/common/tlb.hh"
#include "base/bitfield.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/ruby/system/RubySystem.hh"
namespace gem5
{
uint32_t FetchUnit::globalFetchUnitID;
/**
 * Construct a fetch unit for compute unit @p cu. The per-wave fetch
 * buffers are sized later, in init(), once the wavefront list is bound.
 */
FetchUnit::FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
    : timingSim(true),
      computeUnit(cu),
      fetchScheduler(p),
      waveList(nullptr),
      fetchDepth(p.fetch_depth)
{
}
FetchUnit::~FetchUnit()
{
    // Drop any queued wave state; the containers own no raw resources.
    fetchStatusQueue.clear();
    fetchQueue.clear();
}
/**
 * Set up per-wavefront fetch state. Requires bindWaveList() to have
 * been called first, since each fetch buffer is tied to its wavefront.
 */
void
FetchUnit::init()
{
    // mirror the shader's simulation mode so fetch requests are sent
    // as timing or functional accesses accordingly
    timingSim = computeUnit.shader->timingSim;
    fetchQueue.clear();
    // one status entry and one fetch buffer per wavefront slot
    fetchStatusQueue.resize(computeUnit.shader->n_wf);
    fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc());

    for (int i = 0; i < computeUnit.shader->n_wf; ++i) {
        Wavefront *wf = waveList->at(i);
        assert(wf->wfSlotId == i);
        // the bool tracks whether the wave is already in fetchQueue
        fetchStatusQueue[i] = std::make_pair(wf, false);
        fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf);
        fetchBuf[i].decoder(&decoder);
    }

    fetchScheduler.bindList(&fetchQueue);
}
/**
 * Per-cycle fetch stage: decode buffered bytes, recycle full fetch
 * buffers, refresh the set of fetch-eligible waves, and initiate at
 * most one new fetch chosen by the fetch scheduler.
 */
void
FetchUnit::exec()
{
    /**
     * for each wave's fetch buffer: if the buffer is completely full,
     * check whether the wave has moved past its oldest buffered line
     * so that line can be released back to the free list; then decode
     * any complete instruction data that is buffered and push it to
     * the wavefront's instruction buffer.
     */
    for (auto &fetch_buf : fetchBuf) {
        if (!fetch_buf.hasFreeSpace()) {
            fetch_buf.checkWaveReleaseBuf();
        }
        if (fetch_buf.hasFetchDataToProcess()) {
            fetch_buf.decodeInsts();
        }
    }

    // re-evaluate waves which are marked as not ready for fetch
    for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
        // Following code assumes 64-bit operation and all insts are
        // represented by 64-bit pointers to inst objects.
        Wavefront *curWave = fetchStatusQueue[j].first;
        assert (curWave);

        // A wave becomes fetch-eligible when it is running (or in
        // waitcnt), its fetch buffer has a free line, fetch is not
        // stopped (e.g., by a pending branch, to avoid speculative
        // fetch), and no fetch for it is already in flight.
        if (!fetchStatusQueue[j].second) {
            if ((curWave->getStatus() == Wavefront::S_RUNNING ||
                curWave->getStatus() == Wavefront::S_WAITCNT) &&
                fetchBuf[j].hasFreeSpace() &&
                !curWave->stopFetch() &&
                !curWave->pendingFetch) {
                fetchQueue.push_back(curWave);
                fetchStatusQueue[j].second = true;
            }
        }
    }

    // Fetch only if there is some wave ready to be fetched
    // An empty fetchQueue will cause the scheduler to panic
    if (fetchQueue.size()) {
        Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
        waveToBeFetched->pendingFetch = true;
        fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
        initiateFetch(waveToBeFetched);
    }
}
/**
 * Start a fetch for @p wavefront: reserve a fetch-buffer line, build a
 * cache-line-sized INST_FETCH request, and send it for translation
 * through the SQC TLB port (timing) or translate and fetch immediately
 * (functional).
 */
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
    assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());

    /**
     * calculate the virtual address to fetch from the SQC. the fetch
     * buffer holds a configurable number of cache lines. we start
     * fetching at the address of the cache line immediately following
     * the buffered line(s).
     */
    Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();

    // this should already be aligned to a cache line
    assert(vaddr == ruby::makeLineAddress(vaddr,
        computeUnit.getCacheLineBits()));

    // shouldn't be fetching a line that is already buffered
    assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));

    fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
            "from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
            computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);

    // set up virtual request for one full cache line of inst data
    RequestPtr req = std::make_shared<Request>(
        vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH,
        computeUnit.requestorId(), 0, 0, nullptr);

    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);

    if (timingSim) {
        // SenderState needed on Return
        pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);

        // Sender State needed by TLB hierarchy; it chains the ITLB
        // sender state above so it can be restored on the response path
        pkt->senderState =
            new GpuTranslationState(BaseMMU::Execute,
                                    computeUnit.shader->gpuTc,
                                    false, pkt->senderState);

        if (computeUnit.sqcTLBPort.isStalled()) {
            // port is already stalled; queue behind the earlier retries
            assert(computeUnit.sqcTLBPort.retries.size() > 0);

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit.sqcTLBPort.retries.push_back(pkt);
        } else if (!computeUnit.sqcTLBPort.sendTimingReq(pkt)) {
            // Stall the data port;
            // No more packet is issued till
            // ruby indicates resources are freed by
            // a recvReqRetry() call back on this port.
            computeUnit.sqcTLBPort.stallPort();

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit.sqcTLBPort.retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
        }
    } else {
        // functional mode: translate synchronously via the TLB port
        pkt->senderState =
            new GpuTranslationState(BaseMMU::Execute,
                                    computeUnit.shader->gpuTc);

        computeUnit.sqcTLBPort.sendFunctional(pkt);

        // the translation is complete; reclaim the translation state
        // (and the TLB entry it carries) before using the packet
        GpuTranslationState *sender_state =
            safe_cast<GpuTranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete sender_state;

        // fetch the instructions from the SQC when we operate in
        // functional mode only
        fetch(pkt, wavefront);
    }
}
/**
 * Issue the actual instruction read to the SQC once translation has
 * completed. @p pkt carries the translated request; a fresh packet is
 * built from it before being sent (timing) or satisfied in place
 * (functional).
 */
void
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
{
    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
            computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
            pkt->req->getPaddr());

    /**
     * this is necessary because the GPU TLB receives packets instead of
     * requests. when the translation is complete, all relevant fields in
     * the request will be populated, but not in the packet. here we create
     * the new packet so we can set the size, addr, and proper flags.
     */
    PacketPtr oldPkt = pkt;
    pkt = new Packet(oldPkt->req, oldPkt->cmd);
    delete oldPkt;

    /**
     * if we have not reserved an entry in the fetch buffer,
     * stop fetching. this can happen due to a branch instruction
     * flushing the fetch buffer while an ITLB or I-cache request is still
     * pending, in the same cycle another instruction is trying to fetch.
     */
    if (!fetchBuf.at(wavefront->wfSlotId).isReserved(pkt->req->getVaddr())) {
        // clear the drop/pending flags so the wave can fetch again
        wavefront->dropFetch = false;
        wavefront->pendingFetch = false;
        return;
    }

    /**
     * we should have reserved an entry in the fetch buffer
     * for this cache line. here we get the pointer to the
     * entry used to buffer this request's line data.
     */
    pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
        .reservedBuf(pkt->req->getVaddr()));

    // New SenderState for the memory access
    pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);

    if (timingSim) {
        // translation is done. Send the appropriate timing memory request.
        if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
            // SQC is busy; remember the packet for the retry callback
            computeUnit.sqcPort.retries.push_back(std::make_pair(pkt,
                wavefront));

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
                    computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        } else {
            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
                    computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        }
    } else {
        // functional mode: complete the access and process it now
        computeUnit.sqcPort.sendFunctional(pkt);
        processFetchReturn(pkt);
    }
}
/**
 * Handle a returning SQC fetch response: mark the buffered line valid
 * (or discard it if the wave dropped the fetch) and release the packet.
 */
void
FetchUnit::processFetchReturn(PacketPtr pkt)
{
    auto *sender_state =
        safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);

    Wavefront *wf = sender_state->wavefront;

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
            "%d bytes!\n", computeUnit.cu_id, wf->simdId,
            wf->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());

    if (!wf->dropFetch) {
        // commit the line into the wave's fetch buffer
        fetchBuf.at(wf->wfSlotId).fetchDone(pkt->req->getVaddr());
    } else {
        // fetch was dropped (e.g., by a branch flush); nothing to keep
        assert(wf->instructionBuffer.empty());
        assert(!fetchBuf.at(wf->wfSlotId).hasFetchDataToProcess());
        wf->dropFetch = false;
    }

    wf->pendingFetch = false;

    delete pkt->senderState;
    delete pkt;
}
/**
 * Flush the fetch buffer belonging to wave slot @p wfSlotId.
 */
void
FetchUnit::flushBuf(int wfSlotId)
{
    // delegate to the per-wave buffer descriptor
    fetchBuf.at(wfSlotId).flushBuf();
}
/**
 * Bind the compute unit's wavefront list; init() reads it to pair each
 * fetch buffer with its wavefront.
 */
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    waveList = wave_list;
}
/** FetchBufDesc */
/**
 * Allocate backing storage for this wave's fetch buffer: fetch_depth
 * cache-line-sized slots, all initially on the free list.
 */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    wavefront = wf;
    fetchDepth = fetch_depth;
    cacheLineSize = cache_line_size;
    maxIbSize = wavefront->maxIbSize;
    maxFbSize = cacheLineSize * fetchDepth;

    // Calculate the number of bits to address a cache line
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    bufEnd = bufStart + maxFbSize;
    readPtr = bufStart;

    // every line-sized slot starts out free
    for (int line = 0; line < fetchDepth; ++line) {
        freeList.push_back(bufStart + line * cacheLineSize);
    }
}
/**
 * Discard all buffered and reserved lines (e.g., after a branch) and
 * rebuild the free list so every slot is available again.
 */
void
FetchUnit::FetchBufDesc::flushBuf()
{
    restartFromBranch = true;

    // rebuild the free list from scratch so no stale or duplicate
    // entries survive the flush
    freeList.clear();
    bufferedPCs.clear();
    reservedPCs.clear();
    readPtr = bufStart;

    for (int line = 0; line < fetchDepth; ++line) {
        freeList.emplace_back(bufStart + line * cacheLineSize);
    }

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
            "buffer\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId);
}
/**
 * Compute the virtual address of the next cache line to fetch: the
 * line after the newest buffered/reserved line, or, when the buffer is
 * empty, the line containing the wave's current PC (also resetting
 * readPtr, with a byte offset applied after a branch restart).
 */
Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    Addr next_line = 0;

    if (bufferedAndReservedLines()) {
        Addr last_line_fetched = 0;
        if (!reservedLines()) {
            /**
             * get the PC of the most recently fetched cache line,
             * then return the address of the next line.
             */
            last_line_fetched = bufferedPCs.rbegin()->first;
        } else {
            // a reserved (in-flight) line is always newer than any
            // buffered line, so use the newest reservation instead
            last_line_fetched = reservedPCs.rbegin()->first;
        }

        next_line = last_line_fetched + cacheLineSize;

        /**
         * should not be trying to fetch a line that has already
         * been fetched.
         */
        assert(bufferedPCs.find(next_line) == bufferedPCs.end());
        assert(reservedPCs.find(next_line) == reservedPCs.end());
    } else {
        /**
         * we do not have any buffered cache lines yet, so we
         * assume this is the initial fetch, or the first fetch
         * after a branch, and get the PC directly from the WF.
         * in the case of a branch, we may not start at the
         * beginning of a cache line, so we adjust the readPtr by
         * the current PC's offset from the start of the line.
         */
        next_line = ruby::makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * if we are here we have no buffered lines. in the case we flushed
         * the buffer due to a branch, we may need to start fetching from
         * some offset from the start of the fetch buffer, so we adjust for
         * that here.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset
                = wavefront->pc() - ruby::makeLineAddress(wavefront->pc(),
                    cacheLineBits);
            readPtr += byte_offset;
        }
    }

    return next_line;
}
/**
 * Reserve a free fetch-buffer slot for the cache line at @p vaddr.
 * The slot becomes valid only later, when fetchDone() is called.
 */
void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    // we should have free buffer space, and the line
    // at vaddr should not already be cached.
    assert(hasFreeSpace());
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());
    assert(bufferedAndReservedLines() < fetchDepth);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * we reserve buffer space, by moving it out of the
     * free list, however we do not mark the buffered
     * line as valid until the fetch unit for this buffer
     * has received the response from the memory system.
     */
    uint8_t *inst_buf = freeList.front();
    reservedPCs.emplace(vaddr, inst_buf);
    freeList.pop_front();
}
/**
 * Promote the reserved slot for @p vaddr to a valid buffered line now
 * that the memory system has returned the data.
 */
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * this address should have an entry reserved in the
     * fetch buffer already, however it should be invalid
     * until the fetch completes.
     */
    auto reserved_pc = reservedPCs.find(vaddr);
    assert(reserved_pc != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_pc->second);

    // wrap the read pointer back to the start of the circular buffer
    // if it has run off the end
    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_pc->second = nullptr;
    reservedPCs.erase(reserved_pc);
}
bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
}
/**
 * Called when the fetch buffer is full: if the wave has advanced past
 * the oldest buffered line, release that line's slot back to the free
 * list so fetching can continue.
 */
void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    // block address of the line containing the wave's current PC
    Addr cur_wave_pc = roundDown(wavefront->pc(),
        wavefront->computeUnit->cacheLineSize());

    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
                "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
                wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
            "(PC = %#x) can be released.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
            wavefront->pc());

#ifdef DEBUG
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
                "removing it from the fetch buffer.\n", wavefront->simdId,
                wavefront->wfSlotId, wavefront->wfDynId,
                oldest_buffered_pc->first);

        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);

        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
                wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
                bufferedLines());
    }
}
/**
 * Decode buffered fetch bytes into GPUDynInsts and push them into the
 * wavefront's instruction buffer, until the IB is full or fewer than a
 * complete raw instruction's bytes remain.
 */
void
FetchUnit::FetchBufDesc::decodeInsts()
{
    assert(readPtr);

    // a read that would run past bufEnd must be assembled from the
    // wrapped pieces first
    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            TheGpuISA::MachInst mach_inst
                = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            // advance by however many bytes this (variable-size)
            // instruction consumed
            readPtr += gpu_static_inst->instSize();

            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst
                = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                               wavefront, gpu_static_inst,
                                               wavefront->computeUnit->
                                                   getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                    "%d bytes remain.\n", wavefront->simdId,
                    wavefront->wfSlotId, wavefront->wfDynId,
                    gpu_static_inst->disassemble(),
                    gpu_static_inst->instSize(),
                    fetchBytesRemaining());
        }
    }
}
/**
 * Decode an instruction whose raw bytes straddle the end of the fetch
 * buffer: the leading dword(s) sit at the end of the buffer and the
 * remainder wraps around to bufStart.
 */
void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    // assemble the raw inst one dword at a time, wrapping readPtr back
    // to bufStart once a read reaches the end of the buffer. NOTE:
    // readPtr only moves via the wrap here; splitDecode() guarantees
    // the wrap happens on the first iteration.
    for (int i = 0; i < num_dwords; ++i) {
        replaceBits(split_inst, 32*(i+1)-1, 32*i,
            *reinterpret_cast<uint32_t*>(readPtr));
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    // after assembling the pieces we must be reading from bufStart
    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst
        = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    // dword_size bytes were consumed before the wrap, so only the
    // remainder advances readPtr past bufStart
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst
        = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                       wavefront, gpu_static_inst,
                                       wavefront->computeUnit->
                                           getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
            "(%d bytes). %d bytes remain in %d buffered lines.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            gpu_static_inst->disassemble(), split_inst,
            gpu_static_inst->instSize(), fetchBytesRemaining(),
            bufferedLines());
}
bool
FetchUnit::FetchBufDesc::splitDecode() const
{
/**
* if a read of a raw instruction would go beyond the end
* of the fetch buffer, then we must perform a split decode.
*/
bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
return is_split;
}
/**
 * @return the number of buffered instruction bytes, starting at
 * readPtr, that have not yet been decoded.
 */
int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        // end of the newest buffered line's slot in the circular buffer
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            // contiguous case: readPtr precedes the newest line's end
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            // wrapped case: byte_diff is negative and discounts the
            // bytes already consumed past the newest line's end.
            // NOTE(review): assumes slot layout relative to readPtr is
            // consistent with map (vaddr) order — confirm with the
            // free-list allocation policy.
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());

    return bytes_remaining;
}
} // namespace gem5