Move GpuTLB and TLBCoalescer to GCN3 as the TLB format is specific to GCN3 and SE mode / APU simulation. Vega will have its own TLB, coalescer, and walker suitable for a dGPU. This also adds a using alias for the TLB translation state to reduce the number of references to TheISA and X86ISA. X86 specific includes are also removed. Change-Id: I34448bb4e5ddb9980b34a55bc717bbcea0e03db5 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/49847 Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
647 lines
22 KiB
C++
647 lines
22 KiB
C++
/*
|
|
* Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpu-compute/fetch_unit.hh"
|
|
|
|
#include "base/bitfield.hh"
|
|
#include "debug/GPUFetch.hh"
|
|
#include "debug/GPUPort.hh"
|
|
#include "debug/GPUTLB.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/gpu_static_inst.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "gpu-compute/wavefront.hh"
|
|
#include "mem/ruby/system/RubySystem.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
// Out-of-class definition of the static data member
// FetchUnit::globalFetchUnitID. No initializer is given, so as an object
// with static storage duration it is zero-initialized.
uint32_t FetchUnit::globalFetchUnitID;
|
|
|
|
/**
 * Construct a fetch unit that keeps a reference to its compute unit.
 *
 * timingSim starts out as true and is overwritten from the shader in
 * init(). The wave list pointer is left null until bindWaveList() is
 * called, and the fetch depth is taken from the parameters.
 *
 * @param p compute unit parameters (also forwarded to the scheduler).
 * @param cu compute unit this fetch unit refers to.
 */
FetchUnit::FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
    : timingSim(true),
      computeUnit(cu),
      fetchScheduler(p),
      waveList(nullptr),
      fetchDepth(p.fetch_depth)
{
}
|
|
|
|
/**
 * Empty the per-wave fetch status table and the queue of waves that
 * are waiting to fetch.
 */
FetchUnit::~FetchUnit()
{
    fetchStatusQueue.clear();
    fetchQueue.clear();
}
|
|
|
|
void
|
|
FetchUnit::init()
|
|
{
|
|
timingSim = computeUnit.shader->timingSim;
|
|
fetchQueue.clear();
|
|
fetchStatusQueue.resize(computeUnit.shader->n_wf);
|
|
fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc());
|
|
|
|
for (int i = 0; i < computeUnit.shader->n_wf; ++i) {
|
|
Wavefront *wf = waveList->at(i);
|
|
assert(wf->wfSlotId == i);
|
|
fetchStatusQueue[i] = std::make_pair(wf, false);
|
|
fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf);
|
|
fetchBuf[i].decoder(&decoder);
|
|
}
|
|
|
|
fetchScheduler.bindList(&fetchQueue);
|
|
}
|
|
|
|
void
|
|
FetchUnit::exec()
|
|
{
|
|
/**
|
|
* now we check if any of the fetch buffers have
|
|
* buffered instruction data that can be decoded
|
|
* and sent to its wavefront's instruction buffer.
|
|
* then we check if any of the fetch buffer entries
|
|
* can be released. we only check if we can
|
|
* release a buffer
|
|
*/
|
|
for (auto &fetch_buf : fetchBuf) {
|
|
if (!fetch_buf.hasFreeSpace()) {
|
|
fetch_buf.checkWaveReleaseBuf();
|
|
}
|
|
if (fetch_buf.hasFetchDataToProcess()) {
|
|
fetch_buf.decodeInsts();
|
|
}
|
|
}
|
|
|
|
// re-evaluate waves which are marked as not ready for fetch
|
|
for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
|
|
// Following code assumes 64-bit opertaion and all insts are
|
|
// represented by 64-bit pointers to inst objects.
|
|
Wavefront *curWave = fetchStatusQueue[j].first;
|
|
assert (curWave);
|
|
|
|
// The wavefront has to be active, the IB occupancy has to be
|
|
// 4 or less instructions and it can not have any branches to
|
|
// prevent speculative instruction fetches
|
|
if (!fetchStatusQueue[j].second) {
|
|
if ((curWave->getStatus() == Wavefront::S_RUNNING ||
|
|
curWave->getStatus() == Wavefront::S_WAITCNT) &&
|
|
fetchBuf[j].hasFreeSpace() &&
|
|
!curWave->stopFetch() &&
|
|
!curWave->pendingFetch) {
|
|
fetchQueue.push_back(curWave);
|
|
fetchStatusQueue[j].second = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fetch only if there is some wave ready to be fetched
|
|
// An empty fetchQueue will cause the schedular to panic
|
|
if (fetchQueue.size()) {
|
|
Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
|
|
waveToBeFetched->pendingFetch = true;
|
|
fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
|
|
initiateFetch(waveToBeFetched);
|
|
}
|
|
}
|
|
|
|
void
|
|
FetchUnit::initiateFetch(Wavefront *wavefront)
|
|
{
|
|
assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());
|
|
|
|
/**
|
|
* calculate the virtual address to fetch from the SQC. the fetch
|
|
* buffer holds a configurable number of cache lines. we start
|
|
* fetching at the address of the cache line immediately following
|
|
* the buffered line(s).
|
|
*/
|
|
Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();
|
|
|
|
// this should already be aligned to a cache line
|
|
assert(vaddr == ruby::makeLineAddress(vaddr,
|
|
computeUnit.getCacheLineBits()));
|
|
|
|
// shouldn't be fetching a line that is already buffered
|
|
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
|
|
|
|
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
|
|
|
|
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
|
|
"from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId,
|
|
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
|
|
|
|
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
|
|
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
|
|
|
|
// set up virtual request
|
|
RequestPtr req = std::make_shared<Request>(
|
|
vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH,
|
|
computeUnit.requestorId(), 0, 0, nullptr);
|
|
|
|
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
|
|
|
|
if (timingSim) {
|
|
// SenderState needed on Return
|
|
pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
|
|
|
|
// Sender State needed by TLB hierarchy
|
|
pkt->senderState =
|
|
new GpuTranslationState(BaseMMU::Execute,
|
|
computeUnit.shader->gpuTc,
|
|
false, pkt->senderState);
|
|
|
|
if (computeUnit.sqcTLBPort.isStalled()) {
|
|
assert(computeUnit.sqcTLBPort.retries.size() > 0);
|
|
|
|
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
|
|
vaddr);
|
|
|
|
computeUnit.sqcTLBPort.retries.push_back(pkt);
|
|
} else if (!computeUnit.sqcTLBPort.sendTimingReq(pkt)) {
|
|
// Stall the data port;
|
|
// No more packet is issued till
|
|
// ruby indicates resources are freed by
|
|
// a recvReqRetry() call back on this port.
|
|
computeUnit.sqcTLBPort.stallPort();
|
|
|
|
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
|
|
vaddr);
|
|
|
|
computeUnit.sqcTLBPort.retries.push_back(pkt);
|
|
} else {
|
|
DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
|
|
}
|
|
} else {
|
|
pkt->senderState =
|
|
new GpuTranslationState(BaseMMU::Execute,
|
|
computeUnit.shader->gpuTc);
|
|
|
|
computeUnit.sqcTLBPort.sendFunctional(pkt);
|
|
|
|
GpuTranslationState *sender_state =
|
|
safe_cast<GpuTranslationState*>(pkt->senderState);
|
|
|
|
delete sender_state->tlbEntry;
|
|
delete sender_state;
|
|
// fetch the instructions from the SQC when we operate in
|
|
// functional mode only
|
|
fetch(pkt, wavefront);
|
|
}
|
|
}
|
|
|
|
void
|
|
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
|
|
{
|
|
assert(pkt->req->hasPaddr());
|
|
assert(pkt->req->hasSize());
|
|
|
|
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
|
|
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
|
|
pkt->req->getPaddr());
|
|
|
|
/**
|
|
* this is necessary because the GPU TLB receives packets instead of
|
|
* requests. when the translation is complete, all relevent fields in
|
|
* the request will be populated, but not in the packet. here we create
|
|
* the new packet so we can set the size, addr, and proper flags.
|
|
*/
|
|
PacketPtr oldPkt = pkt;
|
|
pkt = new Packet(oldPkt->req, oldPkt->cmd);
|
|
delete oldPkt;
|
|
|
|
/**
|
|
* if we have not reserved an entry in the fetch buffer,
|
|
* stop fetching. this can happen due to a branch instruction
|
|
* flushing the fetch buffer while an ITLB or I-cache request is still
|
|
* pending, in the same cycle another instruction is trying to fetch.
|
|
*/
|
|
if (!fetchBuf.at(wavefront->wfSlotId).isReserved(pkt->req->getVaddr())) {
|
|
wavefront->dropFetch = false;
|
|
wavefront->pendingFetch = false;
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* we should have reserved an entry in the fetch buffer
|
|
* for this cache line. here we get the pointer to the
|
|
* entry used to buffer this request's line data.
|
|
*/
|
|
pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
|
|
.reservedBuf(pkt->req->getVaddr()));
|
|
|
|
// New SenderState for the memory access
|
|
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
|
|
|
|
if (timingSim) {
|
|
// translation is done. Send the appropriate timing memory request.
|
|
|
|
if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
|
|
computeUnit.sqcPort.retries.push_back(std::make_pair(pkt,
|
|
wavefront));
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
|
|
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
|
|
pkt->req->getPaddr());
|
|
} else {
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
|
|
computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
|
|
pkt->req->getPaddr());
|
|
}
|
|
} else {
|
|
computeUnit.sqcPort.sendFunctional(pkt);
|
|
processFetchReturn(pkt);
|
|
}
|
|
}
|
|
|
|
void
|
|
FetchUnit::processFetchReturn(PacketPtr pkt)
|
|
{
|
|
ComputeUnit::SQCPort::SenderState *sender_state =
|
|
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
|
|
|
|
Wavefront *wavefront = sender_state->wavefront;
|
|
|
|
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
|
|
"%d bytes!\n", computeUnit.cu_id, wavefront->simdId,
|
|
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
|
|
|
|
if (wavefront->dropFetch) {
|
|
assert(wavefront->instructionBuffer.empty());
|
|
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
|
|
wavefront->dropFetch = false;
|
|
} else {
|
|
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
|
|
}
|
|
|
|
wavefront->pendingFetch = false;
|
|
|
|
delete pkt->senderState;
|
|
delete pkt;
|
|
}
|
|
|
|
void
|
|
FetchUnit::flushBuf(int wfSlotId)
|
|
{
|
|
fetchBuf.at(wfSlotId).flushBuf();
|
|
}
|
|
|
|
void
|
|
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
|
|
{
|
|
waveList = wave_list;
|
|
}
|
|
|
|
/** FetchBufDesc */
|
|
void
|
|
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
|
|
Wavefront *wf)
|
|
{
|
|
wavefront = wf;
|
|
fetchDepth = fetch_depth;
|
|
maxIbSize = wavefront->maxIbSize;
|
|
cacheLineSize = cache_line_size;
|
|
maxFbSize = cacheLineSize * fetchDepth;
|
|
|
|
// Calculate the number of bits to address a cache line
|
|
panic_if(!isPowerOf2(cacheLineSize),
|
|
"Cache line size should be a power of two.");
|
|
cacheLineBits = floorLog2(cacheLineSize);
|
|
|
|
bufStart = new uint8_t[maxFbSize];
|
|
readPtr = bufStart;
|
|
bufEnd = bufStart + maxFbSize;
|
|
|
|
for (int i = 0; i < fetchDepth; ++i) {
|
|
freeList.emplace_back(readPtr + i * cacheLineSize);
|
|
}
|
|
}
|
|
|
|
void
|
|
FetchUnit::FetchBufDesc::flushBuf()
|
|
{
|
|
restartFromBranch = true;
|
|
/**
|
|
* free list may have some entries
|
|
* so we clear it here to avoid duplicates
|
|
*/
|
|
freeList.clear();
|
|
bufferedPCs.clear();
|
|
reservedPCs.clear();
|
|
readPtr = bufStart;
|
|
|
|
for (int i = 0; i < fetchDepth; ++i) {
|
|
freeList.push_back(bufStart + i * cacheLineSize);
|
|
}
|
|
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
|
|
"buffer\n", wavefront->simdId, wavefront->wfSlotId,
|
|
wavefront->wfDynId);
|
|
}
|
|
|
|
/**
 * Compute the address of the next cache line to fetch.
 *
 * With no buffered or reserved lines, the answer is the line holding
 * the wavefront's current PC. In that case the read pointer is also
 * reset to the start of the buffer, plus the PC's offset within its
 * line when the buffer was flushed since the last such fetch. With
 * lines present, the answer is the line following the highest-addressed
 * reserved line, or buffered line if nothing is reserved.
 *
 * @return line-aligned virtual address of the next line to fetch.
 */
Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    if (!bufferedAndReservedLines()) {
        /**
         * nothing is buffered or in flight, so this is either the very
         * first fetch or the first fetch after a branch. start from
         * the line that contains the wavefront's PC.
         */
        const Addr pc_line =
            ruby::makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * after a flush caused by a branch the PC may not sit at the
         * start of its line, so decoding has to begin at the PC's
         * offset into the line rather than at the buffer start.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            const int pc_offset = wavefront->pc() - pc_line;
            readPtr += pc_offset;
        }

        return pc_line;
    }

    /**
     * both maps are ordered by address, so the last element is the
     * highest line. reserved lines take precedence over buffered ones.
     */
    const Addr newest_line = reservedLines()
        ? reservedPCs.rbegin()->first
        : bufferedPCs.rbegin()->first;

    const Addr next_line = newest_line + cacheLineSize;

    // a line that is already buffered or reserved must never be
    // fetched a second time
    assert(bufferedPCs.find(next_line) == bufferedPCs.end());
    assert(reservedPCs.find(next_line) == reservedPCs.end());

    return next_line;
}
|
|
|
|
void
|
|
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
|
|
{
|
|
// we should have free buffer space, and the line
|
|
// at vaddr should not already be cached.
|
|
assert(hasFreeSpace());
|
|
assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
|
|
assert(reservedPCs.find(vaddr) == reservedPCs.end());
|
|
assert(bufferedAndReservedLines() < fetchDepth);
|
|
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
|
|
"for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
|
|
wavefront->wfDynId, vaddr);
|
|
|
|
/**
|
|
* we reserve buffer space, by moving it out of the
|
|
* free list, however we do not mark the buffered
|
|
* line as valid until the fetch unit for this buffer
|
|
* has receieved the response from the memory system.
|
|
*/
|
|
uint8_t *inst_buf = freeList.front();
|
|
reservedPCs.emplace(vaddr, inst_buf);
|
|
freeList.pop_front();
|
|
}
|
|
|
|
void
|
|
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
|
|
{
|
|
assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
|
|
wavefront->simdId, wavefront->wfSlotId,
|
|
wavefront->wfDynId, vaddr);
|
|
|
|
/**
|
|
* this address should have an entry reserved in the
|
|
* fetch buffer already, however it should be invalid
|
|
* until the fetch completes.
|
|
*/
|
|
auto reserved_pc = reservedPCs.find(vaddr);
|
|
assert(reserved_pc != reservedPCs.end());
|
|
bufferedPCs.emplace(vaddr, reserved_pc->second);
|
|
|
|
if (readPtr == bufEnd) {
|
|
readPtr = bufStart;
|
|
}
|
|
|
|
reserved_pc->second = nullptr;
|
|
reservedPCs.erase(reserved_pc);
|
|
}
|
|
|
|
bool
|
|
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
|
|
{
|
|
return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
|
|
}
|
|
|
|
void
|
|
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
|
|
{
|
|
Addr cur_wave_pc = roundDown(wavefront->pc(),
|
|
wavefront->computeUnit->cacheLineSize());
|
|
if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
|
|
"being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
|
|
wavefront->wfDynId, cur_wave_pc);
|
|
|
|
// should be reserved, but not buffered yet
|
|
assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());
|
|
|
|
return;
|
|
}
|
|
|
|
auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
|
|
auto oldest_buffered_pc = bufferedPCs.begin();
|
|
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
|
|
"(PC = %#x) can be released.\n", wavefront->simdId,
|
|
wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
|
|
wavefront->pc());
|
|
|
|
#ifdef DEBUG
|
|
int idx = 0;
|
|
for (const auto &buf_pc : bufferedPCs) {
|
|
DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
|
|
++idx;
|
|
}
|
|
#endif
|
|
|
|
// if we haven't buffered data for this PC, we shouldn't
|
|
// be fetching from it.
|
|
assert(current_buffered_pc != bufferedPCs.end());
|
|
|
|
/**
|
|
* we're using a std::map so the addresses are sorted. if this
|
|
* PC is not the oldest one in the map, we must be fetching from
|
|
* a newer block, and we can release the oldest PC's fetch buffer
|
|
* entry back to the free list.
|
|
*/
|
|
if (current_buffered_pc != oldest_buffered_pc) {
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
|
|
"removing it from the fetch buffer.\n", wavefront->simdId,
|
|
wavefront->wfSlotId, wavefront->wfDynId,
|
|
oldest_buffered_pc->first);
|
|
|
|
freeList.emplace_back(oldest_buffered_pc->second);
|
|
oldest_buffered_pc->second = nullptr;
|
|
bufferedPCs.erase(oldest_buffered_pc);
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
|
|
wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
|
|
bufferedLines());
|
|
}
|
|
}
|
|
|
|
void
|
|
FetchUnit::FetchBufDesc::decodeInsts()
|
|
{
|
|
assert(readPtr);
|
|
|
|
if (splitDecode()) {
|
|
decodeSplitInst();
|
|
}
|
|
|
|
while (wavefront->instructionBuffer.size() < maxIbSize
|
|
&& hasFetchDataToProcess()) {
|
|
if (splitDecode()) {
|
|
decodeSplitInst();
|
|
} else {
|
|
TheGpuISA::MachInst mach_inst
|
|
= reinterpret_cast<TheGpuISA::MachInst>(readPtr);
|
|
GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
|
|
readPtr += gpu_static_inst->instSize();
|
|
|
|
assert(readPtr <= bufEnd);
|
|
|
|
GPUDynInstPtr gpu_dyn_inst
|
|
= std::make_shared<GPUDynInst>(wavefront->computeUnit,
|
|
wavefront, gpu_static_inst,
|
|
wavefront->computeUnit->
|
|
getAndIncSeqNum());
|
|
wavefront->instructionBuffer.push_back(gpu_dyn_inst);
|
|
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
|
|
"%d bytes remain.\n", wavefront->simdId,
|
|
wavefront->wfSlotId, wavefront->wfDynId,
|
|
gpu_static_inst->disassemble(),
|
|
gpu_static_inst->instSize(),
|
|
fetchBytesRemaining());
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
FetchUnit::FetchBufDesc::decodeSplitInst()
|
|
{
|
|
TheGpuISA::RawMachInst split_inst = 0;
|
|
int dword_size = sizeof(uint32_t);
|
|
int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;
|
|
|
|
for (int i = 0; i < num_dwords; ++i) {
|
|
replaceBits(split_inst, 32*(i+1)-1, 32*i,
|
|
*reinterpret_cast<uint32_t*>(readPtr));
|
|
if (readPtr + dword_size >= bufEnd) {
|
|
readPtr = bufStart;
|
|
}
|
|
}
|
|
|
|
assert(readPtr == bufStart);
|
|
|
|
TheGpuISA::MachInst mach_inst
|
|
= reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
|
|
GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
|
|
readPtr += (gpu_static_inst->instSize() - dword_size);
|
|
assert(readPtr < bufEnd);
|
|
|
|
GPUDynInstPtr gpu_dyn_inst
|
|
= std::make_shared<GPUDynInst>(wavefront->computeUnit,
|
|
wavefront, gpu_static_inst,
|
|
wavefront->computeUnit->
|
|
getAndIncSeqNum());
|
|
wavefront->instructionBuffer.push_back(gpu_dyn_inst);
|
|
|
|
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
|
|
"(%d bytes). %d bytes remain in %d buffered lines.\n",
|
|
wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
|
|
gpu_static_inst->disassemble(), split_inst,
|
|
gpu_static_inst->instSize(), fetchBytesRemaining(),
|
|
bufferedLines());
|
|
}
|
|
|
|
bool
|
|
FetchUnit::FetchBufDesc::splitDecode() const
|
|
{
|
|
/**
|
|
* if a read of a raw instruction would go beyond the end
|
|
* of the fetch buffer, then we must perform a split decode.
|
|
*/
|
|
bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
|
|
|
|
return is_split;
|
|
}
|
|
|
|
int
|
|
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
|
|
{
|
|
int bytes_remaining = 0;
|
|
|
|
if (bufferedLines() && readPtr != bufEnd) {
|
|
auto last_buf_pc = bufferedPCs.rbegin();
|
|
uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
|
|
int byte_diff = end_ptr - readPtr;
|
|
|
|
if (end_ptr > readPtr) {
|
|
bytes_remaining = byte_diff;
|
|
} else if (end_ptr < readPtr) {
|
|
bytes_remaining = bufferedBytes() + byte_diff;
|
|
}
|
|
}
|
|
|
|
assert(bytes_remaining <= bufferedBytes());
|
|
return bytes_remaining;
|
|
}
|
|
|
|
} // namespace gem5
|