gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

View File

@@ -45,11 +45,9 @@
uint32_t FetchUnit::globalFetchUnitID;
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
timingSim(true),
computeUnit(nullptr),
fetchScheduler(params),
waveList(nullptr)
FetchUnit::FetchUnit(const ComputeUnitParams* params)
: timingSim(true), computeUnit(nullptr), fetchScheduler(params),
waveList(nullptr), fetchDepth(params->fetch_depth)
{
}
@@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu)
timingSim = computeUnit->shader->timingSim;
fetchQueue.clear();
fetchStatusQueue.resize(computeUnit->shader->n_wf);
fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
Wavefront *wf = waveList->at(i);
assert(wf->wfSlotId == i);
fetchStatusQueue[i] = std::make_pair(wf, false);
fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
fetchBuf[i].decoder(&decoder);
}
fetchScheduler.bindList(&fetchQueue);
@@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu)
void
FetchUnit::exec()
{
/**
* now we check if any of the fetch buffers have
* buffered instruction data that can be decoded
* and sent to its wavefront's instruction buffer.
* then we check if any of the fetch buffer entries
* can be released. we only check if we can
* release a buffer
*/
for (auto &fetch_buf : fetchBuf) {
if (!fetch_buf.hasFreeSpace()) {
fetch_buf.checkWaveReleaseBuf();
}
if (fetch_buf.hasFetchDataToProcess()) {
fetch_buf.decodeInsts();
}
}
// re-evaluate waves which are marked as not ready for fetch
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
// Following code assumes 64-bit operation and all insts are
@@ -88,9 +108,10 @@ FetchUnit::exec()
// 4 or less instructions and it can not have any branches to
// prevent speculative instruction fetches
if (!fetchStatusQueue[j].second) {
if (curWave->status == Wavefront::S_RUNNING &&
curWave->instructionBuffer.size() <= 4 &&
!curWave->instructionBufferHasBranch() &&
if ((curWave->getStatus() == Wavefront::S_RUNNING ||
curWave->getStatus() == Wavefront::S_WAITCNT) &&
fetchBuf[j].hasFreeSpace() &&
!curWave->stopFetch() &&
!curWave->pendingFetch) {
fetchQueue.push_back(curWave);
fetchStatusQueue[j].second = true;
@@ -111,45 +132,38 @@ FetchUnit::exec()
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
// calculate the virtual address to fetch from the SQC
Addr vaddr = wavefront->pc();
assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());
/**
* the instruction buffer holds one instruction per entry, regardless
* of the underlying instruction's size. the PC, however, addresses
instructions on a 32b granularity so we must account for that here.
*/
for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
vaddr +=
wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
}
vaddr = wavefront->basePtr + vaddr;
* calculate the virtual address to fetch from the SQC. the fetch
* buffer holds a configurable number of cache lines. we start
* fetching at the address of the cache line immediately following
* the buffered line(s).
*/
Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();
// this should already be aligned to a cache line
assert(vaddr == makeLineAddress(vaddr,
computeUnit->getCacheLineBits()));
// shouldn't be fetching a line that is already buffered
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
"from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
// Since this is an instruction prefetch, if you're split then just finish
// out the current line.
int block_size = computeUnit->cacheLineSize();
// check for split accesses
Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
int size = block_size;
if (split_addr > vaddr) {
// misaligned access, just grab the rest of the line
size = split_addr - vaddr;
}
// set up virtual request
RequestPtr req = std::make_shared<Request>(
vaddr, size, Request::INST_FETCH,
vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
computeUnit->masterId(), 0, 0, nullptr);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
// This fetchBlock is kind of faux right now - because the translations so
// far don't actually return Data
uint64_t fetchBlock;
pkt->dataStatic(&fetchBlock);
if (timingSim) {
// SenderState needed on Return
@@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
// this is necessary because the GPU TLB receives packets instead of
// requests. when the translation is complete, all relevant fields in the
// request will be populated, but not in the packet. here we create the
// new packet so we can set the size, addr, and proper flags.
/**
* this is necessary because the GPU TLB receives packets instead of
 * requests. when the translation is complete, all relevant fields in
* the request will be populated, but not in the packet. here we create
* the new packet so we can set the size, addr, and proper flags.
*/
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
delete oldPkt;
TheGpuISA::RawMachInst *data =
new TheGpuISA::RawMachInst[pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst)];
pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
/**
* we should have reserved an entry in the fetch buffer
* for this cache line. here we get the pointer to the
* entry used to buffer this request's line data.
*/
pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
.reservedBuf(pkt->req->getVaddr()));
// New SenderState for the memory access
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
@@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
"%d bytes, %d instructions!\n", computeUnit->cu_id,
wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
pkt->req->getSize(), pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst));
"%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
if (wavefront->dropFetch) {
assert(wavefront->instructionBuffer.empty());
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
TheGpuISA::RawMachInst *inst_index_ptr =
(TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
assert(wavefront->instructionBuffer.size() <= 4);
for (int i = 0; i < pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst); ++i) {
GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
assert(inst_ptr);
if (inst_ptr->instSize() == 8) {
/**
* this instruction occupies 2 consecutive
* entries in the instruction array, the
* second of which contains a nullptr. so if
* this inst is 8 bytes we advance two entries
* instead of 1
*/
++i;
}
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, inst_ptr->disassemble());
GPUDynInstPtr gpuDynInst =
std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
computeUnit->getAndIncSeqNum());
wavefront->instructionBuffer.push_back(gpuDynInst);
}
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
}
wavefront->pendingFetch = false;
@@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
delete pkt;
}
/**
 * Flush the fetch buffer belonging to the given wavefront slot,
 * discarding any buffered instruction data it holds.
 */
void
FetchUnit::flushBuf(int wfSlotId)
{
    auto &wf_fetch_buf = fetchBuf.at(wfSlotId);
    wf_fetch_buf.flushBuf();
}
/**
 * Bind this fetch unit to the CU's wavefront list; the fetch unit
 * reads per-WF state through this pointer.
 */
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    this->waveList = wave_list;
}
/** FetchBufDesc */
/**
 * Allocate the raw instruction byte buffer for one wavefront's fetch
 * state. The buffer holds fetch_depth cache lines; each line-sized
 * slot starts out on the free list.
 */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    wavefront = wf;
    maxIbSize = wavefront->maxIbSize;
    fetchDepth = fetch_depth;
    cacheLineSize = cache_line_size;
    maxFbSize = cacheLineSize * fetchDepth;

    // derive the number of bits needed to address within a cache line;
    // this only works for power-of-two line sizes
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    bufEnd = bufStart + maxFbSize;
    readPtr = bufStart;

    // initially every line-sized slot in the buffer is free
    for (int line = 0; line < fetchDepth; ++line) {
        freeList.push_back(bufStart + line * cacheLineSize);
    }
}
/**
 * Reset this fetch buffer to its empty state: drop all buffered and
 * reserved lines, return every line-sized slot to the free list, and
 * arrange for the next fetch to restart from the wavefront's PC.
 */
void
FetchUnit::FetchBufDesc::flushBuf()
{
    restartFromBranch = true;

    // rebuild the free list from scratch; clearing it first avoids
    // duplicate entries for slots that were already free
    freeList.clear();
    bufferedPCs.clear();
    reservedPCs.clear();
    readPtr = bufStart;

    for (int slot = 0; slot < fetchDepth; ++slot) {
        freeList.emplace_back(bufStart + slot * cacheLineSize);
    }

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
            "buffer\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId);
}
/**
 * Compute the virtual address of the next cache line to fetch. If
 * lines are already buffered (or in flight), continue sequentially
 * one line past the newest of them; otherwise restart at the line
 * containing the wavefront's current PC.
 */
Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    if (!bufferedAndReservedLines()) {
        /**
         * nothing is buffered or in flight: this is the initial
         * fetch, or the first fetch after a branch, so the line
         * address comes straight from the WF's PC.
         */
        Addr restart_line = makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * after a branch the PC may land mid-line, so the decode read
         * pointer must be advanced by the PC's offset from the start
         * of its cache line.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset
                = wavefront->pc() - makeLineAddress(wavefront->pc(),
                                                    cacheLineBits);
            readPtr += byte_offset;
        }

        return restart_line;
    }

    /**
     * continue sequentially: take the newest line we have requested
     * (an in-flight reservation if there is one, otherwise the newest
     * completed line) and step one cache line past it.
     */
    Addr last_line_fetched = reservedLines()
        ? reservedPCs.rbegin()->first : bufferedPCs.rbegin()->first;
    Addr next_line = last_line_fetched + cacheLineSize;

    // must not re-fetch a line that is already buffered or in flight
    assert(bufferedPCs.find(next_line) == bufferedPCs.end());
    assert(reservedPCs.find(next_line) == reservedPCs.end());

    return next_line;
}
/**
 * Take one line-sized slot off the free list and mark it reserved for
 * the cache line at vaddr. The slot does not become a valid buffered
 * line until the fetch response returns from the memory system.
 */
void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    // there must be a free slot, and vaddr must be neither
    // buffered nor already reserved
    assert(hasFreeSpace());
    assert(bufferedAndReservedLines() < fetchDepth);
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * move the slot out of the free list, but keep the line marked
     * invalid (reserved rather than buffered) until the fetch unit
     * has received the response from the memory system.
     */
    reservedPCs.emplace(vaddr, freeList.front());
    freeList.pop_front();
}
/**
 * Promote the reserved slot for vaddr to a valid buffered line now
 * that its fetch response has returned from the memory system.
 */
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * the line must have been reserved before the fetch was issued;
     * it has been invalid until now.
     */
    auto reserved_it = reservedPCs.find(vaddr);
    assert(reserved_it != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_it->second);

    // wrap the decode read pointer if it ran off the end of the buffer
    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_it->second = nullptr;
    reservedPCs.erase(reserved_it);
}
/**
 * True when enough undecoded bytes are buffered to read at least one
 * raw machine instruction word.
 */
bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
    const auto min_decode_bytes = sizeof(TheGpuISA::RawMachInst);
    return fetchBytesRemaining() >= min_decode_bytes;
}
/**
 * Try to release a stale fetch buffer entry back to the free list.
 * Called when the buffer has no free space: if the wavefront's PC has
 * moved past the oldest buffered line, that line's slot is freed so a
 * new fetch can be initiated.
 */
void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    // cache line containing the wavefront's current PC
    Addr cur_wave_pc = roundDown(wavefront->pc(),
                                 wavefront->computeUnit->cacheLineSize());
    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
                "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
                wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        // the line holding the current PC is still in flight, so
        // nothing can be released yet
        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
            "(PC = %#x) can be released.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
            wavefront->pc());

#ifdef DEBUG
    // debug builds: dump the line address of every buffered entry
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
                "removing it from the fetch buffer.\n", wavefront->simdId,
                wavefront->wfSlotId, wavefront->wfDynId,
                oldest_buffered_pc->first);

        // return the oldest line's slot to the free list before
        // erasing its map entry
        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
                wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
                bufferedLines());
    }
}
/**
 * Decode buffered instruction bytes into GPUDynInsts and push them onto
 * the wavefront's instruction buffer, stopping when the IB is full or
 * there are not enough buffered bytes left to decode another inst.
 */
void
FetchUnit::FetchBufDesc::decodeInsts()
{
    assert(readPtr);

    // an instruction's bytes may straddle the end of the circular
    // buffer; that wrap-around case is decoded separately
    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            TheGpuISA::MachInst mach_inst
                = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            // advance past however many bytes the decoded inst occupies
            readPtr += gpu_static_inst->instSize();
            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst
                = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                               wavefront, gpu_static_inst,
                                               wavefront->computeUnit->
                                                   getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                    "%d bytes remain.\n", wavefront->simdId,
                    wavefront->wfSlotId, wavefront->wfDynId,
                    gpu_static_inst->disassemble(),
                    gpu_static_inst->instSize(),
                    fetchBytesRemaining());
        }
    }
}
/**
 * Decode a single instruction whose bytes wrap around the end of the
 * circular fetch buffer. The dwords are copied into a contiguous
 * temporary (split_inst) before being handed to the decoder.
 */
void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    for (int i = 0; i < num_dwords; ++i) {
        ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
        // wrap the read pointer to the start of the buffer once the
        // dword at the end of the buffer has been consumed.
        // NOTE(review): readPtr only moves via this wrap, so this loop
        // appears to rely on the split landing exactly one dword before
        // bufEnd — confirm against splitDecode()'s condition.
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst
        = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    // advance past the portion of this inst's bytes that live at the
    // start of the buffer (one dword was consumed before the wrap)
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst
        = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                       wavefront, gpu_static_inst,
                                       wavefront->computeUnit->
                                           getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
            "(%d bytes). %d bytes remain in %d buffered lines.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            gpu_static_inst->disassemble(), split_inst,
            gpu_static_inst->instSize(), fetchBytesRemaining(),
            bufferedLines());
}
/**
 * True when reading one raw machine instruction at readPtr would run
 * past the end of the fetch buffer, meaning the instruction's bytes
 * wrap around and must be decoded via decodeSplitInst().
 */
bool
FetchUnit::FetchBufDesc::splitDecode() const
{
    return (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
}
/**
 * Number of buffered instruction bytes not yet decoded, accounting
 * for the read pointer possibly having wrapped around relative to the
 * end of the newest buffered line.
 */
int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        // bufferedPCs is sorted by address, so rbegin() is the newest
        // (highest-address) buffered line; end_ptr is one past its data
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            // readPtr is behind the end of the newest line's slot
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            // readPtr sits past the newest line's slot in the circular
            // buffer; the (negative) difference is offset by the total
            // number of buffered bytes
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());
    return bytes_remaining;
}