/*
 * Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/fetch_unit.hh"

#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "arch/amdgpu/common/tlb.hh"
#include "base/bitfield.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/ruby/system/RubySystem.hh"

namespace gem5
{

uint32_t FetchUnit::globalFetchUnitID;

FetchUnit::FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
    : timingSim(true), computeUnit(cu), fetchScheduler(p),
      waveList(nullptr), fetchDepth(p.fetch_depth)
{
}

FetchUnit::~FetchUnit()
{
    fetchQueue.clear();
    fetchStatusQueue.clear();
}

/**
 * Set up per-wavefront fetch state: one status entry and one fetch
 * buffer (fetchDepth cache lines deep) per WF slot, then bind the
 * fetch queue to the scheduler.
 */
void
FetchUnit::init()
{
    timingSim = computeUnit.shader->timingSim;
    fetchQueue.clear();
    fetchStatusQueue.resize(computeUnit.shader->n_wf);
    fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc());

    for (int i = 0; i < computeUnit.shader->n_wf; ++i) {
        Wavefront *wf = waveList->at(i);
        assert(wf->wfSlotId == i);
        fetchStatusQueue[i] = std::make_pair(wf, false);
        fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf);
        fetchBuf[i].decoder(&decoder);
    }

    fetchScheduler.bindList(&fetchQueue);
}

void
FetchUnit::exec()
{
    /**
     * now we check if any of the fetch buffers have
     * buffered instruction data that can be decoded
     * and sent to its wavefront's instruction buffer.
     * then we check if any of the fetch buffer entries
     * can be released. we only check if we can
     * release a buffer
     */
    for (auto &fetch_buf : fetchBuf) {
        if (!fetch_buf.hasFreeSpace()) {
            fetch_buf.checkWaveReleaseBuf();
        }
        if (fetch_buf.hasFetchDataToProcess()) {
            fetch_buf.decodeInsts();
        }
    }

    // re-evaluate waves which are marked as not ready for fetch
    for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
        // Following code assumes 64-bit operation and all insts are
        // represented by 64-bit pointers to inst objects.
        Wavefront *curWave = fetchStatusQueue[j].first;
        assert(curWave);

        // The wavefront has to be active, the IB occupancy has to be
        // 4 or less instructions and it can not have any branches to
        // prevent speculative instruction fetches
        if (!fetchStatusQueue[j].second) {
            if ((curWave->getStatus() == Wavefront::S_RUNNING ||
                curWave->getStatus() == Wavefront::S_WAITCNT) &&
                fetchBuf[j].hasFreeSpace() &&
                !curWave->stopFetch() &&
                !curWave->pendingFetch) {
                fetchQueue.push_back(curWave);
                fetchStatusQueue[j].second = true;
            }
        }
    }

    // Fetch only if there is some wave ready to be fetched
    // An empty fetchQueue will cause the scheduler to panic
    if (fetchQueue.size()) {
        Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
        waveToBeFetched->pendingFetch = true;
        fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
        initiateFetch(waveToBeFetched);
    }
}

void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
    assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());

    /**
     * calculate the virtual address to fetch from the SQC. the fetch
     * buffer holds a configurable number of cache lines. we start
     * fetching at the address of the cache line immediately following
     * the buffered line(s).
     */
    Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();

    // this should already be aligned to a cache line
    assert(vaddr == ruby::makeLineAddress(vaddr,
        computeUnit.getCacheLineBits()));

    // shouldn't be fetching a line that is already buffered
    assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));

    fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
            "from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
            computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);

    // set up virtual request
    RequestPtr req = std::make_shared<Request>(
        vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH,
        computeUnit.requestorId(), 0, 0, nullptr);

    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);

    if (timingSim) {
        // SenderState needed on Return
        pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);

        // Sender State needed by TLB hierarchy
        pkt->senderState =
            new GpuTranslationState(BaseMMU::Execute,
                                    computeUnit.shader->gpuTc,
                                    false, pkt->senderState);

        if (computeUnit.sqcTLBPort.isStalled()) {
            assert(computeUnit.sqcTLBPort.retries.size() > 0);

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit.sqcTLBPort.retries.push_back(pkt);
        } else if (!computeUnit.sqcTLBPort.sendTimingReq(pkt)) {
            // Stall the data port;
            // No more packet is issued till
            // ruby indicates resources are freed by
            // a recvReqRetry() call back on this port.
            computeUnit.sqcTLBPort.stallPort();

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit.sqcTLBPort.retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n",
                    vaddr);
        }
    } else {
        pkt->senderState =
            new GpuTranslationState(BaseMMU::Execute,
                                    computeUnit.shader->gpuTc);

        computeUnit.sqcTLBPort.sendFunctional(pkt);

        /**
         * For full system, if this is a device request we need to set the
         * requestor ID of the packet to the GPU memory manager so it is
         * routed through Ruby as a memory request and not a PIO request.
         */
        if (!pkt->req->systemReq()) {
            pkt->req->requestorId(computeUnit.vramRequestorId());
        }

        GpuTranslationState *sender_state =
            safe_cast<GpuTranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete sender_state;

        // fetch the instructions from the SQC when we operate in
        // functional mode only
        fetch(pkt, wavefront);
    }
}

void
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
{
    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
            computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
            pkt->req->getPaddr());

    /**
     * this is necessary because the GPU TLB receives packets instead of
     * requests. when the translation is complete, all relevant fields in
     * the request will be populated, but not in the packet. here we create
     * the new packet so we can set the size, addr, and proper flags.
     */
    PacketPtr oldPkt = pkt;
    pkt = new Packet(oldPkt->req, oldPkt->cmd);
    delete oldPkt;

    /**
     * if we have not reserved an entry in the fetch buffer,
     * stop fetching. this can happen due to a branch instruction
     * flushing the fetch buffer while an ITLB or I-cache request is still
     * pending, in the same cycle another instruction is trying to fetch.
     */
    if (!fetchBuf.at(wavefront->wfSlotId).isReserved(pkt->req->getVaddr())) {
        wavefront->dropFetch = false;
        wavefront->pendingFetch = false;
        return;
    }

    /**
     * For full system, if this is a device request we need to set the
     * requestor ID of the packet to the GPU memory manager so it is routed
     * through Ruby as a memory request and not a PIO request.
     */
    if (!pkt->req->systemReq()) {
        pkt->req->requestorId(computeUnit.vramRequestorId());
    }

    /**
     * we should have reserved an entry in the fetch buffer
     * for this cache line. here we get the pointer to the
     * entry used to buffer this request's line data.
     */
    pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
                    .reservedBuf(pkt->req->getVaddr()));

    // New SenderState for the memory access
    pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);

    if (timingSim) {
        // translation is done. Send the appropriate timing memory request.
        if (pkt->req->systemReq()) {
            SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
            assert(computeUnit.shader->systemHub);
            computeUnit.shader->systemHub->sendRequest(pkt, resp_event);
        } else if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
            computeUnit.sqcPort.retries.push_back(std::make_pair(pkt,
                                                                 wavefront));

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
                    computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        } else {
            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
                    computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        }
    } else {
        computeUnit.sqcPort.sendFunctional(pkt);
        processFetchReturn(pkt);
    }
}

void
FetchUnit::processFetchReturn(PacketPtr pkt)
{
    ComputeUnit::SQCPort::SenderState *sender_state =
        safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);

    Wavefront *wavefront = sender_state->wavefront;

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
            "%d bytes!\n", computeUnit.cu_id, wavefront->simdId,
            wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());

    if (wavefront->dropFetch) {
        assert(wavefront->instructionBuffer.empty());
        assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
        wavefront->dropFetch = false;
    } else {
        fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
    }

    wavefront->pendingFetch = false;

    delete pkt->senderState;
    delete pkt;
}

void
FetchUnit::flushBuf(int wfSlotId)
{
    fetchBuf.at(wfSlotId).flushBuf();
}

void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    waveList = wave_list;
}

/** FetchBufDesc */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    wavefront = wf;
    fetchDepth = fetch_depth;
    maxIbSize = wavefront->maxIbSize;
    cacheLineSize = cache_line_size;
    maxFbSize = cacheLineSize * fetchDepth;

    // Calculate the number of bits to address a cache line
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    readPtr = bufStart;
    bufEnd = bufStart + maxFbSize;

    for (int i = 0; i < fetchDepth; ++i) {
        freeList.emplace_back(readPtr + i * cacheLineSize);
    }
}

void
FetchUnit::FetchBufDesc::flushBuf()
{
    restartFromBranch = true;
    /**
     * free list may have some entries
     * so we clear it here to avoid duplicates
     */
    freeList.clear();
    bufferedPCs.clear();
    reservedPCs.clear();
    readPtr = bufStart;

    for (int i = 0; i < fetchDepth; ++i) {
        freeList.push_back(bufStart + i * cacheLineSize);
    }

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
            "buffer\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId);
}

Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    Addr next_line = 0;

    if (bufferedAndReservedLines()) {
        Addr last_line_fetched = 0;
        if (!reservedLines()) {
            /**
             * get the PC of the most recently fetched cache line,
             * then return the address of the next line.
             */
            last_line_fetched = bufferedPCs.rbegin()->first;
        } else {
            last_line_fetched = reservedPCs.rbegin()->first;
        }

        next_line = last_line_fetched + cacheLineSize;

        /**
         * should not be trying to fetch a line that has already
         * been fetched.
         */
        assert(bufferedPCs.find(next_line) == bufferedPCs.end());
        assert(reservedPCs.find(next_line) == reservedPCs.end());
    } else {
        /**
         * we do not have any buffered cache lines yet, so we
         * assume this is the initial fetch, or the first fetch
         * after a branch, and get the PC directly from the WF.
         * in the case of a branch, we may not start at the
         * beginning of a cache line, so we adjust the readPtr by
         * the current PC's offset from the start of the line.
         */
        next_line = ruby::makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * if we are here we have no buffered lines. in the case we flushed
         * the buffer due to a branch, we may need to start fetching from
         * some offset from the start of the fetch buffer, so we adjust for
         * that here.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset =
                wavefront->pc() - ruby::makeLineAddress(wavefront->pc(),
                                                        cacheLineBits);
            readPtr += byte_offset;
        }
    }

    return next_line;
}

void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    // we should have free buffer space, and the line
    // at vaddr should not already be cached.
    assert(hasFreeSpace());
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());
    assert(bufferedAndReservedLines() < fetchDepth);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * we reserve buffer space, by moving it out of the
     * free list, however we do not mark the buffered
     * line as valid until the fetch unit for this buffer
     * has received the response from the memory system.
     */
    uint8_t *inst_buf = freeList.front();
    reservedPCs.emplace(vaddr, inst_buf);
    freeList.pop_front();
}

void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * this address should have an entry reserved in the
     * fetch buffer already, however it should be invalid
     * until the fetch completes.
     */
    auto reserved_pc = reservedPCs.find(vaddr);
    assert(reserved_pc != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_pc->second);

    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_pc->second = nullptr;
    reservedPCs.erase(reserved_pc);
}

bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
    return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
}

void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    Addr cur_wave_pc = roundDown(wavefront->pc(),
                                 wavefront->computeUnit->cacheLineSize());
    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
                "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
                wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
            "(PC = %#x) can be released.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
            wavefront->pc());

#ifdef GEM5_DEBUG
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
                "removing it from the fetch buffer.\n", wavefront->simdId,
                wavefront->wfSlotId, wavefront->wfDynId,
                oldest_buffered_pc->first);

        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);

        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
                wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
                bufferedLines());
    }
}

void
FetchUnit::FetchBufDesc::decodeInsts()
{
    assert(readPtr);

    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            TheGpuISA::MachInst mach_inst =
                reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            readPtr += gpu_static_inst->instSize();

            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst =
                std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                             wavefront, gpu_static_inst,
                                             wavefront->computeUnit->
                                                 getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                    "%d bytes remain.\n", wavefront->simdId,
                    wavefront->wfSlotId, wavefront->wfDynId,
                    gpu_static_inst->disassemble(),
                    gpu_static_inst->instSize(),
                    fetchBytesRemaining());
        }
    }
}

void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    // assemble the raw inst dword-by-dword, wrapping the read pointer
    // back to the start of the fetch buffer when it runs off the end
    for (int i = 0; i < num_dwords; ++i) {
        replaceBits(split_inst, 32*(i+1)-1, 32*i,
            *reinterpret_cast<uint32_t*>(readPtr));
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst =
        reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst =
        std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                     wavefront, gpu_static_inst,
                                     wavefront->computeUnit->
                                         getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
            "(%d bytes). %d bytes remain in %d buffered lines.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            gpu_static_inst->disassemble(), split_inst,
            gpu_static_inst->instSize(), fetchBytesRemaining(),
            bufferedLines());
}

bool
FetchUnit::FetchBufDesc::splitDecode() const
{
    /**
     * if a read of a raw instruction would go beyond the end
     * of the fetch buffer, then we must perform a split decode.
     */
    bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;

    return is_split;
}

int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());

    return bytes_remaining;
}

void
FetchUnit::SystemHubEvent::process()
{
    reqPkt->makeResponse();
    fetchUnit->computeUnit.handleSQCReturn(reqPkt);
}

} // namespace gem5