gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

View File

@@ -45,11 +45,9 @@
uint32_t FetchUnit::globalFetchUnitID;
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
timingSim(true),
computeUnit(nullptr),
fetchScheduler(params),
waveList(nullptr)
FetchUnit::FetchUnit(const ComputeUnitParams* params)
: timingSim(true), computeUnit(nullptr), fetchScheduler(params),
waveList(nullptr), fetchDepth(params->fetch_depth)
{
}
@@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu)
timingSim = computeUnit->shader->timingSim;
fetchQueue.clear();
fetchStatusQueue.resize(computeUnit->shader->n_wf);
fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
Wavefront *wf = waveList->at(i);
assert(wf->wfSlotId == i);
fetchStatusQueue[i] = std::make_pair(wf, false);
fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
fetchBuf[i].decoder(&decoder);
}
fetchScheduler.bindList(&fetchQueue);
@@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu)
void
FetchUnit::exec()
{
/**
* now we check if any of the fetch buffers have
* buffered instruction data that can be decoded
* and sent to its wavefront's instruction buffer.
* then we check if any of the fetch buffer entries
* can be released. we only check if we can
* release a buffer
*/
for (auto &fetch_buf : fetchBuf) {
if (!fetch_buf.hasFreeSpace()) {
fetch_buf.checkWaveReleaseBuf();
}
if (fetch_buf.hasFetchDataToProcess()) {
fetch_buf.decodeInsts();
}
}
// re-evaluate waves which are marked as not ready for fetch
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
// Following code assumes 64-bit operation and all insts are
@@ -88,9 +108,10 @@ FetchUnit::exec()
// 4 or less instructions and it can not have any branches to
// prevent speculative instruction fetches
if (!fetchStatusQueue[j].second) {
if (curWave->status == Wavefront::S_RUNNING &&
curWave->instructionBuffer.size() <= 4 &&
!curWave->instructionBufferHasBranch() &&
if ((curWave->getStatus() == Wavefront::S_RUNNING ||
curWave->getStatus() == Wavefront::S_WAITCNT) &&
fetchBuf[j].hasFreeSpace() &&
!curWave->stopFetch() &&
!curWave->pendingFetch) {
fetchQueue.push_back(curWave);
fetchStatusQueue[j].second = true;
@@ -111,45 +132,38 @@ FetchUnit::exec()
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
// calculate the virtual address to fetch from the SQC
Addr vaddr = wavefront->pc();
assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());
/**
* the instruction buffer holds one instruction per entry, regardless
* of the underlying instruction's size. the PC, however, addresses
instructions on a 32b granularity so we must account for that here.
*/
for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
vaddr +=
wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
}
vaddr = wavefront->basePtr + vaddr;
* calculate the virtual address to fetch from the SQC. the fetch
* buffer holds a configurable number of cache lines. we start
* fetching at the address of the cache line immediately following
* the buffered line(s).
*/
Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();
// this should already be aligned to a cache line
assert(vaddr == makeLineAddress(vaddr,
computeUnit->getCacheLineBits()));
// shouldn't be fetching a line that is already buffered
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
"from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
// Since this is an instruction prefetch, if you're split then just finish
// out the current line.
int block_size = computeUnit->cacheLineSize();
// check for split accesses
Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
int size = block_size;
if (split_addr > vaddr) {
// misaligned access, just grab the rest of the line
size = split_addr - vaddr;
}
// set up virtual request
RequestPtr req = std::make_shared<Request>(
vaddr, size, Request::INST_FETCH,
vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
computeUnit->masterId(), 0, 0, nullptr);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
// This fetchBlock is kind of faux right now - because the translations so
// far don't actually return Data
uint64_t fetchBlock;
pkt->dataStatic(&fetchBlock);
if (timingSim) {
// SenderState needed on Return
@@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
// this is necessary because the GPU TLB receives packets instead of
// requests. when the translation is complete, all relevant fields in the
// request will be populated, but not in the packet. here we create the
// new packet so we can set the size, addr, and proper flags.
/**
* this is necessary because the GPU TLB receives packets instead of
 * requests. when the translation is complete, all relevant fields in
* the request will be populated, but not in the packet. here we create
* the new packet so we can set the size, addr, and proper flags.
*/
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
delete oldPkt;
TheGpuISA::RawMachInst *data =
new TheGpuISA::RawMachInst[pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst)];
pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
/**
* we should have reserved an entry in the fetch buffer
* for this cache line. here we get the pointer to the
* entry used to buffer this request's line data.
*/
pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
.reservedBuf(pkt->req->getVaddr()));
// New SenderState for the memory access
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
@@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
"%d bytes, %d instructions!\n", computeUnit->cu_id,
wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
pkt->req->getSize(), pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst));
"%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
if (wavefront->dropFetch) {
assert(wavefront->instructionBuffer.empty());
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
TheGpuISA::RawMachInst *inst_index_ptr =
(TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
assert(wavefront->instructionBuffer.size() <= 4);
for (int i = 0; i < pkt->req->getSize() /
sizeof(TheGpuISA::RawMachInst); ++i) {
GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
assert(inst_ptr);
if (inst_ptr->instSize() == 8) {
/**
* this instruction occupies 2 consecutive
* entries in the instruction array, the
* second of which contains a nullptr. so if
* this inst is 8 bytes we advance two entries
* instead of 1
*/
++i;
}
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
computeUnit->cu_id, wavefront->simdId,
wavefront->wfSlotId, inst_ptr->disassemble());
GPUDynInstPtr gpuDynInst =
std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
computeUnit->getAndIncSeqNum());
wavefront->instructionBuffer.push_back(gpuDynInst);
}
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
}
wavefront->pendingFetch = false;
@@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
delete pkt;
}
/**
 * Flush the fetch buffer belonging to the given wavefront slot,
 * discarding any buffered instruction data it holds.
 */
void
FetchUnit::flushBuf(int wfSlotId)
{
    auto &wf_fetch_buf = fetchBuf.at(wfSlotId);
    wf_fetch_buf.flushBuf();
}
/**
 * Bind this fetch unit to the CU's wavefront list; the fetch unit
 * reads per-WF state through this pointer.
 */
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    this->waveList = wave_list;
}
/** FetchBufDesc */
/**
 * Allocate the raw instruction byte buffer for one wavefront's fetch
 * state. The buffer holds fetch_depth cache lines; each line-sized
 * slot starts out on the free list.
 */
void
FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
                                     Wavefront *wf)
{
    wavefront = wf;
    maxIbSize = wavefront->maxIbSize;
    fetchDepth = fetch_depth;
    cacheLineSize = cache_line_size;
    maxFbSize = cacheLineSize * fetchDepth;

    // derive the number of bits needed to address within a cache line;
    // this only works for power-of-two line sizes
    panic_if(!isPowerOf2(cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(cacheLineSize);

    bufStart = new uint8_t[maxFbSize];
    bufEnd = bufStart + maxFbSize;
    readPtr = bufStart;

    // initially every line-sized slot in the buffer is free
    for (int line = 0; line < fetchDepth; ++line) {
        freeList.push_back(bufStart + line * cacheLineSize);
    }
}
/**
 * Reset this fetch buffer to its empty state: drop all buffered and
 * reserved lines, return every line-sized slot to the free list, and
 * arrange for the next fetch to restart from the wavefront's PC.
 */
void
FetchUnit::FetchBufDesc::flushBuf()
{
    restartFromBranch = true;

    // rebuild the free list from scratch; clearing it first avoids
    // duplicate entries for slots that were already free
    freeList.clear();
    bufferedPCs.clear();
    reservedPCs.clear();
    readPtr = bufStart;

    for (int slot = 0; slot < fetchDepth; ++slot) {
        freeList.emplace_back(bufStart + slot * cacheLineSize);
    }

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
            "buffer\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId);
}
/**
 * Compute the virtual address of the next cache line to fetch. If
 * lines are already buffered (or in flight), continue sequentially
 * one line past the newest of them; otherwise restart at the line
 * containing the wavefront's current PC.
 */
Addr
FetchUnit::FetchBufDesc::nextFetchAddr()
{
    if (!bufferedAndReservedLines()) {
        /**
         * nothing is buffered or in flight: this is the initial
         * fetch, or the first fetch after a branch, so the line
         * address comes straight from the WF's PC.
         */
        Addr restart_line = makeLineAddress(wavefront->pc(), cacheLineBits);
        readPtr = bufStart;

        /**
         * after a branch the PC may land mid-line, so the decode read
         * pointer must be advanced by the PC's offset from the start
         * of its cache line.
         */
        if (restartFromBranch) {
            restartFromBranch = false;
            int byte_offset
                = wavefront->pc() - makeLineAddress(wavefront->pc(),
                                                    cacheLineBits);
            readPtr += byte_offset;
        }

        return restart_line;
    }

    /**
     * continue sequentially: take the newest line we have requested
     * (an in-flight reservation if there is one, otherwise the newest
     * completed line) and step one cache line past it.
     */
    Addr last_line_fetched = reservedLines()
        ? reservedPCs.rbegin()->first : bufferedPCs.rbegin()->first;
    Addr next_line = last_line_fetched + cacheLineSize;

    // must not re-fetch a line that is already buffered or in flight
    assert(bufferedPCs.find(next_line) == bufferedPCs.end());
    assert(reservedPCs.find(next_line) == reservedPCs.end());

    return next_line;
}
/**
 * Take one line-sized slot off the free list and mark it reserved for
 * the cache line at vaddr. The slot does not become a valid buffered
 * line until the fetch response returns from the memory system.
 */
void
FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
{
    // there must be a free slot, and vaddr must be neither
    // buffered nor already reserved
    assert(hasFreeSpace());
    assert(bufferedAndReservedLines() < fetchDepth);
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
    assert(reservedPCs.find(vaddr) == reservedPCs.end());

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * move the slot out of the free list, but keep the line marked
     * invalid (reserved rather than buffered) until the fetch unit
     * has received the response from the memory system.
     */
    reservedPCs.emplace(vaddr, freeList.front());
    freeList.pop_front();
}
/**
 * Promote the reserved slot for vaddr to a valid buffered line now
 * that its fetch response has returned from the memory system.
 */
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
{
    assert(bufferedPCs.find(vaddr) == bufferedPCs.end());

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            wavefront->simdId, wavefront->wfSlotId,
            wavefront->wfDynId, vaddr);

    /**
     * the line must have been reserved before the fetch was issued;
     * it has been invalid until now.
     */
    auto reserved_it = reservedPCs.find(vaddr);
    assert(reserved_it != reservedPCs.end());
    bufferedPCs.emplace(vaddr, reserved_it->second);

    // wrap the decode read pointer if it ran off the end of the buffer
    if (readPtr == bufEnd) {
        readPtr = bufStart;
    }

    reserved_it->second = nullptr;
    reservedPCs.erase(reserved_it);
}
/**
 * True when enough undecoded bytes are buffered to read at least one
 * raw machine instruction word.
 */
bool
FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
{
    const auto min_decode_bytes = sizeof(TheGpuISA::RawMachInst);
    return fetchBytesRemaining() >= min_decode_bytes;
}
/**
 * Try to release a stale fetch buffer entry back to the free list.
 * Called when the buffer has no free space: if the wavefront's PC has
 * moved past the oldest buffered line, that line's slot is freed so a
 * new fetch can be initiated.
 */
void
FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
{
    // cache line containing the wavefront's current PC
    Addr cur_wave_pc = roundDown(wavefront->pc(),
                                 wavefront->computeUnit->cacheLineSize());
    if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
                "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
                wavefront->wfDynId, cur_wave_pc);

        // should be reserved, but not buffered yet
        assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());

        // the line holding the current PC is still in flight, so
        // nothing can be released yet
        return;
    }

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
    auto oldest_buffered_pc = bufferedPCs.begin();

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
            "(PC = %#x) can be released.\n", wavefront->simdId,
            wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
            wavefront->pc());

#ifdef DEBUG
    // debug builds: dump the line address of every buffered entry
    int idx = 0;
    for (const auto &buf_pc : bufferedPCs) {
        DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
        ++idx;
    }
#endif

    // if we haven't buffered data for this PC, we shouldn't
    // be fetching from it.
    assert(current_buffered_pc != bufferedPCs.end());

    /**
     * we're using a std::map so the addresses are sorted. if this
     * PC is not the oldest one in the map, we must be fetching from
     * a newer block, and we can release the oldest PC's fetch buffer
     * entry back to the free list.
     */
    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
                "removing it from the fetch buffer.\n", wavefront->simdId,
                wavefront->wfSlotId, wavefront->wfDynId,
                oldest_buffered_pc->first);

        // return the oldest line's slot to the free list before
        // erasing its map entry
        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;
        bufferedPCs.erase(oldest_buffered_pc);
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
                wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
                bufferedLines());
    }
}
/**
 * Decode buffered instruction bytes into GPUDynInsts and push them onto
 * the wavefront's instruction buffer, stopping when the IB is full or
 * there are not enough buffered bytes left to decode another inst.
 */
void
FetchUnit::FetchBufDesc::decodeInsts()
{
    assert(readPtr);

    // an instruction's bytes may straddle the end of the circular
    // buffer; that wrap-around case is decoded separately
    if (splitDecode()) {
        decodeSplitInst();
    }

    while (wavefront->instructionBuffer.size() < maxIbSize
           && hasFetchDataToProcess()) {
        if (splitDecode()) {
            decodeSplitInst();
        } else {
            TheGpuISA::MachInst mach_inst
                = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
            GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
            // advance past however many bytes the decoded inst occupies
            readPtr += gpu_static_inst->instSize();
            assert(readPtr <= bufEnd);

            GPUDynInstPtr gpu_dyn_inst
                = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                               wavefront, gpu_static_inst,
                                               wavefront->computeUnit->
                                                   getAndIncSeqNum());
            wavefront->instructionBuffer.push_back(gpu_dyn_inst);

            DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                    "%d bytes remain.\n", wavefront->simdId,
                    wavefront->wfSlotId, wavefront->wfDynId,
                    gpu_static_inst->disassemble(),
                    gpu_static_inst->instSize(),
                    fetchBytesRemaining());
        }
    }
}
/**
 * Decode a single instruction whose bytes wrap around the end of the
 * circular fetch buffer. The dwords are copied into a contiguous
 * temporary (split_inst) before being handed to the decoder.
 */
void
FetchUnit::FetchBufDesc::decodeSplitInst()
{
    TheGpuISA::RawMachInst split_inst = 0;
    int dword_size = sizeof(uint32_t);
    int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

    for (int i = 0; i < num_dwords; ++i) {
        ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
        // wrap the read pointer to the start of the buffer once the
        // dword at the end of the buffer has been consumed.
        // NOTE(review): readPtr only moves via this wrap, so this loop
        // appears to rely on the split landing exactly one dword before
        // bufEnd — confirm against splitDecode()'s condition.
        if (readPtr + dword_size >= bufEnd) {
            readPtr = bufStart;
        }
    }

    assert(readPtr == bufStart);

    TheGpuISA::MachInst mach_inst
        = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
    GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
    // advance past the portion of this inst's bytes that live at the
    // start of the buffer (one dword was consumed before the wrap)
    readPtr += (gpu_static_inst->instSize() - dword_size);
    assert(readPtr < bufEnd);

    GPUDynInstPtr gpu_dyn_inst
        = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                       wavefront, gpu_static_inst,
                                       wavefront->computeUnit->
                                           getAndIncSeqNum());
    wavefront->instructionBuffer.push_back(gpu_dyn_inst);

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
            "(%d bytes). %d bytes remain in %d buffered lines.\n",
            wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
            gpu_static_inst->disassemble(), split_inst,
            gpu_static_inst->instSize(), fetchBytesRemaining(),
            bufferedLines());
}
/**
 * True when reading one raw machine instruction at readPtr would run
 * past the end of the fetch buffer, meaning the instruction's bytes
 * wrap around and must be decoded via decodeSplitInst().
 */
bool
FetchUnit::FetchBufDesc::splitDecode() const
{
    return (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
}
/**
 * Number of buffered instruction bytes not yet decoded, accounting
 * for the read pointer possibly having wrapped around relative to the
 * end of the newest buffered line.
 */
int
FetchUnit::FetchBufDesc::fetchBytesRemaining() const
{
    int bytes_remaining = 0;

    if (bufferedLines() && readPtr != bufEnd) {
        // bufferedPCs is sorted by address, so rbegin() is the newest
        // (highest-address) buffered line; end_ptr is one past its data
        auto last_buf_pc = bufferedPCs.rbegin();
        uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
        int byte_diff = end_ptr - readPtr;

        if (end_ptr > readPtr) {
            // readPtr is behind the end of the newest line's slot
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            // readPtr sits past the newest line's slot in the circular
            // buffer; the (negative) difference is offset by the total
            // number of buffered bytes
            bytes_remaining = bufferedBytes() + byte_diff;
        }
    }

    assert(bytes_remaining <= bufferedBytes());
    return bytes_remaining;
}