diff --git a/src/arch/gcn3/gpu_mem_helpers.hh b/src/arch/gcn3/gpu_mem_helpers.hh new file mode 100644 index 0000000000..40ca56561b --- /dev/null +++ b/src/arch/gcn3/gpu_mem_helpers.hh @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Matt Sinclair + */ + +#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__ +#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__ + +#include "arch/gcn3/insts/gpu_static_inst.hh" +#include "arch/gcn3/insts/op_encodings.hh" +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_dyn_inst.hh" + +/** + * Helper function for instructions declared in op_encodings. This function + * takes in all of the arguments for a given memory request we are trying to + * initialize, then submits the request or requests depending on if the + * original request is aligned or unaligned. + */ +template<typename T, int N> +inline void +initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type, + bool is_atomic=false) +{ + // local variables + int req_size = N * sizeof(T); + int block_size = gpuDynInst->computeUnit()->cacheLineSize(); + Addr vaddr = 0, split_addr = 0; + bool misaligned_acc = false; + RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr; + PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr; + + gpuDynInst->resetEntireStatusVector(); + for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vaddr = gpuDynInst->addr[lane]; + + /** + * the base address of the cache line where the last + * byte of the request will be stored. + */ + split_addr = roundDown(vaddr + req_size - 1, block_size); + + assert(split_addr <= vaddr || split_addr - vaddr < block_size); + /** + * if the base cache line address of the last byte is + * greater than the address of the first byte then we have + * a misaligned access.
+ */ + misaligned_acc = split_addr > vaddr; + + if (is_atomic) { + req = std::make_shared<Request>(vaddr, sizeof(T), 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId, + gpuDynInst->makeAtomicOpFunctor( + &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane], + &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane])); + } else { + req = std::make_shared<Request>(vaddr, req_size, 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId); + } + + if (misaligned_acc) { + gpuDynInst->setStatusVector(lane, 2); + req->splitOnVaddr(split_addr, req1, req2); + gpuDynInst->setRequestFlags(req1); + gpuDynInst->setRequestFlags(req2); + pkt1 = new Packet(req1, mem_req_type); + pkt2 = new Packet(req2, mem_req_type); + pkt1->dataStatic(&(reinterpret_cast<T*>( + gpuDynInst->d_data))[lane * N]); + pkt2->dataStatic(&(reinterpret_cast<T*>( + gpuDynInst->d_data))[lane * N + req1->getSize()]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory " + "request for %#x\n", gpuDynInst->cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, lane, + split_addr); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2); + } else { + gpuDynInst->setStatusVector(lane, 1); + gpuDynInst->setRequestFlags(req); + pkt = new Packet(req, mem_req_type); + pkt->dataStatic(&(reinterpret_cast<T*>( + gpuDynInst->d_data))[lane * N]); + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt); + } + } else { // if lane is not active, then no pending requests + gpuDynInst->setStatusVector(lane, 0); + } + } +} + +/** + * Helper function for scalar instructions declared in op_encodings. This + * function takes in all of the arguments for a given memory request we are + * trying to initialize, then submits the request or requests depending on if + * the original request is aligned or unaligned.
+ */ +template<typename T, int N> +inline void +initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type) +{ + int req_size = N * sizeof(T); + int block_size = gpuDynInst->computeUnit()->cacheLineSize(); + Addr vaddr = gpuDynInst->scalarAddr; + + /** + * the base address of the cache line where the last byte of + * the request will be stored. + */ + Addr split_addr = roundDown(vaddr + req_size - 1, block_size); + + assert(split_addr <= vaddr || split_addr - vaddr < block_size); + /** + * if the base cache line address of the last byte is greater + * than the address of the first byte then we have a misaligned + * access. + */ + bool misaligned_acc = split_addr > vaddr; + + RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0, + gpuDynInst->computeUnit()->masterId(), 0, + gpuDynInst->wfDynId); + + if (misaligned_acc) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + gpuDynInst->numScalarReqs = 2; + gpuDynInst->setRequestFlags(req1); + gpuDynInst->setRequestFlags(req2); + PacketPtr pkt1 = new Packet(req1, mem_req_type); + PacketPtr pkt2 = new Packet(req2, mem_req_type); + pkt1->dataStatic(gpuDynInst->scalar_data); + pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for" + " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, split_addr); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); + } else { + gpuDynInst->numScalarReqs = 1; + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, mem_req_type); + pkt->dataStatic(gpuDynInst->scalar_data); + gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); + } +} + +#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__ diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 3197dc078f..308560a5f7 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++
b/src/arch/gcn3/insts/op_encodings.hh @@ -37,6 +37,7 @@ #define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__ #include "arch/gcn3/gpu_decoder.hh" +#include "arch/gcn3/gpu_mem_helpers.hh" #include "arch/gcn3/insts/gpu_static_inst.hh" #include "arch/gcn3/operand.hh" #include "debug/GPUExec.hh" @@ -174,47 +175,8 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - int req_size = N * sizeof(ScalarRegU32); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. - */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq); - PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(gpuDynInst->scalar_data); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } + initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst, + MemCmd::ReadReq); } /** @@ -224,47 +186,8 @@ void
initMemWrite(GPUDynInstPtr gpuDynInst) { - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - int req_size = N * sizeof(ScalarRegU32); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. - */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq); - PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(gpuDynInst->scalar_data); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } + initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst, + MemCmd::WriteReq); } void @@ -566,59 +489,22 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, -
gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq); } template<typename T> void initMemWrite(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq); } void injectGlobalMemFence(GPUDynInstPtr gpuDynInst) { // create request and set flags - gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->resetEntireStatusVector(); + gpuDynInst->setStatusVector(0, 1); RequestPtr req = std::make_shared<Request>(0, 0, 0, gpuDynInst->computeUnit()-> masterId(), 0, @@ -771,133 +657,35 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit() - ->sendRequest(gpuDynInst, lane, pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq); } template<int N> void
initMemRead(GPUDynInstPtr gpuDynInst) { - int req_size = N * sizeof(VecElemU32); - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, - 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->dataStatic(&(reinterpret_cast<VecElemU32*>( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit() - ->sendRequest(gpuDynInst, lane, pkt); - } - } + initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq); } template<typename T> void initMemWrite(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq); } template<int N> void initMemWrite(GPUDynInstPtr gpuDynInst) { - int req_size = N * sizeof(VecElemU32); - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, req_size, - 0, - gpuDynInst->computeUnit()->masterId(), - 0, gpuDynInst->wfDynId); - - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataStatic(&(reinterpret_cast<VecElemU32*>( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, -
pkt); - } - } + initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq); } template<typename T> void initAtomicAccess(GPUDynInstPtr gpuDynInst) { - gpuDynInst->statusBitVector = gpuDynInst->exec_mask; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - - RequestPtr req = std::make_shared<Request>(vaddr, - sizeof(T), 0, - gpuDynInst->computeUnit()->masterId(), 0, - gpuDynInst->wfDynId, - gpuDynInst->makeAtomicOpFunctor( - &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane], - &(reinterpret_cast<T*>( - gpuDynInst->x_data))[lane])); - - gpuDynInst->setRequestFlags(req); - - PacketPtr pkt = new Packet(req, MemCmd::SwapReq); - pkt->dataStatic(&(reinterpret_cast<T*>( - gpuDynInst->d_data))[lane]); - - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, - pkt); - } - } + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true); } void diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index feeb803e19..b0616d677b 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -832,7 +832,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, gpuDynInst->wfDynId, gpuDynInst->disassemble(), w->outstandingReqs, w->outstandingReqs - 1); - if (gpuDynInst->statusBitVector.none()) { + if (gpuDynInst->allLanesZero()) { // ask gm pipe to decrement request counters, instead of directly // performing here, to avoid asynchronous counter update and // instruction retirement (which may hurt waincnt effects) @@ -1078,7 +1078,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; - // translation is done.
Schedule the mem_req_event at the // appropriate cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = @@ -1116,9 +1115,9 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) } } else { if (pkt->cmd == MemCmd::MemSyncReq) { - gpuDynInst->statusBitVector = VectorMask(0); + gpuDynInst->resetEntireStatusVector(); } else { - gpuDynInst->statusBitVector &= (~(1ll << index)); + gpuDynInst->decrementStatusVector(index); } // New SenderState for the memory access @@ -1289,12 +1288,10 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) gpuDynInst->memStatusVector[paddr].pop_back(); gpuDynInst->pAddr = pkt->req->getPaddr(); - gpuDynInst->statusBitVector &= (~(1ULL << index)); + gpuDynInst->decrementStatusVector(index); + DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector()); - DPRINTF(GPUMem, "bitvector is now %#x\n", - gpuDynInst->statusBitVector); - - if (gpuDynInst->statusBitVector == VectorMask(0)) { + if (gpuDynInst->allLanesZero()) { auto iter = gpuDynInst->memStatusVector.begin(); auto end = gpuDynInst->memStatusVector.end(); diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 74b963b73c..2a49522da9 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -42,9 +42,10 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, InstSeqNum instSeqNum) : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(), - (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false), + (Addr)0), numScalarReqs(0), isSaveRestore(false), _staticInst(static_inst), _seqNum(instSeqNum) { + statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0); tlbHitLevel.assign(computeUnit()->wfSize(), -1); // vector instructions can have up to 4 source/destination operands d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]; diff --git a/src/gpu-compute/gpu_dyn_inst.hh 
b/src/gpu-compute/gpu_dyn_inst.hh index 392b57d12d..3d2fa0d3f3 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,6 +39,8 @@ #include "base/amo.hh" #include "base/logging.hh" +#include "base/trace.hh" +#include "debug/GPUMem.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -307,13 +309,103 @@ class GPUDynInst : public GPUExecContext } } + // reset the number of pending memory requests for all lanes + void + resetEntireStatusVector() + { + assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg); + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + resetStatusVector(lane); + } + } + + // reset the number of pending memory requests for the inputted lane + void + resetStatusVector(int lane) + { + setStatusVector(lane, 0); + } + + // set the number of pending memory requests for the inputted lane + void + setStatusVector(int lane, int newVal) + { + // currently we can have up to 2 memory requests per lane (if the + // lane's request goes across multiple cache lines) + assert((newVal >= 0) && (newVal <= 2)); + statusVector[lane] = newVal; + } + + // subtracts the number of pending memory requests for the inputted lane + // by 1 + void + decrementStatusVector(int lane) + { + // this lane may have multiple requests, so only subtract one for + // this request + assert(statusVector[lane] >= 1); + statusVector[lane]--; + } + + // return the current number of pending memory requests for the inputted + // lane + int + getLaneStatus(int lane) const + { + return statusVector[lane]; + } + + // returns true if all memory requests from all lanes have been received, + // else returns false + bool + allLanesZero() const + { + // local variables + bool allZero = true; + + // iterate over all lanes, checking the number of pending memory + // requests they have + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + // if any lane 
still has pending requests, return false + if (statusVector[lane] > 0) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending " + "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane, + statusVector[lane], addr[lane]); + allZero = false; + } + } + + if (allZero) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending" + " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]); + } + return allZero; + } + + // returns a string representing the current state of the statusVector + std::string + printStatusVector() const + { + std::string statusVec_str = "["; + + // iterate over all lanes, adding the current number of pending + // requests for this lane to the string + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + statusVec_str += std::to_string(statusVector[lane]); + } + statusVec_str += "]"; + + return statusVec_str; + } + // Map returned packets and the addresses they satisfy with which lane they // were requested from typedef std::unordered_map<Addr, std::vector<int>> StatusVector; StatusVector memStatusVector; - // Track the status of memory requests per lane, a bit per lane - VectorMask statusBitVector; + // Track the status of memory requests per lane, an int per lane to allow + // unaligned accesses + std::vector<int> statusVector; // for ld_v# or st_v# std::vector<int> tlbHitLevel; diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc index a4d7f4916b..359f6bb0f7 100644 --- a/src/mem/ruby/common/DataBlock.cc +++ b/src/mem/ruby/common/DataBlock.cc @@ -107,7 +107,6 @@ DataBlock::getDataMod(int offset) void DataBlock::setData(const uint8_t *data, int offset, int len) { - assert(offset + len <= RubySystem::getBlockSizeBytes()); memcpy(&m_data[offset], data, len); } diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 83aaa1a507..92fed81dd2 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -267,9 +267,6 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
curTick() + rs->clockPeriod()); return true; } - - assert(getOffset(pkt->getAddr()) + pkt->getSize() <= - RubySystem::getBlockSizeBytes()); } // Save the port in the sender state object to be used later to