diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index 0904800ed3..5a11af4ca4 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -384,11 +384,11 @@ class BaseO3DynInst : public ExecContext, public RefCounted /** Load queue index. */ ssize_t lqIdx = -1; - typename ::LSQUnit::LQIterator lqIt; + typename LSQUnit::LQIterator lqIt; /** Store queue index. */ ssize_t sqIdx = -1; - typename ::LSQUnit::SQIterator sqIt; + typename LSQUnit::SQIterator sqIt; /////////////////////// TLB Miss ////////////////////// diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 021bab5380..7bc15f6ccf 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -68,7 +68,6 @@ class FullO3CPU; template class DefaultIEW; -template class LSQUnit; template @@ -291,7 +290,7 @@ class LSQ bool isDelayed() { return flags.isSet(Flag::Delayed); } public: - LSQUnit& _port; + LSQUnit& _port; const O3DynInstPtr _inst; uint32_t _taskId; PacketDataPtr _data; @@ -306,9 +305,9 @@ class LSQ uint32_t _numOutstandingPackets; AtomicOpFunctorPtr _amo_op; protected: - LSQUnit* lsqUnit() { return &_port; } - LSQRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad); - LSQRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, + LSQUnit* lsqUnit() { return &_port; } + LSQRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad); + LSQRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, PacketDataPtr data=nullptr, uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr); @@ -668,7 +667,7 @@ class LSQ using LSQRequest::_numOutstandingPackets; using LSQRequest::_amo_op; public: - SingleDataRequest(LSQUnit* port, const O3DynInstPtr& inst, + SingleDataRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, PacketDataPtr data=nullptr, uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr) : @@ -706,7 +705,7 @@ class 
LSQ using LSQRequest::flags; using LSQRequest::setState; public: - HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst, + HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst, const Request::Flags& flags_); inline virtual ~HtmCmdRequest() {} virtual void initiateTranslation(); @@ -753,7 +752,7 @@ class LSQ PacketPtr _mainPacket; public: - SplitDataRequest(LSQUnit* port, const O3DynInstPtr& inst, + SplitDataRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags & flags_, PacketDataPtr data=nullptr, uint64_t* res=nullptr) : @@ -829,14 +828,12 @@ class LSQ /** * Commits loads up until the given sequence number for a specific thread. */ - void commitLoads(InstSeqNum &youngest_inst, ThreadID tid) - { thread.at(tid).commitLoads(youngest_inst); } + void commitLoads(InstSeqNum &youngest_inst, ThreadID tid); /** * Commits stores up until the given sequence number for a specific thread. */ - void commitStores(InstSeqNum &youngest_inst, ThreadID tid) - { thread.at(tid).commitStores(youngest_inst); } + void commitStores(InstSeqNum &youngest_inst, ThreadID tid); /** * Attempts to write back stores until all cache ports are used or the @@ -849,99 +846,55 @@ class LSQ /** * Squash instructions from a thread until the specified sequence number. */ - void - squash(const InstSeqNum &squashed_num, ThreadID tid) - { - thread.at(tid).squash(squashed_num); - } + void squash(const InstSeqNum &squashed_num, ThreadID tid); /** Returns whether or not there was a memory ordering violation. */ bool violation(); + /** * Returns whether or not there was a memory ordering violation for a * specific thread. */ - bool violation(ThreadID tid) { return thread.at(tid).violation(); } + bool violation(ThreadID tid); /** Gets the instruction that caused the memory ordering violation. 
*/ - O3DynInstPtr - getMemDepViolator(ThreadID tid) - { - return thread.at(tid).getMemDepViolator(); - } + O3DynInstPtr getMemDepViolator(ThreadID tid); /** Returns the head index of the load queue for a specific thread. */ - int getLoadHead(ThreadID tid) { return thread.at(tid).getLoadHead(); } + int getLoadHead(ThreadID tid); /** Returns the sequence number of the head of the load queue. */ - InstSeqNum - getLoadHeadSeqNum(ThreadID tid) - { - return thread.at(tid).getLoadHeadSeqNum(); - } + InstSeqNum getLoadHeadSeqNum(ThreadID tid); /** Returns the head index of the store queue. */ - int getStoreHead(ThreadID tid) { return thread.at(tid).getStoreHead(); } + int getStoreHead(ThreadID tid); /** Returns the sequence number of the head of the store queue. */ - InstSeqNum - getStoreHeadSeqNum(ThreadID tid) - { - return thread.at(tid).getStoreHeadSeqNum(); - } + InstSeqNum getStoreHeadSeqNum(ThreadID tid); /** Returns the number of instructions in all of the queues. */ int getCount(); /** Returns the number of instructions in the queues of one thread. */ - int getCount(ThreadID tid) { return thread.at(tid).getCount(); } + int getCount(ThreadID tid); /** Returns the total number of loads in the load queue. */ int numLoads(); /** Returns the total number of loads for a single thread. */ - int numLoads(ThreadID tid) { return thread.at(tid).numLoads(); } + int numLoads(ThreadID tid); /** Returns the total number of stores in the store queue. */ int numStores(); /** Returns the total number of stores for a single thread. 
*/ - int numStores(ThreadID tid) { return thread.at(tid).numStores(); } + int numStores(ThreadID tid); // hardware transactional memory - int numHtmStarts(ThreadID tid) const - { - if (tid == InvalidThreadID) - return 0; - else - return thread[tid].numHtmStarts(); - } - int numHtmStops(ThreadID tid) const - { - if (tid == InvalidThreadID) - return 0; - else - return thread[tid].numHtmStops(); - } - - void resetHtmStartsStops(ThreadID tid) - { - if (tid != InvalidThreadID) - thread[tid].resetHtmStartsStops(); - } - - uint64_t getLatestHtmUid(ThreadID tid) const - { - if (tid == InvalidThreadID) - return 0; - else - return thread[tid].getLatestHtmUid(); - } - - void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid) - { - if (tid != InvalidThreadID) - thread[tid].setLastRetiredHtmUid(htmUid); - } + int numHtmStarts(ThreadID tid) const; + int numHtmStops(ThreadID tid) const; + void resetHtmStartsStops(ThreadID tid); + uint64_t getLatestHtmUid(ThreadID tid) const; + void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid); /** Returns the number of free load entries. */ unsigned numFreeLoadEntries(); @@ -1000,22 +953,22 @@ class LSQ /** Returns whether or not a specific thread has any stores to write back * to memory. */ - bool hasStoresToWB(ThreadID tid) { return thread.at(tid).hasStoresToWB(); } + bool hasStoresToWB(ThreadID tid); /** Returns the number of stores a specific thread has to write back. */ - int numStoresToWB(ThreadID tid) { return thread.at(tid).numStoresToWB(); } + int numStoresToWB(ThreadID tid); /** Returns if the LSQ will write back to memory this cycle. */ bool willWB(); /** Returns if the LSQ of a specific thread will write back to memory this * cycle. */ - bool willWB(ThreadID tid) { return thread.at(tid).willWB(); } + bool willWB(ThreadID tid); /** Debugging function to print out all instructions. */ void dumpInsts() const; /** Debugging function to print out instructions from a specific thread. 
*/ - void dumpInsts(ThreadID tid) const { thread.at(tid).dumpInsts(); } + void dumpInsts(ThreadID tid) const; /** Executes a read operation, using the load specified at the load * index. @@ -1122,28 +1075,10 @@ class LSQ DcachePort dcachePort; /** The LSQ units for individual threads. */ - std::vector> thread; + std::vector thread; /** Number of Threads. */ ThreadID numThreads; }; -template -Fault -LSQ::read(LSQRequest* req, int load_idx) -{ - ThreadID tid = cpu->contextToThread(req->request()->contextId()); - - return thread.at(tid).read(req, load_idx); -} - -template -Fault -LSQ::write(LSQRequest* req, uint8_t *data, int store_idx) -{ - ThreadID tid = cpu->contextToThread(req->request()->contextId()); - - return thread.at(tid).write(req, data, store_idx); -} - #endif // __CPU_O3_LSQ_HH__ diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index bb53bb62fc..8ef440ecfc 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -262,6 +262,20 @@ LSQ::executeStore(const O3DynInstPtr &inst) return thread[tid].executeStore(inst); } +template +void +LSQ::commitLoads(InstSeqNum &youngest_inst, ThreadID tid) +{ + thread.at(tid).commitLoads(youngest_inst); +} + +template +void +LSQ::commitStores(InstSeqNum &youngest_inst, ThreadID tid) +{ + thread.at(tid).commitStores(youngest_inst); +} + template void LSQ::writebackStores() @@ -281,6 +295,13 @@ LSQ::writebackStores() } } +template +void +LSQ::squash(const InstSeqNum &squashed_num, ThreadID tid) +{ + thread.at(tid).squash(squashed_num); +} + template bool LSQ::violation() @@ -299,6 +320,98 @@ LSQ::violation() return false; } +template +bool LSQ::violation(ThreadID tid) { return thread.at(tid).violation(); } + +template +O3DynInstPtr +LSQ::getMemDepViolator(ThreadID tid) +{ + return thread.at(tid).getMemDepViolator(); +} + +template +int +LSQ::getLoadHead(ThreadID tid) +{ + return thread.at(tid).getLoadHead(); +} + +template +InstSeqNum +LSQ::getLoadHeadSeqNum(ThreadID tid) +{ + return 
thread.at(tid).getLoadHeadSeqNum(); +} + +template +int +LSQ::getStoreHead(ThreadID tid) +{ + return thread.at(tid).getStoreHead(); +} + +template +InstSeqNum +LSQ::getStoreHeadSeqNum(ThreadID tid) +{ + return thread.at(tid).getStoreHeadSeqNum(); +} + +template +int LSQ::getCount(ThreadID tid) { return thread.at(tid).getCount(); } + +template +int LSQ::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); } + +template +int LSQ::numStores(ThreadID tid) { return thread.at(tid).numStores(); } + +template +int +LSQ::numHtmStarts(ThreadID tid) const +{ + if (tid == InvalidThreadID) + return 0; + else + return thread[tid].numHtmStarts(); +} +template +int +LSQ::numHtmStops(ThreadID tid) const +{ + if (tid == InvalidThreadID) + return 0; + else + return thread[tid].numHtmStops(); +} + +template +void +LSQ::resetHtmStartsStops(ThreadID tid) +{ + if (tid != InvalidThreadID) + thread[tid].resetHtmStartsStops(); +} + +template +uint64_t +LSQ::getLatestHtmUid(ThreadID tid) const +{ + if (tid == InvalidThreadID) + return 0; + else + return thread[tid].getLatestHtmUid(); +} + +template +void +LSQ::setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid) +{ + if (tid != InvalidThreadID) + thread[tid].setLastRetiredHtmUid(htmUid); +} + template void LSQ::recvReqRetry() @@ -651,6 +764,20 @@ LSQ::hasStoresToWB() return false; } +template +bool +LSQ::hasStoresToWB(ThreadID tid) +{ + return thread.at(tid).hasStoresToWB(); +} + +template +int +LSQ::numStoresToWB(ThreadID tid) +{ + return thread.at(tid).numStoresToWB(); +} + template bool LSQ::willWB() @@ -668,6 +795,13 @@ LSQ::willWB() return false; } +template +bool +LSQ::willWB(ThreadID tid) +{ + return thread.at(tid).willWB(); +} + template void LSQ::dumpInsts() const @@ -682,6 +816,13 @@ LSQ::dumpInsts() const } } +template +void +LSQ::dumpInsts(ThreadID tid) const +{ + thread.at(tid).dumpInsts(); +} + template Fault LSQ::pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data, @@ -961,7 +1102,7 @@ 
LSQ::SplitDataRequest::initiateTranslation() template LSQ::LSQRequest::LSQRequest( - LSQUnit *port, const O3DynInstPtr& inst, bool isLoad) : + LSQUnit *port, const O3DynInstPtr& inst, bool isLoad) : _state(State::NotIssued), _senderState(nullptr), _port(*port), _inst(inst), _data(nullptr), _res(nullptr), _addr(0), _size(0), _flags(0), @@ -976,7 +1117,7 @@ LSQ::LSQRequest::LSQRequest( template LSQ::LSQRequest::LSQRequest( - LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, + LSQUnit *port, const O3DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, PacketDataPtr data, uint64_t* res, AtomicOpFunctorPtr amo_op) : _state(State::NotIssued), _senderState(nullptr), @@ -1312,7 +1453,7 @@ LSQ::DcachePort::recvReqRetry() } template -LSQ::HtmCmdRequest::HtmCmdRequest(LSQUnit* port, +LSQ::HtmCmdRequest::HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst, const Request::Flags& flags_) : SingleDataRequest(port, inst, true, 0x0lu, 8, flags_, @@ -1366,4 +1507,22 @@ LSQ::HtmCmdRequest::finish(const Fault &fault, const RequestPtr &req, panic("unexpected behaviour"); } +template +Fault +LSQ::read(LSQRequest* req, int load_idx) +{ + ThreadID tid = cpu->contextToThread(req->request()->contextId()); + + return thread.at(tid).read(req, load_idx); +} + +template +Fault +LSQ::write(LSQRequest* req, uint8_t *data, int store_idx) +{ + ThreadID tid = cpu->contextToThread(req->request()->contextId()); + + return thread.at(tid).write(req, data, store_idx); +} + #endif//__CPU_O3_LSQ_IMPL_HH__ diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 6b3a6529f2..9e8abd03ee 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -1,4 +1,17 @@ /* + * Copyright (c) 2010-2014, 2017-2020 ARM Limited + * Copyright (c) 2013 Advanced Micro Devices, Inc. 
+ * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * @@ -26,9 +39,1596 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cpu/o3/isa_specific.hh" -#include "cpu/o3/lsq_unit_impl.hh" +#include "cpu/o3/lsq_unit.hh" -// Force the instantiation of LDSTQ for all the implementations we care about. -template class LSQUnit; +#include "arch/generic/debugfaults.hh" +#include "arch/locked_mem.hh" +#include "base/str.hh" +#include "config/the_isa.hh" +#include "cpu/checker/cpu.hh" +#include "cpu/o3/limits.hh" +#include "cpu/o3/lsq.hh" +#include "debug/Activity.hh" +#include "debug/HtmCpu.hh" +#include "debug/IEW.hh" +#include "debug/LSQUnit.hh" +#include "debug/O3PipeView.hh" +#include "mem/packet.hh" +#include "mem/request.hh" +LSQUnit::WritebackEvent::WritebackEvent(const O3DynInstPtr &_inst, + PacketPtr _pkt, LSQUnit *lsq_ptr) + : Event(Default_Pri, AutoDelete), + inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr) +{ + assert(_inst->savedReq); + _inst->savedReq->writebackScheduled(); +} + +void +LSQUnit::WritebackEvent::process() +{ + assert(!lsqPtr->cpu->switchedOut()); + + lsqPtr->writeback(inst, pkt); + + assert(inst->savedReq); + inst->savedReq->writebackDone(); + delete pkt; +} + +const char * +LSQUnit::WritebackEvent::description() const +{ + return "Store writeback"; +} + +bool +LSQUnit::recvTimingResp(PacketPtr pkt) 
+{ + auto senderState = dynamic_cast(pkt->senderState); + LSQRequest* req = senderState->request(); + assert(req != nullptr); + bool ret = true; + /* Check that the request is still alive before any further action. */ + if (senderState->alive()) { + ret = req->recvTimingResp(pkt); + } else { + senderState->outstanding--; + } + return ret; + +} + +void +LSQUnit::completeDataAccess(PacketPtr pkt) +{ + LSQSenderState *state = dynamic_cast(pkt->senderState); + O3DynInstPtr inst = state->inst; + + // hardware transactional memory + // sanity check + if (pkt->isHtmTransactional() && !inst->isSquashed()) { + assert(inst->getHtmTransactionUid() == pkt->getHtmTransactionUid()); + } + + // if in a HTM transaction, it's possible + // to abort within the cache hierarchy. + // This is signalled back to the processor + // through responses to memory requests. + if (pkt->htmTransactionFailedInCache()) { + // cannot do this for write requests because + // they cannot tolerate faults + const HtmCacheFailure htm_rc = + pkt->getHtmTransactionFailedInCacheRC(); + if (pkt->isWrite()) { + DPRINTF(HtmCpu, + "store notification (ignored) of HTM transaction failure " + "in cache - addr=0x%lx - rc=%s - htmUid=%d\n", + pkt->getAddr(), htmFailureToStr(htm_rc), + pkt->getHtmTransactionUid()); + } else { + HtmFailureFaultCause fail_reason = + HtmFailureFaultCause::INVALID; + + if (htm_rc == HtmCacheFailure::FAIL_SELF) { + fail_reason = HtmFailureFaultCause::SIZE; + } else if (htm_rc == HtmCacheFailure::FAIL_REMOTE) { + fail_reason = HtmFailureFaultCause::MEMORY; + } else if (htm_rc == HtmCacheFailure::FAIL_OTHER) { + // these are likely loads that were issued out of order + // they are faulted here, but it's unlikely that these will + // ever reach the commit head. 
+ fail_reason = HtmFailureFaultCause::OTHER; + } else { + panic("HTM error - unhandled return code from cache (%s)", + htmFailureToStr(htm_rc)); + } + + inst->fault = + std::make_shared( + inst->getHtmTransactionUid(), + fail_reason); + + DPRINTF(HtmCpu, + "load notification of HTM transaction failure " + "in cache - pc=%s - addr=0x%lx - " + "rc=%u - htmUid=%d\n", + inst->pcState(), pkt->getAddr(), + htmFailureToStr(htm_rc), pkt->getHtmTransactionUid()); + } + } + + cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt)); + + /* Notify the sender state that the access is complete (for ownership + * tracking). */ + state->complete(); + + assert(!cpu->switchedOut()); + if (!inst->isSquashed()) { + if (state->needWB) { + // Only loads, store conditionals and atomics perform the writeback + // after receiving the response from the memory + assert(inst->isLoad() || inst->isStoreConditional() || + inst->isAtomic()); + + // hardware transactional memory + if (pkt->htmTransactionFailedInCache()) { + state->request()->mainPacket()->setHtmTransactionFailedInCache( + pkt->getHtmTransactionFailedInCacheRC() ); + } + + writeback(inst, state->request()->mainPacket()); + if (inst->isStore() || inst->isAtomic()) { + auto ss = dynamic_cast(state); + ss->writebackDone(); + completeStore(ss->idx); + } + } else if (inst->isStore()) { + // This is a regular store (i.e., not store conditionals and + // atomics), so it can complete without writing back + completeStore(dynamic_cast(state)->idx); + } + } +} + +LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries) + : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1), + loads(0), stores(0), storesToWB(0), + htmStarts(0), htmStops(0), + lastRetiredHtmUid(0), + cacheBlockMask(0), stalled(false), + isStoreBlocked(false), storeInFlight(false), stats(nullptr) +{ +} + +void +LSQUnit::init(FullO3CPU *cpu_ptr, DefaultIEW *iew_ptr, + const DerivO3CPUParams &params, LSQ *lsq_ptr, unsigned id) +{ + lsqID = id; + + cpu = cpu_ptr; + iewStage 
= iew_ptr; + + lsq = lsq_ptr; + + cpu->addStatGroup(csprintf("lsq%i", lsqID).c_str(), &stats); + + DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",lsqID); + + depCheckShift = params.LSQDepCheckShift; + checkLoads = params.LSQCheckLoads; + needsTSO = params.needsTSO; + + resetState(); +} + + +void +LSQUnit::resetState() +{ + loads = stores = storesToWB = 0; + + // hardware transactional memory + // nesting depth + htmStarts = htmStops = 0; + + storeWBIt = storeQueue.begin(); + + retryPkt = NULL; + memDepViolator = NULL; + + stalled = false; + + cacheBlockMask = ~(cpu->cacheLineSize() - 1); +} + +std::string +LSQUnit::name() const +{ + if (O3MaxThreads == 1) { + return iewStage->name() + ".lsq"; + } else { + return iewStage->name() + ".lsq.thread" + std::to_string(lsqID); + } +} + +LSQUnit::LSQUnitStats::LSQUnitStats(Stats::Group *parent) + : Stats::Group(parent), + ADD_STAT(forwLoads, Stats::Units::Count::get(), + "Number of loads that had data forwarded from stores"), + ADD_STAT(squashedLoads, Stats::Units::Count::get(), + "Number of loads squashed"), + ADD_STAT(ignoredResponses, Stats::Units::Count::get(), + "Number of memory responses ignored because the instruction is " + "squashed"), + ADD_STAT(memOrderViolation, Stats::Units::Count::get(), + "Number of memory ordering violations"), + ADD_STAT(squashedStores, Stats::Units::Count::get(), + "Number of stores squashed"), + ADD_STAT(rescheduledLoads, Stats::Units::Count::get(), + "Number of loads that were rescheduled"), + ADD_STAT(blockedByCache, Stats::Units::Count::get(), + "Number of times an access to memory failed due to the cache " + "being blocked") +{ +} + +void +LSQUnit::setDcachePort(RequestPort *dcache_port) +{ + dcachePort = dcache_port; +} + +void +LSQUnit::drainSanityCheck() const +{ + for (int i = 0; i < loadQueue.capacity(); ++i) + assert(!loadQueue[i].valid()); + + assert(storesToWB == 0); + assert(!retryPkt); +} + +void +LSQUnit::takeOverFrom() +{ + resetState(); +} + +void +LSQUnit::insert(const 
O3DynInstPtr &inst) +{ + assert(inst->isMemRef()); + + assert(inst->isLoad() || inst->isStore() || inst->isAtomic()); + + if (inst->isLoad()) { + insertLoad(inst); + } else { + insertStore(inst); + } + + inst->setInLSQ(); +} + +void +LSQUnit::insertLoad(const O3DynInstPtr &load_inst) +{ + assert(!loadQueue.full()); + assert(loads < loadQueue.capacity()); + + DPRINTF(LSQUnit, "Inserting load PC %s, idx:%i [sn:%lli]\n", + load_inst->pcState(), loadQueue.tail(), load_inst->seqNum); + + /* Grow the queue. */ + loadQueue.advance_tail(); + + load_inst->sqIt = storeQueue.end(); + + assert(!loadQueue.back().valid()); + loadQueue.back().set(load_inst); + load_inst->lqIdx = loadQueue.tail(); + assert(load_inst->lqIdx > 0); + load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx); + + ++loads; + + // hardware transactional memory + // transactional state and nesting depth must be tracked + // in the in-order part of the core. + if (load_inst->isHtmStart()) { + htmStarts++; + DPRINTF(HtmCpu, ">> htmStarts++ (%d) : htmStops (%d)\n", + htmStarts, htmStops); + + const int htm_depth = htmStarts - htmStops; + const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); + auto htm_uid = htm_cpt->getHtmUid(); + + // for debugging purposes + if (!load_inst->inHtmTransactionalState()) { + htm_uid = htm_cpt->newHtmUid(); + DPRINTF(HtmCpu, "generating new htmUid=%u\n", htm_uid); + if (htm_depth != 1) { + DPRINTF(HtmCpu, + "unusual HTM transactional depth (%d)" + " possibly caused by mispeculation - htmUid=%u\n", + htm_depth, htm_uid); + } + } + load_inst->setHtmTransactionalState(htm_uid, htm_depth); + } + + if (load_inst->isHtmStop()) { + htmStops++; + DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops++ (%d)\n", + htmStarts, htmStops); + + if (htmStops==1 && htmStarts==0) { + DPRINTF(HtmCpu, + "htmStops==1 && htmStarts==0. 
" + "This generally shouldn't happen " + "(unless due to misspeculation)\n"); + } + } +} + +void +LSQUnit::insertStore(const O3DynInstPtr& store_inst) +{ + // Make sure it is not full before inserting an instruction. + assert(!storeQueue.full()); + assert(stores < storeQueue.capacity()); + + DPRINTF(LSQUnit, "Inserting store PC %s, idx:%i [sn:%lli]\n", + store_inst->pcState(), storeQueue.tail(), store_inst->seqNum); + storeQueue.advance_tail(); + + store_inst->sqIdx = storeQueue.tail(); + store_inst->lqIdx = loadQueue.tail() + 1; + assert(store_inst->lqIdx > 0); + store_inst->lqIt = loadQueue.end(); + + storeQueue.back().set(store_inst); + + ++stores; +} + +O3DynInstPtr +LSQUnit::getMemDepViolator() +{ + O3DynInstPtr temp = memDepViolator; + + memDepViolator = NULL; + + return temp; +} + +unsigned +LSQUnit::numFreeLoadEntries() +{ + //LQ has an extra dummy entry to differentiate + //empty/full conditions. Subtract 1 from the free entries. + DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n", + 1 + loadQueue.capacity(), loads); + return loadQueue.capacity() - loads; +} + +unsigned +LSQUnit::numFreeStoreEntries() +{ + //SQ has an extra dummy entry to differentiate + //empty/full conditions. Subtract 1 from the free entries. 
+ DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n", + 1 + storeQueue.capacity(), stores); + return storeQueue.capacity() - stores; + + } + +void +LSQUnit::checkSnoop(PacketPtr pkt) +{ + // Should only ever get invalidations in here + assert(pkt->isInvalidate()); + + DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr()); + + for (int x = 0; x < cpu->numContexts(); x++) { + ThreadContext *tc = cpu->getContext(x); + bool no_squash = cpu->thread[x]->noSquashFromTC; + cpu->thread[x]->noSquashFromTC = true; + TheISA::handleLockedSnoop(tc, pkt, cacheBlockMask); + cpu->thread[x]->noSquashFromTC = no_squash; + } + + if (loadQueue.empty()) + return; + + auto iter = loadQueue.begin(); + + Addr invalidate_addr = pkt->getAddr() & cacheBlockMask; + + O3DynInstPtr ld_inst = iter->instruction(); + assert(ld_inst); + LSQRequest *req = iter->request(); + + // Check that this snoop didn't just invalidate our lock flag + if (ld_inst->effAddrValid() && + req->isCacheBlockHit(invalidate_addr, cacheBlockMask) + && ld_inst->memReqFlags & Request::LLSC) + TheISA::handleLockedSnoopHit(ld_inst.get()); + + bool force_squash = false; + + while (++iter != loadQueue.end()) { + ld_inst = iter->instruction(); + assert(ld_inst); + req = iter->request(); + if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) + continue; + + DPRINTF(LSQUnit, "-- inst [sn:%lli] to pktAddr:%#x\n", + ld_inst->seqNum, invalidate_addr); + + if (force_squash || + req->isCacheBlockHit(invalidate_addr, cacheBlockMask)) { + if (needsTSO) { + // If we have a TSO system, as all loads must be ordered with + // all other loads, this load as well as *all* subsequent loads + // need to be squashed to prevent possible load reordering. 
+ force_squash = true; + } + if (ld_inst->possibleLoadViolation() || force_squash) { + DPRINTF(LSQUnit, "Conflicting load at addr %#x [sn:%lli]\n", + pkt->getAddr(), ld_inst->seqNum); + + // Mark the load for re-execution + ld_inst->fault = std::make_shared(); + req->setStateToFault(); + } else { + DPRINTF(LSQUnit, "HitExternal Snoop for addr %#x [sn:%lli]\n", + pkt->getAddr(), ld_inst->seqNum); + + // Make sure that we don't lose a snoop hitting a LOCKED + // address since the LOCK* flags don't get updated until + // commit. + if (ld_inst->memReqFlags & Request::LLSC) + TheISA::handleLockedSnoopHit(ld_inst.get()); + + // If a older load checks this and it's true + // then we might have missed the snoop + // in which case we need to invalidate to be sure + ld_inst->hitExternalSnoop(true); + } + } + } + return; +} + +Fault +LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, + const O3DynInstPtr& inst) +{ + Addr inst_eff_addr1 = inst->effAddr >> depCheckShift; + Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift; + + /** @todo in theory you only need to check an instruction that has executed + * however, there isn't a good way in the pipeline at the moment to check + * all instructions that will execute before the store writes back. Thus, + * like the implementation that came before it, we're overly conservative. 
+ */ + while (loadIt != loadQueue.end()) { + O3DynInstPtr ld_inst = loadIt->instruction(); + if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { + ++loadIt; + continue; + } + + Addr ld_eff_addr1 = ld_inst->effAddr >> depCheckShift; + Addr ld_eff_addr2 = + (ld_inst->effAddr + ld_inst->effSize - 1) >> depCheckShift; + + if (inst_eff_addr2 >= ld_eff_addr1 && inst_eff_addr1 <= ld_eff_addr2) { + if (inst->isLoad()) { + // If this load is to the same block as an external snoop + // invalidate that we've observed then the load needs to be + // squashed as it could have newer data + if (ld_inst->hitExternalSnoop()) { + if (!memDepViolator || + ld_inst->seqNum < memDepViolator->seqNum) { + DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] " + "and [sn:%lli] at address %#x\n", + inst->seqNum, ld_inst->seqNum, ld_eff_addr1); + memDepViolator = ld_inst; + + ++stats.memOrderViolation; + + return std::make_shared( + "Detected fault with inst [sn:%lli] and " + "[sn:%lli] at address %#x\n", + inst->seqNum, ld_inst->seqNum, ld_eff_addr1); + } + } + + // Otherwise, mark the load as a possible load violation and + // if we see a snoop before it's committed, we need to squash + ld_inst->possibleLoadViolation(true); + DPRINTF(LSQUnit, "Found possible load violation at addr: %#x" + " between instructions [sn:%lli] and [sn:%lli]\n", + inst_eff_addr1, inst->seqNum, ld_inst->seqNum); + } else { + // A load/store incorrectly passed this store. + // Check if we already have a violator, or if it's newer + // squash and refetch. 
+ if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) + break; + + DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] and " + "[sn:%lli] at address %#x\n", + inst->seqNum, ld_inst->seqNum, ld_eff_addr1); + memDepViolator = ld_inst; + + ++stats.memOrderViolation; + + return std::make_shared( + "Detected fault with " + "inst [sn:%lli] and [sn:%lli] at address %#x\n", + inst->seqNum, ld_inst->seqNum, ld_eff_addr1); + } + } + + ++loadIt; + } + return NoFault; +} + + + + +Fault +LSQUnit::executeLoad(const O3DynInstPtr &inst) +{ + // Execute a specific load. + Fault load_fault = NoFault; + + DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n", + inst->pcState(), inst->seqNum); + + assert(!inst->isSquashed()); + + load_fault = inst->initiateAcc(); + + if (load_fault == NoFault && !inst->readMemAccPredicate()) { + assert(inst->readPredicate()); + inst->setExecuted(); + inst->completeAcc(nullptr); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + return NoFault; + } + + if (inst->isTranslationDelayed() && load_fault == NoFault) + return load_fault; + + if (load_fault != NoFault && inst->translationCompleted() && + inst->savedReq->isPartialFault() && !inst->savedReq->isComplete()) { + assert(inst->savedReq->isSplit()); + // If we have a partial fault where the mem access is not complete yet + // then the cache must have been blocked. This load will be re-executed + // when the cache gets unblocked. We will handle the fault when the + // mem access is complete. + return NoFault; + } + + // If the instruction faulted or predicated false, then we need to send it + // along to commit without the instruction completing. + if (load_fault != NoFault || !inst->readPredicate()) { + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. Mark it as executed unless it + // is a strictly ordered load that needs to hit the head of + // commit. 
+ if (!inst->readPredicate()) + inst->forwardOldRegs(); + DPRINTF(LSQUnit, "Load [sn:%lli] not executed from %s\n", + inst->seqNum, + (load_fault != NoFault ? "fault" : "predication")); + if (!(inst->hasRequest() && inst->strictlyOrdered()) || + inst->isAtCommit()) { + inst->setExecuted(); + } + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } else { + if (inst->effAddrValid()) { + auto it = inst->lqIt; + ++it; + + if (checkLoads) + return checkViolations(it, inst); + } + } + + return load_fault; +} + +Fault +LSQUnit::executeStore(const O3DynInstPtr &store_inst) +{ + // Make sure that a store exists. + assert(stores != 0); + + int store_idx = store_inst->sqIdx; + + DPRINTF(LSQUnit, "Executing store PC %s [sn:%lli]\n", + store_inst->pcState(), store_inst->seqNum); + + assert(!store_inst->isSquashed()); + + // Check the recently completed loads to see if any match this store's + // address. If so, then we have a memory ordering violation. + typename LoadQueue::iterator loadIt = store_inst->lqIt; + + Fault store_fault = store_inst->initiateAcc(); + + if (store_inst->isTranslationDelayed() && + store_fault == NoFault) + return store_fault; + + if (!store_inst->readPredicate()) { + DPRINTF(LSQUnit, "Store [sn:%lli] not executed from predication\n", + store_inst->seqNum); + store_inst->forwardOldRegs(); + return store_fault; + } + + if (storeQueue[store_idx].size() == 0) { + DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n", + store_inst->pcState(), store_inst->seqNum); + + return store_fault; + } + + assert(store_fault == NoFault); + + if (store_inst->isStoreConditional() || store_inst->isAtomic()) { + // Store conditionals and Atomics need to set themselves as able to + // writeback if we haven't had a fault by here. 
+ storeQueue[store_idx].canWB() = true; + + ++storesToWB; + } + + return checkViolations(loadIt, store_inst); + +} + +void +LSQUnit::commitLoad() +{ + assert(loadQueue.front().valid()); + + DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n", + loadQueue.front().instruction()->pcState()); + + loadQueue.front().clear(); + loadQueue.pop_front(); + + --loads; +} + +void +LSQUnit::commitLoads(InstSeqNum &youngest_inst) +{ + assert(loads == 0 || loadQueue.front().valid()); + + while (loads != 0 && loadQueue.front().instruction()->seqNum + <= youngest_inst) { + commitLoad(); + } +} + +void +LSQUnit::commitStores(InstSeqNum &youngest_inst) +{ + assert(stores == 0 || storeQueue.front().valid()); + + /* Forward iterate the store queue (age order). */ + for (auto& x : storeQueue) { + assert(x.valid()); + // Mark any stores that are now committed and have not yet + // been marked as able to write back. + if (!x.canWB()) { + if (x.instruction()->seqNum > youngest_inst) { + break; + } + DPRINTF(LSQUnit, "Marking store as able to write back, PC " + "%s [sn:%lli]\n", + x.instruction()->pcState(), + x.instruction()->seqNum); + + x.canWB() = true; + + ++storesToWB; + } + } +} + +void +LSQUnit::writebackBlockedStore() +{ + assert(isStoreBlocked); + storeWBIt->request()->sendPacketToCache(); + if (storeWBIt->request()->isSent()){ + storePostSend(); + } +} + +void +LSQUnit::writebackStores() +{ + if (isStoreBlocked) { + DPRINTF(LSQUnit, "Writing back blocked store\n"); + writebackBlockedStore(); + } + + while (storesToWB > 0 && + storeWBIt.dereferenceable() && + storeWBIt->valid() && + storeWBIt->canWB() && + ((!needsTSO) || (!storeInFlight)) && + lsq->cachePortAvailable(false)) { + + if (isStoreBlocked) { + DPRINTF(LSQUnit, "Unable to write back any more stores, cache" + " is blocked!\n"); + break; + } + + // Store didn't write any data so no need to write it back to + // memory. 
+ if (storeWBIt->size() == 0) {
+ /* It is important that the preincrement happens at (or before)
+ * the call, as the code of completeStore checks
+ * storeWBIt. */
+ completeStore(storeWBIt++);
+ continue;
+ }
+
+ if (storeWBIt->instruction()->isDataPrefetch()) {
+ storeWBIt++;
+ continue;
+ }
+
+ assert(storeWBIt->hasRequest());
+ assert(!storeWBIt->committed());
+
+ O3DynInstPtr inst = storeWBIt->instruction();
+ LSQRequest* req = storeWBIt->request();
+
+ // Process store conditionals or store release after all previous
+ // stores are completed
+ if ((req->mainRequest()->isLLSC() ||
+ req->mainRequest()->isRelease()) &&
+ (storeWBIt.idx() != storeQueue.head())) {
+ DPRINTF(LSQUnit, "Store idx:%i PC:%s to Addr:%#x "
+ "[sn:%lli] is %s%s and not head of the queue\n",
+ storeWBIt.idx(), inst->pcState(),
+ req->request()->getPaddr(), inst->seqNum,
+ req->mainRequest()->isLLSC() ? "SC" : "",
+ req->mainRequest()->isRelease() ? "/Release" : "");
+ break;
+ }
+
+ storeWBIt->committed() = true;
+
+ assert(!inst->memData);
+ inst->memData = new uint8_t[req->_size];
+
+ if (storeWBIt->isAllZeros())
+ memset(inst->memData, 0, req->_size);
+ else
+ memcpy(inst->memData, storeWBIt->data(), req->_size);
+
+
+ if (req->senderState() == nullptr) {
+ SQSenderState *state = new SQSenderState(storeWBIt);
+ state->isLoad = false;
+ state->needWB = false;
+ state->inst = inst;
+
+ req->senderState(state);
+ if (inst->isStoreConditional() || inst->isAtomic()) {
+ /* Only store conditionals and atomics need a writeback. */
+ state->needWB = true;
+ }
+ }
+ req->buildPackets();
+
+ DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%s "
+ "to Addr:%#x, data:%#x [sn:%lli]\n",
+ storeWBIt.idx(), inst->pcState(),
+ req->request()->getPaddr(), (int)*(inst->memData),
+ inst->seqNum);
+
+ // @todo: Remove this SC hack once the memory system handles it.
+ if (inst->isStoreConditional()) {
+ // Disable recording the result temporarily.
Writing to + // misc regs normally updates the result, but this is not + // the desired behavior when handling store conditionals. + inst->recordResult(false); + bool success = TheISA::handleLockedWrite(inst.get(), + req->request(), cacheBlockMask); + inst->recordResult(true); + req->packetSent(); + + if (!success) { + req->complete(); + // Instantly complete this store. + DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. " + "Instantly completing it.\n", + inst->seqNum); + PacketPtr new_pkt = new Packet(*req->packet()); + WritebackEvent *wb = new WritebackEvent(inst, + new_pkt, this); + cpu->schedule(wb, curTick() + 1); + completeStore(storeWBIt); + if (!storeQueue.empty()) + storeWBIt++; + else + storeWBIt = storeQueue.end(); + continue; + } + } + + if (req->request()->isLocalAccess()) { + assert(!inst->isStoreConditional()); + assert(!inst->inHtmTransactionalState()); + ThreadContext *thread = cpu->tcBase(lsqID); + PacketPtr main_pkt = new Packet(req->mainRequest(), + MemCmd::WriteReq); + main_pkt->dataStatic(inst->memData); + req->request()->localAccessor(thread, main_pkt); + delete main_pkt; + completeStore(storeWBIt); + storeWBIt++; + continue; + } + /* Send to cache */ + req->sendPacketToCache(); + + /* If successful, do the post send */ + if (req->isSent()) { + storePostSend(); + } else { + DPRINTF(LSQUnit, "D-Cache became blocked when writing [sn:%lli], " + "will retry later\n", + inst->seqNum); + } + } + assert(stores >= 0 && storesToWB >= 0); +} + +void +LSQUnit::squash(const InstSeqNum &squashed_num) +{ + DPRINTF(LSQUnit, "Squashing until [sn:%lli]!" 
+ "(Loads:%i Stores:%i)\n", squashed_num, loads, stores); + + while (loads != 0 && + loadQueue.back().instruction()->seqNum > squashed_num) { + DPRINTF(LSQUnit,"Load Instruction PC %s squashed, " + "[sn:%lli]\n", + loadQueue.back().instruction()->pcState(), + loadQueue.back().instruction()->seqNum); + + if (isStalled() && loadQueue.tail() == stallingLoadIdx) { + stalled = false; + stallingStoreIsn = 0; + stallingLoadIdx = 0; + } + + // hardware transactional memory + // Squashing instructions can alter the transaction nesting depth + // and must be corrected before fetching resumes. + if (loadQueue.back().instruction()->isHtmStart()) + { + htmStarts = (--htmStarts < 0) ? 0 : htmStarts; + DPRINTF(HtmCpu, ">> htmStarts-- (%d) : htmStops (%d)\n", + htmStarts, htmStops); + } + if (loadQueue.back().instruction()->isHtmStop()) + { + htmStops = (--htmStops < 0) ? 0 : htmStops; + DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n", + htmStarts, htmStops); + } + // Clear the smart pointer to make sure it is decremented. 
+ loadQueue.back().instruction()->setSquashed(); + loadQueue.back().clear(); + + --loads; + + loadQueue.pop_back(); + ++stats.squashedLoads; + } + + // hardware transactional memory + // scan load queue (from oldest to youngest) for most recent valid htmUid + auto scan_it = loadQueue.begin(); + uint64_t in_flight_uid = 0; + while (scan_it != loadQueue.end()) { + if (scan_it->instruction()->isHtmStart() && + !scan_it->instruction()->isSquashed()) { + in_flight_uid = scan_it->instruction()->getHtmTransactionUid(); + DPRINTF(HtmCpu, "loadQueue[%d]: found valid HtmStart htmUid=%u\n", + scan_it._idx, in_flight_uid); + } + scan_it++; + } + // If there's a HtmStart in the pipeline then use its htmUid, + // otherwise use the most recently committed uid + const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); + if (htm_cpt) { + const uint64_t old_local_htm_uid = htm_cpt->getHtmUid(); + uint64_t new_local_htm_uid; + if (in_flight_uid > 0) + new_local_htm_uid = in_flight_uid; + else + new_local_htm_uid = lastRetiredHtmUid; + + if (old_local_htm_uid != new_local_htm_uid) { + DPRINTF(HtmCpu, "flush: lastRetiredHtmUid=%u\n", + lastRetiredHtmUid); + DPRINTF(HtmCpu, "flush: resetting localHtmUid=%u\n", + new_local_htm_uid); + + htm_cpt->setHtmUid(new_local_htm_uid); + } + } + + if (memDepViolator && squashed_num < memDepViolator->seqNum) { + memDepViolator = NULL; + } + + while (stores != 0 && + storeQueue.back().instruction()->seqNum > squashed_num) { + // Instructions marked as can WB are already committed. + if (storeQueue.back().canWB()) { + break; + } + + DPRINTF(LSQUnit,"Store Instruction PC %s squashed, " + "idx:%i [sn:%lli]\n", + storeQueue.back().instruction()->pcState(), + storeQueue.tail(), storeQueue.back().instruction()->seqNum); + + // I don't think this can happen. It should have been cleared + // by the stalling load. 
+ if (isStalled() && + storeQueue.back().instruction()->seqNum == stallingStoreIsn) { + panic("Is stalled should have been cleared by stalling load!\n"); + stalled = false; + stallingStoreIsn = 0; + } + + // Clear the smart pointer to make sure it is decremented. + storeQueue.back().instruction()->setSquashed(); + + // Must delete request now that it wasn't handed off to + // memory. This is quite ugly. @todo: Figure out the proper + // place to really handle request deletes. + storeQueue.back().clear(); + --stores; + + storeQueue.pop_back(); + ++stats.squashedStores; + } +} + +void +LSQUnit::storePostSend() +{ + if (isStalled() && + storeWBIt->instruction()->seqNum == stallingStoreIsn) { + DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " + "load idx:%i\n", + stallingStoreIsn, stallingLoadIdx); + stalled = false; + stallingStoreIsn = 0; + iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction()); + } + + if (!storeWBIt->instruction()->isStoreConditional()) { + // The store is basically completed at this time. This + // only works so long as the checker doesn't try to + // verify the value in memory for stores. + storeWBIt->instruction()->setCompleted(); + + if (cpu->checker) { + cpu->checker->verify(storeWBIt->instruction()); + } + } + + if (needsTSO) { + storeInFlight = true; + } + + storeWBIt++; +} + +void +LSQUnit::writeback(const O3DynInstPtr &inst, PacketPtr pkt) +{ + iewStage->wakeCPU(); + + // Squashed instructions do not need to complete their access. + if (inst->isSquashed()) { + assert (!inst->isStore() || inst->isStoreConditional()); + ++stats.ignoredResponses; + return; + } + + if (!inst->isExecuted()) { + inst->setExecuted(); + + if (inst->fault == NoFault) { + // Complete access to copy data to proper place. + inst->completeAcc(pkt); + } else { + // If the instruction has an outstanding fault, we cannot complete + // the access as this discards the current fault. 
+ + // If we have an outstanding fault, the fault should only be of + // type ReExec or - in case of a SplitRequest - a partial + // translation fault + + // Unless it's a hardware transactional memory fault + auto htm_fault = std::dynamic_pointer_cast< + GenericHtmFailureFault>(inst->fault); + + if (!htm_fault) { + assert(dynamic_cast(inst->fault.get()) != nullptr || + inst->savedReq->isPartialFault()); + + } else if (!pkt->htmTransactionFailedInCache()) { + // Situation in which the instruction has a hardware + // transactional memory fault but not the packet itself. This + // can occur with ldp_uop microops since access is spread over + // multiple packets. + DPRINTF(HtmCpu, + "%s writeback with HTM failure fault, " + "however, completing packet is not aware of " + "transaction failure. cause=%s htmUid=%u\n", + inst->staticInst->getName(), + htmFailureToStr(htm_fault->getHtmFailureFaultCause()), + htm_fault->getHtmUid()); + } + + DPRINTF(LSQUnit, "Not completing instruction [sn:%lli] access " + "due to pending fault.\n", inst->seqNum); + } + } + + // Need to insert instruction into queue to commit + iewStage->instToCommit(inst); + + iewStage->activityThisCycle(); + + // see if this load changed the PC + iewStage->checkMisprediction(inst); +} + +void +LSQUnit::completeStore(typename StoreQueue::iterator store_idx) +{ + assert(store_idx->valid()); + store_idx->completed() = true; + --storesToWB; + // A bit conservative because a store completion may not free up entries, + // but hopefully avoids two store completions in one cycle from making + // the CPU tick twice. + cpu->wakeCPU(); + cpu->activityThisCycle(); + + /* We 'need' a copy here because we may clear the entry from the + * store queue. 
*/ + O3DynInstPtr store_inst = store_idx->instruction(); + if (store_idx == storeQueue.begin()) { + do { + storeQueue.front().clear(); + storeQueue.pop_front(); + --stores; + } while (storeQueue.front().completed() && + !storeQueue.empty()); + + iewStage->updateLSQNextCycle = true; + } + + DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " + "idx:%i\n", + store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1); + +#if TRACING_ON + if (Debug::O3PipeView) { + store_inst->storeTick = + curTick() - store_inst->fetchTick; + } +#endif + + if (isStalled() && + store_inst->seqNum == stallingStoreIsn) { + DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " + "load idx:%i\n", + stallingStoreIsn, stallingLoadIdx); + stalled = false; + stallingStoreIsn = 0; + iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction()); + } + + store_inst->setCompleted(); + + if (needsTSO) { + storeInFlight = false; + } + + // Tell the checker we've completed this instruction. Some stores + // may get reported twice to the checker, but the checker can + // handle that case. 
+ // Store conditionals cannot be sent to the checker yet, they have + // to update the misc registers first which should take place + // when they commit + if (cpu->checker && !store_inst->isStoreConditional()) { + cpu->checker->verify(store_inst); + } +} + +bool +LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt) +{ + bool ret = true; + bool cache_got_blocked = false; + + auto state = dynamic_cast(data_pkt->senderState); + + if (!lsq->cacheBlocked() && + lsq->cachePortAvailable(isLoad)) { + if (!dcachePort->sendTimingReq(data_pkt)) { + ret = false; + cache_got_blocked = true; + } + } else { + ret = false; + } + + if (ret) { + if (!isLoad) { + isStoreBlocked = false; + } + lsq->cachePortBusy(isLoad); + state->outstanding++; + state->request()->packetSent(); + } else { + if (cache_got_blocked) { + lsq->cacheBlocked(true); + ++stats.blockedByCache; + } + if (!isLoad) { + assert(state->request() == storeWBIt->request()); + isStoreBlocked = true; + } + state->request()->packetNotSent(); + } + return ret; +} + +void +LSQUnit::recvRetry() +{ + if (isStoreBlocked) { + DPRINTF(LSQUnit, "Receiving retry: blocked store\n"); + writebackBlockedStore(); + } +} + +void +LSQUnit::dumpInsts() const +{ + cprintf("Load store queue: Dumping instructions.\n"); + cprintf("Load queue size: %i\n", loads); + cprintf("Load queue: "); + + for (const auto& e: loadQueue) { + const O3DynInstPtr &inst(e.instruction()); + cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum); + } + cprintf("\n"); + + cprintf("Store queue size: %i\n", stores); + cprintf("Store queue: "); + + for (const auto& e: storeQueue) { + const O3DynInstPtr &inst(e.instruction()); + cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum); + } + + cprintf("\n"); +} + +unsigned int +LSQUnit::cacheLineSize() +{ + return cpu->cacheLineSize(); +} + +Fault +LSQUnit::read(LSQRequest *req, int load_idx) +{ + LQEntry& load_req = loadQueue[load_idx]; + const O3DynInstPtr& load_inst = load_req.instruction(); + + 
load_req.setRequest(req);
+ assert(load_inst);
+
+ assert(!load_inst->isExecuted());
+
+ // Make sure this isn't a strictly ordered load
+ // A bit of a hackish way to get strictly ordered accesses to work
+ // only if they're at the head of the LSQ and are ready to commit
+ // (at the head of the ROB too).
+
+ if (req->mainRequest()->isStrictlyOrdered() &&
+ (load_idx != loadQueue.head() || !load_inst->isAtCommit())) {
+ // Tell IQ/mem dep unit that this instruction will need to be
+ // rescheduled eventually
+ iewStage->rescheduleMemInst(load_inst);
+ load_inst->clearIssued();
+ load_inst->effAddrValid(false);
+ ++stats.rescheduledLoads;
+ DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n",
+ load_inst->seqNum, load_inst->pcState());
+
+ // Must delete request now that it wasn't handed off to
+ // memory. This is quite ugly. @todo: Figure out the proper
+ // place to really handle request deletes.
+ load_req.setRequest(nullptr);
+ req->discard();
+ return std::make_shared<GenericISA::M5PanicFault>(
+ "Strictly ordered load [sn:%llx] PC %s\n",
+ load_inst->seqNum, load_inst->pcState());
+ }
+
+ DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
+ "storeHead: %i addr: %#x%s\n",
+ load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1,
+ req->mainRequest()->getPaddr(), req->isSplit() ? " split" : "");
+
+ if (req->mainRequest()->isLLSC()) {
+ // Disable recording the result temporarily. Writing to misc
+ // regs normally updates the result, but this is not the
+ // desired behavior when handling store conditionals.
+ load_inst->recordResult(false); + TheISA::handleLockedRead(load_inst.get(), req->mainRequest()); + load_inst->recordResult(true); + } + + if (req->mainRequest()->isLocalAccess()) { + assert(!load_inst->memData); + assert(!load_inst->inHtmTransactionalState()); + load_inst->memData = new uint8_t[MaxDataBytes]; + + ThreadContext *thread = cpu->tcBase(lsqID); + PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq); + + main_pkt->dataStatic(load_inst->memData); + + Cycles delay = req->mainRequest()->localAccessor(thread, main_pkt); + + WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); + cpu->schedule(wb, cpu->clockEdge(delay)); + return NoFault; + } + + // hardware transactional memory + if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit()) + { + // don't want to send nested transactionStarts and + // transactionStops outside of core, e.g. to Ruby + if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) { + Cycles delay(0); + PacketPtr data_pkt = + new Packet(req->mainRequest(), MemCmd::ReadReq); + + // Allocate memory if this is the first time a load is issued. 
+ if (!load_inst->memData) {
+ load_inst->memData =
+ new uint8_t[req->mainRequest()->getSize()];
+ // sanity checks expect zero in request's data
+ memset(load_inst->memData, 0, req->mainRequest()->getSize());
+ }
+
+ data_pkt->dataStatic(load_inst->memData);
+ if (load_inst->inHtmTransactionalState()) {
+ data_pkt->setHtmTransactional(
+ load_inst->getHtmTransactionUid());
+ }
+ data_pkt->makeResponse();
+
+ WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
+ cpu->schedule(wb, cpu->clockEdge(delay));
+ return NoFault;
+ }
+ }
+
+ // Check the SQ for any previous stores that might lead to forwarding
+ auto store_it = load_inst->sqIt;
+ assert (store_it >= storeWBIt);
+ // End once we've reached the top of the LSQ
+ while (store_it != storeWBIt) {
+ // Move the index to one younger
+ store_it--;
+ assert(store_it->valid());
+ assert(store_it->instruction()->seqNum < load_inst->seqNum);
+ int store_size = store_it->size();
+
+ // Cache maintenance instructions go down via the store
+ // path but they carry no data and they shouldn't be
+ // considered for forwarding
+ if (store_size != 0 && !store_it->instruction()->strictlyOrdered() &&
+ !(store_it->request()->mainRequest() &&
+ store_it->request()->mainRequest()->isCacheMaintenance())) {
+ assert(store_it->instruction()->effAddrValid());
+
+ // Check if the store data is within the lower and upper bounds of
+ // addresses that the request needs.
+ auto req_s = req->mainRequest()->getVaddr(); + auto req_e = req_s + req->mainRequest()->getSize(); + auto st_s = store_it->instruction()->effAddr; + auto st_e = st_s + store_size; + + bool store_has_lower_limit = req_s >= st_s; + bool store_has_upper_limit = req_e <= st_e; + bool lower_load_has_store_part = req_s < st_e; + bool upper_load_has_store_part = req_e > st_s; + + auto coverage = AddrRangeCoverage::NoAddrRangeCoverage; + + // If the store entry is not atomic (atomic does not have valid + // data), the store has all of the data needed, and + // the load is not LLSC, then + // we can forward data from the store to the load + if (!store_it->instruction()->isAtomic() && + store_has_lower_limit && store_has_upper_limit && + !req->mainRequest()->isLLSC()) { + + const auto& store_req = store_it->request()->mainRequest(); + coverage = store_req->isMasked() ? + AddrRangeCoverage::PartialAddrRangeCoverage : + AddrRangeCoverage::FullAddrRangeCoverage; + } else if ( + // This is the partial store-load forwarding case where a store + // has only part of the load's data and the load isn't LLSC + (!req->mainRequest()->isLLSC() && + ((store_has_lower_limit && lower_load_has_store_part) || + (store_has_upper_limit && upper_load_has_store_part) || + (lower_load_has_store_part && upper_load_has_store_part))) || + // The load is LLSC, and the store has all or part of the + // load's data + (req->mainRequest()->isLLSC() && + ((store_has_lower_limit || upper_load_has_store_part) && + (store_has_upper_limit || lower_load_has_store_part))) || + // The store entry is atomic and has all or part of the load's + // data + (store_it->instruction()->isAtomic() && + ((store_has_lower_limit || upper_load_has_store_part) && + (store_has_upper_limit || lower_load_has_store_part)))) { + + coverage = AddrRangeCoverage::PartialAddrRangeCoverage; + } + + if (coverage == AddrRangeCoverage::FullAddrRangeCoverage) { + // Get shift amount for offset into the store's data. 
+ int shift_amt = req->mainRequest()->getVaddr() - + store_it->instruction()->effAddr; + + // Allocate memory if this is the first time a load is issued. + if (!load_inst->memData) { + load_inst->memData = + new uint8_t[req->mainRequest()->getSize()]; + } + if (store_it->isAllZeros()) + memset(load_inst->memData, 0, + req->mainRequest()->getSize()); + else + memcpy(load_inst->memData, + store_it->data() + shift_amt, + req->mainRequest()->getSize()); + + DPRINTF(LSQUnit, "Forwarding from store idx %i to load to " + "addr %#x\n", store_it._idx, + req->mainRequest()->getVaddr()); + + PacketPtr data_pkt = new Packet(req->mainRequest(), + MemCmd::ReadReq); + data_pkt->dataStatic(load_inst->memData); + + // hardware transactional memory + // Store to load forwarding within a transaction + // This should be okay because the store will be sent to + // the memory subsystem and subsequently get added to the + // write set of the transaction. The write set has a stronger + // property than the read set, so the load doesn't necessarily + // have to be there. + assert(!req->mainRequest()->isHTMCmd()); + if (load_inst->inHtmTransactionalState()) { + assert (!storeQueue[store_it._idx].completed()); + assert ( + storeQueue[store_it._idx].instruction()-> + inHtmTransactionalState()); + assert ( + load_inst->getHtmTransactionUid() == + storeQueue[store_it._idx].instruction()-> + getHtmTransactionUid()); + data_pkt->setHtmTransactional( + load_inst->getHtmTransactionUid()); + DPRINTF(HtmCpu, "HTM LD (ST2LDF) " + "pc=0x%lx - vaddr=0x%lx - " + "paddr=0x%lx - htmUid=%u\n", + load_inst->instAddr(), + data_pkt->req->hasVaddr() ? + data_pkt->req->getVaddr() : 0lu, + data_pkt->getAddr(), + load_inst->getHtmTransactionUid()); + } + + if (req->isAnyOutstandingRequest()) { + assert(req->_numOutstandingPackets > 0); + // There are memory requests packets in flight already. + // This may happen if the store was not complete the + // first time this load got executed. 
Signal the senderState
+ // that response packets should be discarded.
+ req->discardSenderState();
+ }
+
+ WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt,
+ this);
+
+ // We'll say this has a 1 cycle load-store forwarding latency
+ // for now.
+ // @todo: Need to make this a parameter.
+ cpu->schedule(wb, curTick());
+
+ // Don't need to do anything special for split loads.
+ ++stats.forwLoads;
+
+ return NoFault;
+ } else if (
+ coverage == AddrRangeCoverage::PartialAddrRangeCoverage) {
+ // If it's already been written back, then don't worry about
+ // stalling on it.
+ if (store_it->completed()) {
+ panic("Should not check one of these");
+ continue;
+ }
+
+ // Must stall load and force it to retry, so long as it's the
+ // oldest load that needs to do so.
+ if (!stalled ||
+ (stalled &&
+ load_inst->seqNum <
+ loadQueue[stallingLoadIdx].instruction()->seqNum)) {
+ stalled = true;
+ stallingStoreIsn = store_it->instruction()->seqNum;
+ stallingLoadIdx = load_idx;
+ }
+
+ // Tell IQ/mem dep unit that this instruction will need to be
+ // rescheduled eventually
+ iewStage->rescheduleMemInst(load_inst);
+ load_inst->clearIssued();
+ load_inst->effAddrValid(false);
+ ++stats.rescheduledLoads;
+
+ // Do not generate a writeback event as this instruction is not
+ // complete.
+ DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
+ "Store idx %i to load addr %#x\n",
+ store_it._idx, req->mainRequest()->getVaddr());
+
+ // Must discard the request.
+ req->discard();
+ load_req.setRequest(nullptr);
+ return NoFault;
+ }
+ }
+ }
+
+ // If there's no forwarding case, then go access memory
+ DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n",
+ load_inst->seqNum, load_inst->pcState());
+
+ // Allocate memory if this is the first time a load is issued.
+ if (!load_inst->memData) { + load_inst->memData = new uint8_t[req->mainRequest()->getSize()]; + } + + + // hardware transactional memory + if (req->mainRequest()->isHTMCmd()) { + // this is a simple sanity check + // the Ruby cache controller will set + // memData to 0x0ul if successful. + *load_inst->memData = (uint64_t) 0x1ull; + } + + // For now, load throughput is constrained by the number of + // load FUs only, and loads do not consume a cache port (only + // stores do). + // @todo We should account for cache port contention + // and arbitrate between loads and stores. + + // if we the cache is not blocked, do cache access + if (req->senderState() == nullptr) { + LQSenderState *state = new LQSenderState( + loadQueue.getIterator(load_idx)); + state->isLoad = true; + state->inst = load_inst; + state->isSplit = req->isSplit(); + req->senderState(state); + } + req->buildPackets(); + req->sendPacketToCache(); + if (!req->isSent()) + iewStage->blockMemInst(load_inst); + + return NoFault; +} + +Fault +LSQUnit::write(LSQRequest *req, uint8_t *data, int store_idx) +{ + assert(storeQueue[store_idx].valid()); + + DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i " + "[sn:%llu]\n", + store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1, + storeQueue[store_idx].instruction()->seqNum); + + storeQueue[store_idx].setRequest(req); + unsigned size = req->_size; + storeQueue[store_idx].size() = size; + bool store_no_data = + req->mainRequest()->getFlags() & Request::STORE_NO_DATA; + storeQueue[store_idx].isAllZeros() = store_no_data; + assert(size <= SQEntry::DataSize || store_no_data); + + // copy data into the storeQueue only if the store request has valid data + if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) && + !req->request()->isCacheMaintenance() && + !req->request()->isAtomic()) + memcpy(storeQueue[store_idx].data(), data, size); + + // This function only writes the data to the store queue, so no fault + // can happen 
here. + return NoFault; +} + +InstSeqNum +LSQUnit::getLoadHeadSeqNum() +{ + if (loadQueue.front().valid()) + return loadQueue.front().instruction()->seqNum; + else + return 0; +} + +InstSeqNum +LSQUnit::getStoreHeadSeqNum() +{ + if (storeQueue.front().valid()) + return storeQueue.front().instruction()->seqNum; + else + return 0; +} diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 704a20be76..0ad7203189 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -52,9 +52,12 @@ #include "arch/generic/vec_reg.hh" #include "arch/locked_mem.hh" #include "config/the_isa.hh" +#include "cpu/base.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/comm.hh" +#include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/impl.hh" #include "cpu/o3/lsq.hh" #include "cpu/timebuf.hh" #include "debug/HtmCpu.hh" @@ -80,14 +83,13 @@ class DefaultIEW; * the LSQ until the store writes back. At that point the load is * replayed. */ -template class LSQUnit { public: static constexpr auto MaxDataBytes = MaxVecRegLenInBytes; - using LSQSenderState = typename LSQ::LSQSenderState; - using LSQRequest = typename LSQ::LSQRequest; + using LSQSenderState = typename LSQ::LSQSenderState; + using LSQRequest = typename LSQ::LSQRequest; private: class LSQEntry { @@ -123,10 +125,10 @@ class LSQUnit } void - set(const O3DynInstPtr& inst) + set(const O3DynInstPtr& new_inst) { assert(!_valid); - this->inst = inst; + inst = new_inst; _valid = true; _size = 0; } @@ -223,8 +225,9 @@ class LSQUnit } /** Initializes the LSQ unit with the specified number of entries. */ - void init(FullO3CPU *cpu_ptr, DefaultIEW *iew_ptr, - const DerivO3CPUParams ¶ms, LSQ *lsq_ptr, unsigned id); + void init(FullO3CPU *cpu_ptr, DefaultIEW *iew_ptr, + const DerivO3CPUParams ¶ms, LSQ *lsq_ptr, + unsigned id); /** Returns the name of the LSQ unit. */ std::string name() const; @@ -396,13 +399,13 @@ class LSQUnit private: /** Pointer to the CPU. 
*/ - FullO3CPU *cpu; + FullO3CPU *cpu; /** Pointer to the IEW stage. */ - DefaultIEW *iewStage; + DefaultIEW *iewStage; /** Pointer to the LSQ. */ - LSQ *lsq; + LSQ *lsq; /** Pointer to the dcache port. Used only for sending. */ RequestPort *dcachePort; @@ -466,7 +469,7 @@ class LSQUnit PacketPtr pkt; /** The pointer to the LSQ unit that issued the store. */ - LSQUnit *lsqPtr; + LSQUnit *lsqPtr; }; public: @@ -543,13 +546,6 @@ class LSQUnit /** The oldest load that caused a memory ordering violation. */ O3DynInstPtr memDepViolator; - /** Whether or not there is a packet that couldn't be sent because of - * a lack of cache ports. */ - bool hasPendingRequest; - - /** The packet that is pending free cache ports. */ - LSQRequest* pendingRequest; - /** Flag for memory model. */ bool needsTSO; @@ -595,24 +591,12 @@ class LSQUnit int getLoadHead() { return loadQueue.head(); } /** Returns the sequence number of the head load instruction. */ - InstSeqNum - getLoadHeadSeqNum() - { - return loadQueue.front().valid() - ? loadQueue.front().instruction()->seqNum - : 0; - } + InstSeqNum getLoadHeadSeqNum(); /** Returns the index of the head store instruction. */ int getStoreHead() { return storeQueue.head(); } /** Returns the sequence number of the head store instruction. */ - InstSeqNum - getStoreHeadSeqNum() - { - return storeQueue.front().valid() - ? storeQueue.front().instruction()->seqNum - : 0; - } + InstSeqNum getStoreHeadSeqNum(); /** Returns whether or not the LSQ unit is stalled. */ bool isStalled() { return stalled; } diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh deleted file mode 100644 index df789ce6b3..0000000000 --- a/src/cpu/o3/lsq_unit_impl.hh +++ /dev/null @@ -1,1659 +0,0 @@ - -/* - * Copyright (c) 2010-2014, 2017-2020 ARM Limited - * Copyright (c) 2013 Advanced Micro Devices, Inc. 
- * All rights reserved - * - * The license below extends only to copyright in the software and shall - * not be construed as granting a license to any other intellectual - * property including but not limited to intellectual property relating - * to a hardware implementation of the functionality of the software - * licensed hereunder. You may use the software subject to the license - * terms below provided that you ensure that this notice is replicated - * unmodified and in its entirety in all distributions of the software, - * modified or unmodified, in source code or in binary form. - * - * Copyright (c) 2004-2005 The Regents of The University of Michigan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __CPU_O3_LSQ_UNIT_IMPL_HH__ -#define __CPU_O3_LSQ_UNIT_IMPL_HH__ - -#include "arch/generic/debugfaults.hh" -#include "arch/locked_mem.hh" -#include "base/str.hh" -#include "config/the_isa.hh" -#include "cpu/checker/cpu.hh" -#include "cpu/o3/limits.hh" -#include "cpu/o3/lsq.hh" -#include "cpu/o3/lsq_unit.hh" -#include "debug/Activity.hh" -#include "debug/HtmCpu.hh" -#include "debug/IEW.hh" -#include "debug/LSQUnit.hh" -#include "debug/O3PipeView.hh" -#include "mem/packet.hh" -#include "mem/request.hh" - -template -LSQUnit::WritebackEvent::WritebackEvent(const O3DynInstPtr &_inst, - PacketPtr _pkt, LSQUnit *lsq_ptr) - : Event(Default_Pri, AutoDelete), - inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr) -{ - assert(_inst->savedReq); - _inst->savedReq->writebackScheduled(); -} - -template -void -LSQUnit::WritebackEvent::process() -{ - assert(!lsqPtr->cpu->switchedOut()); - - lsqPtr->writeback(inst, pkt); - - assert(inst->savedReq); - inst->savedReq->writebackDone(); - delete pkt; -} - -template -const char * -LSQUnit::WritebackEvent::description() const -{ - return "Store writeback"; -} - -template -bool -LSQUnit::recvTimingResp(PacketPtr pkt) -{ - auto senderState = dynamic_cast(pkt->senderState); - LSQRequest* req = senderState->request(); - assert(req != nullptr); - bool ret = true; - /* Check that the request is still alive before any further action. 
*/ - if (senderState->alive()) { - ret = req->recvTimingResp(pkt); - } else { - senderState->outstanding--; - } - return ret; - -} - -template -void -LSQUnit::completeDataAccess(PacketPtr pkt) -{ - LSQSenderState *state = dynamic_cast(pkt->senderState); - O3DynInstPtr inst = state->inst; - - // hardware transactional memory - // sanity check - if (pkt->isHtmTransactional() && !inst->isSquashed()) { - assert(inst->getHtmTransactionUid() == pkt->getHtmTransactionUid()); - } - - // if in a HTM transaction, it's possible - // to abort within the cache hierarchy. - // This is signalled back to the processor - // through responses to memory requests. - if (pkt->htmTransactionFailedInCache()) { - // cannot do this for write requests because - // they cannot tolerate faults - const HtmCacheFailure htm_rc = - pkt->getHtmTransactionFailedInCacheRC(); - if(pkt->isWrite()) { - DPRINTF(HtmCpu, - "store notification (ignored) of HTM transaction failure " - "in cache - addr=0x%lx - rc=%s - htmUid=%d\n", - pkt->getAddr(), htmFailureToStr(htm_rc), - pkt->getHtmTransactionUid()); - } else { - HtmFailureFaultCause fail_reason = - HtmFailureFaultCause::INVALID; - - if (htm_rc == HtmCacheFailure::FAIL_SELF) { - fail_reason = HtmFailureFaultCause::SIZE; - } else if (htm_rc == HtmCacheFailure::FAIL_REMOTE) { - fail_reason = HtmFailureFaultCause::MEMORY; - } else if (htm_rc == HtmCacheFailure::FAIL_OTHER) { - // these are likely loads that were issued out of order - // they are faulted here, but it's unlikely that these will - // ever reach the commit head. 
- fail_reason = HtmFailureFaultCause::OTHER; - } else { - panic("HTM error - unhandled return code from cache (%s)", - htmFailureToStr(htm_rc)); - } - - inst->fault = - std::make_shared( - inst->getHtmTransactionUid(), - fail_reason); - - DPRINTF(HtmCpu, - "load notification of HTM transaction failure " - "in cache - pc=%s - addr=0x%lx - " - "rc=%u - htmUid=%d\n", - inst->pcState(), pkt->getAddr(), - htmFailureToStr(htm_rc), pkt->getHtmTransactionUid()); - } - } - - cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt)); - - /* Notify the sender state that the access is complete (for ownership - * tracking). */ - state->complete(); - - assert(!cpu->switchedOut()); - if (!inst->isSquashed()) { - if (state->needWB) { - // Only loads, store conditionals and atomics perform the writeback - // after receving the response from the memory - assert(inst->isLoad() || inst->isStoreConditional() || - inst->isAtomic()); - - // hardware transactional memory - if (pkt->htmTransactionFailedInCache()) { - state->request()->mainPacket()->setHtmTransactionFailedInCache( - pkt->getHtmTransactionFailedInCacheRC() ); - } - - writeback(inst, state->request()->mainPacket()); - if (inst->isStore() || inst->isAtomic()) { - auto ss = dynamic_cast(state); - ss->writebackDone(); - completeStore(ss->idx); - } - } else if (inst->isStore()) { - // This is a regular store (i.e., not store conditionals and - // atomics), so it can complete without writing back - completeStore(dynamic_cast(state)->idx); - } - } -} - -template -LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries) - : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1), - loads(0), stores(0), storesToWB(0), - htmStarts(0), htmStops(0), - lastRetiredHtmUid(0), - cacheBlockMask(0), stalled(false), - isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false), - pendingRequest(nullptr), stats(nullptr) -{ -} - -template -void -LSQUnit::init(FullO3CPU *cpu_ptr, DefaultIEW *iew_ptr, - const DerivO3CPUParams ¶ms, 
LSQ *lsq_ptr, unsigned id) -{ - lsqID = id; - - cpu = cpu_ptr; - iewStage = iew_ptr; - - lsq = lsq_ptr; - - cpu->addStatGroup(csprintf("lsq%i", lsqID).c_str(), &stats); - - DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",lsqID); - - depCheckShift = params.LSQDepCheckShift; - checkLoads = params.LSQCheckLoads; - needsTSO = params.needsTSO; - - resetState(); -} - - -template -void -LSQUnit::resetState() -{ - loads = stores = storesToWB = 0; - - // hardware transactional memory - // nesting depth - htmStarts = htmStops = 0; - - storeWBIt = storeQueue.begin(); - - retryPkt = NULL; - memDepViolator = NULL; - - stalled = false; - - cacheBlockMask = ~(cpu->cacheLineSize() - 1); -} - -template -std::string -LSQUnit::name() const -{ - if (O3MaxThreads == 1) { - return iewStage->name() + ".lsq"; - } else { - return iewStage->name() + ".lsq.thread" + std::to_string(lsqID); - } -} - -template -LSQUnit::LSQUnitStats::LSQUnitStats(Stats::Group *parent) - : Stats::Group(parent), - ADD_STAT(forwLoads, Stats::Units::Count::get(), - "Number of loads that had data forwarded from stores"), - ADD_STAT(squashedLoads, Stats::Units::Count::get(), - "Number of loads squashed"), - ADD_STAT(ignoredResponses, Stats::Units::Count::get(), - "Number of memory responses ignored because the instruction is " - "squashed"), - ADD_STAT(memOrderViolation, Stats::Units::Count::get(), - "Number of memory ordering violations"), - ADD_STAT(squashedStores, Stats::Units::Count::get(), - "Number of stores squashed"), - ADD_STAT(rescheduledLoads, Stats::Units::Count::get(), - "Number of loads that were rescheduled"), - ADD_STAT(blockedByCache, Stats::Units::Count::get(), - "Number of times an access to memory failed due to the cache " - "being blocked") -{ -} - -template -void -LSQUnit::setDcachePort(RequestPort *dcache_port) -{ - dcachePort = dcache_port; -} - -template -void -LSQUnit::drainSanityCheck() const -{ - for (int i = 0; i < loadQueue.capacity(); ++i) - assert(!loadQueue[i].valid()); - - 
assert(storesToWB == 0); - assert(!retryPkt); -} - -template -void -LSQUnit::takeOverFrom() -{ - resetState(); -} - -template -void -LSQUnit::insert(const O3DynInstPtr &inst) -{ - assert(inst->isMemRef()); - - assert(inst->isLoad() || inst->isStore() || inst->isAtomic()); - - if (inst->isLoad()) { - insertLoad(inst); - } else { - insertStore(inst); - } - - inst->setInLSQ(); -} - -template -void -LSQUnit::insertLoad(const O3DynInstPtr &load_inst) -{ - assert(!loadQueue.full()); - assert(loads < loadQueue.capacity()); - - DPRINTF(LSQUnit, "Inserting load PC %s, idx:%i [sn:%lli]\n", - load_inst->pcState(), loadQueue.tail(), load_inst->seqNum); - - /* Grow the queue. */ - loadQueue.advance_tail(); - - load_inst->sqIt = storeQueue.end(); - - assert(!loadQueue.back().valid()); - loadQueue.back().set(load_inst); - load_inst->lqIdx = loadQueue.tail(); - assert(load_inst->lqIdx > 0); - load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx); - - ++loads; - - // hardware transactional memory - // transactional state and nesting depth must be tracked - // in the in-order part of the core. 
- if (load_inst->isHtmStart()) { - htmStarts++; - DPRINTF(HtmCpu, ">> htmStarts++ (%d) : htmStops (%d)\n", - htmStarts, htmStops); - - const int htm_depth = htmStarts - htmStops; - const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); - auto htm_uid = htm_cpt->getHtmUid(); - - // for debugging purposes - if (!load_inst->inHtmTransactionalState()) { - htm_uid = htm_cpt->newHtmUid(); - DPRINTF(HtmCpu, "generating new htmUid=%u\n", htm_uid); - if (htm_depth != 1) { - DPRINTF(HtmCpu, - "unusual HTM transactional depth (%d)" - " possibly caused by mispeculation - htmUid=%u\n", - htm_depth, htm_uid); - } - } - load_inst->setHtmTransactionalState(htm_uid, htm_depth); - } - - if (load_inst->isHtmStop()) { - htmStops++; - DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops++ (%d)\n", - htmStarts, htmStops); - - if (htmStops==1 && htmStarts==0) { - DPRINTF(HtmCpu, - "htmStops==1 && htmStarts==0. " - "This generally shouldn't happen " - "(unless due to misspeculation)\n"); - } - } -} - -template -void -LSQUnit::insertStore(const O3DynInstPtr& store_inst) -{ - // Make sure it is not full before inserting an instruction. - assert(!storeQueue.full()); - assert(stores < storeQueue.capacity()); - - DPRINTF(LSQUnit, "Inserting store PC %s, idx:%i [sn:%lli]\n", - store_inst->pcState(), storeQueue.tail(), store_inst->seqNum); - storeQueue.advance_tail(); - - store_inst->sqIdx = storeQueue.tail(); - store_inst->lqIdx = loadQueue.tail() + 1; - assert(store_inst->lqIdx > 0); - store_inst->lqIt = loadQueue.end(); - - storeQueue.back().set(store_inst); - - ++stores; -} - -template -O3DynInstPtr -LSQUnit::getMemDepViolator() -{ - O3DynInstPtr temp = memDepViolator; - - memDepViolator = NULL; - - return temp; -} - -template -unsigned -LSQUnit::numFreeLoadEntries() -{ - //LQ has an extra dummy entry to differentiate - //empty/full conditions. Subtract 1 from the free entries. 
- DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n", - 1 + loadQueue.capacity(), loads); - return loadQueue.capacity() - loads; -} - -template -unsigned -LSQUnit::numFreeStoreEntries() -{ - //SQ has an extra dummy entry to differentiate - //empty/full conditions. Subtract 1 from the free entries. - DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n", - 1 + storeQueue.capacity(), stores); - return storeQueue.capacity() - stores; - - } - -template -void -LSQUnit::checkSnoop(PacketPtr pkt) -{ - // Should only ever get invalidations in here - assert(pkt->isInvalidate()); - - DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr()); - - for (int x = 0; x < cpu->numContexts(); x++) { - ThreadContext *tc = cpu->getContext(x); - bool no_squash = cpu->thread[x]->noSquashFromTC; - cpu->thread[x]->noSquashFromTC = true; - TheISA::handleLockedSnoop(tc, pkt, cacheBlockMask); - cpu->thread[x]->noSquashFromTC = no_squash; - } - - if (loadQueue.empty()) - return; - - auto iter = loadQueue.begin(); - - Addr invalidate_addr = pkt->getAddr() & cacheBlockMask; - - O3DynInstPtr ld_inst = iter->instruction(); - assert(ld_inst); - LSQRequest *req = iter->request(); - - // Check that this snoop didn't just invalidate our lock flag - if (ld_inst->effAddrValid() && - req->isCacheBlockHit(invalidate_addr, cacheBlockMask) - && ld_inst->memReqFlags & Request::LLSC) - TheISA::handleLockedSnoopHit(ld_inst.get()); - - bool force_squash = false; - - while (++iter != loadQueue.end()) { - ld_inst = iter->instruction(); - assert(ld_inst); - req = iter->request(); - if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) - continue; - - DPRINTF(LSQUnit, "-- inst [sn:%lli] to pktAddr:%#x\n", - ld_inst->seqNum, invalidate_addr); - - if (force_squash || - req->isCacheBlockHit(invalidate_addr, cacheBlockMask)) { - if (needsTSO) { - // If we have a TSO system, as all loads must be ordered with - // all other loads, this load as well as *all* subsequent loads - // need to be squashed to 
prevent possible load reordering. - force_squash = true; - } - if (ld_inst->possibleLoadViolation() || force_squash) { - DPRINTF(LSQUnit, "Conflicting load at addr %#x [sn:%lli]\n", - pkt->getAddr(), ld_inst->seqNum); - - // Mark the load for re-execution - ld_inst->fault = std::make_shared(); - req->setStateToFault(); - } else { - DPRINTF(LSQUnit, "HitExternal Snoop for addr %#x [sn:%lli]\n", - pkt->getAddr(), ld_inst->seqNum); - - // Make sure that we don't lose a snoop hitting a LOCKED - // address since the LOCK* flags don't get updated until - // commit. - if (ld_inst->memReqFlags & Request::LLSC) - TheISA::handleLockedSnoopHit(ld_inst.get()); - - // If a older load checks this and it's true - // then we might have missed the snoop - // in which case we need to invalidate to be sure - ld_inst->hitExternalSnoop(true); - } - } - } - return; -} - -template -Fault -LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, - const O3DynInstPtr& inst) -{ - Addr inst_eff_addr1 = inst->effAddr >> depCheckShift; - Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift; - - /** @todo in theory you only need to check an instruction that has executed - * however, there isn't a good way in the pipeline at the moment to check - * all instructions that will execute before the store writes back. Thus, - * like the implementation that came before it, we're overly conservative. 
- */ - while (loadIt != loadQueue.end()) { - O3DynInstPtr ld_inst = loadIt->instruction(); - if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { - ++loadIt; - continue; - } - - Addr ld_eff_addr1 = ld_inst->effAddr >> depCheckShift; - Addr ld_eff_addr2 = - (ld_inst->effAddr + ld_inst->effSize - 1) >> depCheckShift; - - if (inst_eff_addr2 >= ld_eff_addr1 && inst_eff_addr1 <= ld_eff_addr2) { - if (inst->isLoad()) { - // If this load is to the same block as an external snoop - // invalidate that we've observed then the load needs to be - // squashed as it could have newer data - if (ld_inst->hitExternalSnoop()) { - if (!memDepViolator || - ld_inst->seqNum < memDepViolator->seqNum) { - DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] " - "and [sn:%lli] at address %#x\n", - inst->seqNum, ld_inst->seqNum, ld_eff_addr1); - memDepViolator = ld_inst; - - ++stats.memOrderViolation; - - return std::make_shared( - "Detected fault with inst [sn:%lli] and " - "[sn:%lli] at address %#x\n", - inst->seqNum, ld_inst->seqNum, ld_eff_addr1); - } - } - - // Otherwise, mark the load has a possible load violation - // and if we see a snoop before it's commited, we need to squash - ld_inst->possibleLoadViolation(true); - DPRINTF(LSQUnit, "Found possible load violation at addr: %#x" - " between instructions [sn:%lli] and [sn:%lli]\n", - inst_eff_addr1, inst->seqNum, ld_inst->seqNum); - } else { - // A load/store incorrectly passed this store. - // Check if we already have a violator, or if it's newer - // squash and refetch. 
- if (memDepViolator && ld_inst->seqNum > memDepViolator->seqNum) - break; - - DPRINTF(LSQUnit, "Detected fault with inst [sn:%lli] and " - "[sn:%lli] at address %#x\n", - inst->seqNum, ld_inst->seqNum, ld_eff_addr1); - memDepViolator = ld_inst; - - ++stats.memOrderViolation; - - return std::make_shared( - "Detected fault with " - "inst [sn:%lli] and [sn:%lli] at address %#x\n", - inst->seqNum, ld_inst->seqNum, ld_eff_addr1); - } - } - - ++loadIt; - } - return NoFault; -} - - - - -template -Fault -LSQUnit::executeLoad(const O3DynInstPtr &inst) -{ - // Execute a specific load. - Fault load_fault = NoFault; - - DPRINTF(LSQUnit, "Executing load PC %s, [sn:%lli]\n", - inst->pcState(), inst->seqNum); - - assert(!inst->isSquashed()); - - load_fault = inst->initiateAcc(); - - if (load_fault == NoFault && !inst->readMemAccPredicate()) { - assert(inst->readPredicate()); - inst->setExecuted(); - inst->completeAcc(nullptr); - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); - return NoFault; - } - - if (inst->isTranslationDelayed() && load_fault == NoFault) - return load_fault; - - if (load_fault != NoFault && inst->translationCompleted() && - inst->savedReq->isPartialFault() && !inst->savedReq->isComplete()) { - assert(inst->savedReq->isSplit()); - // If we have a partial fault where the mem access is not complete yet - // then the cache must have been blocked. This load will be re-executed - // when the cache gets unblocked. We will handle the fault when the - // mem access is complete. - return NoFault; - } - - // If the instruction faulted or predicated false, then we need to send it - // along to commit without the instruction completing. - if (load_fault != NoFault || !inst->readPredicate()) { - // Send this instruction to commit, also make sure iew stage - // realizes there is activity. Mark it as executed unless it - // is a strictly ordered load that needs to hit the head of - // commit. 
- if (!inst->readPredicate()) - inst->forwardOldRegs(); - DPRINTF(LSQUnit, "Load [sn:%lli] not executed from %s\n", - inst->seqNum, - (load_fault != NoFault ? "fault" : "predication")); - if (!(inst->hasRequest() && inst->strictlyOrdered()) || - inst->isAtCommit()) { - inst->setExecuted(); - } - iewStage->instToCommit(inst); - iewStage->activityThisCycle(); - } else { - if (inst->effAddrValid()) { - auto it = inst->lqIt; - ++it; - - if (checkLoads) - return checkViolations(it, inst); - } - } - - return load_fault; -} - -template -Fault -LSQUnit::executeStore(const O3DynInstPtr &store_inst) -{ - // Make sure that a store exists. - assert(stores != 0); - - int store_idx = store_inst->sqIdx; - - DPRINTF(LSQUnit, "Executing store PC %s [sn:%lli]\n", - store_inst->pcState(), store_inst->seqNum); - - assert(!store_inst->isSquashed()); - - // Check the recently completed loads to see if any match this store's - // address. If so, then we have a memory ordering violation. - typename LoadQueue::iterator loadIt = store_inst->lqIt; - - Fault store_fault = store_inst->initiateAcc(); - - if (store_inst->isTranslationDelayed() && - store_fault == NoFault) - return store_fault; - - if (!store_inst->readPredicate()) { - DPRINTF(LSQUnit, "Store [sn:%lli] not executed from predication\n", - store_inst->seqNum); - store_inst->forwardOldRegs(); - return store_fault; - } - - if (storeQueue[store_idx].size() == 0) { - DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n", - store_inst->pcState(), store_inst->seqNum); - - return store_fault; - } - - assert(store_fault == NoFault); - - if (store_inst->isStoreConditional() || store_inst->isAtomic()) { - // Store conditionals and Atomics need to set themselves as able to - // writeback if we haven't had a fault by here. 
- storeQueue[store_idx].canWB() = true; - - ++storesToWB; - } - - return checkViolations(loadIt, store_inst); - -} - -template -void -LSQUnit::commitLoad() -{ - assert(loadQueue.front().valid()); - - DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n", - loadQueue.front().instruction()->pcState()); - - loadQueue.front().clear(); - loadQueue.pop_front(); - - --loads; -} - -template -void -LSQUnit::commitLoads(InstSeqNum &youngest_inst) -{ - assert(loads == 0 || loadQueue.front().valid()); - - while (loads != 0 && loadQueue.front().instruction()->seqNum - <= youngest_inst) { - commitLoad(); - } -} - -template -void -LSQUnit::commitStores(InstSeqNum &youngest_inst) -{ - assert(stores == 0 || storeQueue.front().valid()); - - /* Forward iterate the store queue (age order). */ - for (auto& x : storeQueue) { - assert(x.valid()); - // Mark any stores that are now committed and have not yet - // been marked as able to write back. - if (!x.canWB()) { - if (x.instruction()->seqNum > youngest_inst) { - break; - } - DPRINTF(LSQUnit, "Marking store as able to write back, PC " - "%s [sn:%lli]\n", - x.instruction()->pcState(), - x.instruction()->seqNum); - - x.canWB() = true; - - ++storesToWB; - } - } -} - -template -void -LSQUnit::writebackBlockedStore() -{ - assert(isStoreBlocked); - storeWBIt->request()->sendPacketToCache(); - if (storeWBIt->request()->isSent()){ - storePostSend(); - } -} - -template -void -LSQUnit::writebackStores() -{ - if (isStoreBlocked) { - DPRINTF(LSQUnit, "Writing back blocked store\n"); - writebackBlockedStore(); - } - - while (storesToWB > 0 && - storeWBIt.dereferenceable() && - storeWBIt->valid() && - storeWBIt->canWB() && - ((!needsTSO) || (!storeInFlight)) && - lsq->cachePortAvailable(false)) { - - if (isStoreBlocked) { - DPRINTF(LSQUnit, "Unable to write back any more stores, cache" - " is blocked!\n"); - break; - } - - // Store didn't write any data so no need to write it back to - // memory. 
- if (storeWBIt->size() == 0) { - /* It is important that the preincrement happens at (or before) - * the call, as the the code of completeStore checks - * storeWBIt. */ - completeStore(storeWBIt++); - continue; - } - - if (storeWBIt->instruction()->isDataPrefetch()) { - storeWBIt++; - continue; - } - - assert(storeWBIt->hasRequest()); - assert(!storeWBIt->committed()); - - O3DynInstPtr inst = storeWBIt->instruction(); - LSQRequest* req = storeWBIt->request(); - - // Process store conditionals or store release after all previous - // stores are completed - if ((req->mainRequest()->isLLSC() || - req->mainRequest()->isRelease()) && - (storeWBIt.idx() != storeQueue.head())) { - DPRINTF(LSQUnit, "Store idx:%i PC:%s to Addr:%#x " - "[sn:%lli] is %s%s and not head of the queue\n", - storeWBIt.idx(), inst->pcState(), - req->request()->getPaddr(), inst->seqNum, - req->mainRequest()->isLLSC() ? "SC" : "", - req->mainRequest()->isRelease() ? "/Release" : ""); - break; - } - - storeWBIt->committed() = true; - - assert(!inst->memData); - inst->memData = new uint8_t[req->_size]; - - if (storeWBIt->isAllZeros()) - memset(inst->memData, 0, req->_size); - else - memcpy(inst->memData, storeWBIt->data(), req->_size); - - - if (req->senderState() == nullptr) { - SQSenderState *state = new SQSenderState(storeWBIt); - state->isLoad = false; - state->needWB = false; - state->inst = inst; - - req->senderState(state); - if (inst->isStoreConditional() || inst->isAtomic()) { - /* Only store conditionals and atomics need a writeback. */ - state->needWB = true; - } - } - req->buildPackets(); - - DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%s " - "to Addr:%#x, data:%#x [sn:%lli]\n", - storeWBIt.idx(), inst->pcState(), - req->request()->getPaddr(), (int)*(inst->memData), - inst->seqNum); - - // @todo: Remove this SC hack once the memory system handles it. - if (inst->isStoreConditional()) { - // Disable recording the result temporarily. 
Writing to - // misc regs normally updates the result, but this is not - // the desired behavior when handling store conditionals. - inst->recordResult(false); - bool success = TheISA::handleLockedWrite(inst.get(), - req->request(), cacheBlockMask); - inst->recordResult(true); - req->packetSent(); - - if (!success) { - req->complete(); - // Instantly complete this store. - DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. " - "Instantly completing it.\n", - inst->seqNum); - PacketPtr new_pkt = new Packet(*req->packet()); - WritebackEvent *wb = new WritebackEvent(inst, - new_pkt, this); - cpu->schedule(wb, curTick() + 1); - completeStore(storeWBIt); - if (!storeQueue.empty()) - storeWBIt++; - else - storeWBIt = storeQueue.end(); - continue; - } - } - - if (req->request()->isLocalAccess()) { - assert(!inst->isStoreConditional()); - assert(!inst->inHtmTransactionalState()); - ThreadContext *thread = cpu->tcBase(lsqID); - PacketPtr main_pkt = new Packet(req->mainRequest(), - MemCmd::WriteReq); - main_pkt->dataStatic(inst->memData); - req->request()->localAccessor(thread, main_pkt); - delete main_pkt; - completeStore(storeWBIt); - storeWBIt++; - continue; - } - /* Send to cache */ - req->sendPacketToCache(); - - /* If successful, do the post send */ - if (req->isSent()) { - storePostSend(); - } else { - DPRINTF(LSQUnit, "D-Cache became blocked when writing [sn:%lli], " - "will retry later\n", - inst->seqNum); - } - } - assert(stores >= 0 && storesToWB >= 0); -} - -template -void -LSQUnit::squash(const InstSeqNum &squashed_num) -{ - DPRINTF(LSQUnit, "Squashing until [sn:%lli]!" 
- "(Loads:%i Stores:%i)\n", squashed_num, loads, stores); - - while (loads != 0 && - loadQueue.back().instruction()->seqNum > squashed_num) { - DPRINTF(LSQUnit,"Load Instruction PC %s squashed, " - "[sn:%lli]\n", - loadQueue.back().instruction()->pcState(), - loadQueue.back().instruction()->seqNum); - - if (isStalled() && loadQueue.tail() == stallingLoadIdx) { - stalled = false; - stallingStoreIsn = 0; - stallingLoadIdx = 0; - } - - // hardware transactional memory - // Squashing instructions can alter the transaction nesting depth - // and must be corrected before fetching resumes. - if (loadQueue.back().instruction()->isHtmStart()) - { - htmStarts = (--htmStarts < 0) ? 0 : htmStarts; - DPRINTF(HtmCpu, ">> htmStarts-- (%d) : htmStops (%d)\n", - htmStarts, htmStops); - } - if (loadQueue.back().instruction()->isHtmStop()) - { - htmStops = (--htmStops < 0) ? 0 : htmStops; - DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n", - htmStarts, htmStops); - } - // Clear the smart pointer to make sure it is decremented. 
- loadQueue.back().instruction()->setSquashed(); - loadQueue.back().clear(); - - --loads; - - loadQueue.pop_back(); - ++stats.squashedLoads; - } - - // hardware transactional memory - // scan load queue (from oldest to youngest) for most recent valid htmUid - auto scan_it = loadQueue.begin(); - uint64_t in_flight_uid = 0; - while (scan_it != loadQueue.end()) { - if (scan_it->instruction()->isHtmStart() && - !scan_it->instruction()->isSquashed()) { - in_flight_uid = scan_it->instruction()->getHtmTransactionUid(); - DPRINTF(HtmCpu, "loadQueue[%d]: found valid HtmStart htmUid=%u\n", - scan_it._idx, in_flight_uid); - } - scan_it++; - } - // If there's a HtmStart in the pipeline then use its htmUid, - // otherwise use the most recently committed uid - const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); - if (htm_cpt) { - const uint64_t old_local_htm_uid = htm_cpt->getHtmUid(); - uint64_t new_local_htm_uid; - if (in_flight_uid > 0) - new_local_htm_uid = in_flight_uid; - else - new_local_htm_uid = lastRetiredHtmUid; - - if (old_local_htm_uid != new_local_htm_uid) { - DPRINTF(HtmCpu, "flush: lastRetiredHtmUid=%u\n", - lastRetiredHtmUid); - DPRINTF(HtmCpu, "flush: resetting localHtmUid=%u\n", - new_local_htm_uid); - - htm_cpt->setHtmUid(new_local_htm_uid); - } - } - - if (memDepViolator && squashed_num < memDepViolator->seqNum) { - memDepViolator = NULL; - } - - while (stores != 0 && - storeQueue.back().instruction()->seqNum > squashed_num) { - // Instructions marked as can WB are already committed. - if (storeQueue.back().canWB()) { - break; - } - - DPRINTF(LSQUnit,"Store Instruction PC %s squashed, " - "idx:%i [sn:%lli]\n", - storeQueue.back().instruction()->pcState(), - storeQueue.tail(), storeQueue.back().instruction()->seqNum); - - // I don't think this can happen. It should have been cleared - // by the stalling load. 
- if (isStalled() && - storeQueue.back().instruction()->seqNum == stallingStoreIsn) { - panic("Is stalled should have been cleared by stalling load!\n"); - stalled = false; - stallingStoreIsn = 0; - } - - // Clear the smart pointer to make sure it is decremented. - storeQueue.back().instruction()->setSquashed(); - - // Must delete request now that it wasn't handed off to - // memory. This is quite ugly. @todo: Figure out the proper - // place to really handle request deletes. - storeQueue.back().clear(); - --stores; - - storeQueue.pop_back(); - ++stats.squashedStores; - } -} - -template -void -LSQUnit::storePostSend() -{ - if (isStalled() && - storeWBIt->instruction()->seqNum == stallingStoreIsn) { - DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " - "load idx:%i\n", - stallingStoreIsn, stallingLoadIdx); - stalled = false; - stallingStoreIsn = 0; - iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction()); - } - - if (!storeWBIt->instruction()->isStoreConditional()) { - // The store is basically completed at this time. This - // only works so long as the checker doesn't try to - // verify the value in memory for stores. - storeWBIt->instruction()->setCompleted(); - - if (cpu->checker) { - cpu->checker->verify(storeWBIt->instruction()); - } - } - - if (needsTSO) { - storeInFlight = true; - } - - storeWBIt++; -} - -template -void -LSQUnit::writeback(const O3DynInstPtr &inst, PacketPtr pkt) -{ - iewStage->wakeCPU(); - - // Squashed instructions do not need to complete their access. - if (inst->isSquashed()) { - assert (!inst->isStore() || inst->isStoreConditional()); - ++stats.ignoredResponses; - return; - } - - if (!inst->isExecuted()) { - inst->setExecuted(); - - if (inst->fault == NoFault) { - // Complete access to copy data to proper place. - inst->completeAcc(pkt); - } else { - // If the instruction has an outstanding fault, we cannot complete - // the access as this discards the current fault. 
- - // If we have an outstanding fault, the fault should only be of - // type ReExec or - in case of a SplitRequest - a partial - // translation fault - - // Unless it's a hardware transactional memory fault - auto htm_fault = std::dynamic_pointer_cast< - GenericHtmFailureFault>(inst->fault); - - if (!htm_fault) { - assert(dynamic_cast(inst->fault.get()) != nullptr || - inst->savedReq->isPartialFault()); - - } else if (!pkt->htmTransactionFailedInCache()) { - // Situation in which the instruction has a hardware transactional - // memory fault but not the packet itself. This can occur with - // ldp_uop microops since access is spread over multiple packets. - DPRINTF(HtmCpu, - "%s writeback with HTM failure fault, " - "however, completing packet is not aware of " - "transaction failure. cause=%s htmUid=%u\n", - inst->staticInst->getName(), - htmFailureToStr(htm_fault->getHtmFailureFaultCause()), - htm_fault->getHtmUid()); - } - - DPRINTF(LSQUnit, "Not completing instruction [sn:%lli] access " - "due to pending fault.\n", inst->seqNum); - } - } - - // Need to insert instruction into queue to commit - iewStage->instToCommit(inst); - - iewStage->activityThisCycle(); - - // see if this load changed the PC - iewStage->checkMisprediction(inst); -} - -template -void -LSQUnit::completeStore(typename StoreQueue::iterator store_idx) -{ - assert(store_idx->valid()); - store_idx->completed() = true; - --storesToWB; - // A bit conservative because a store completion may not free up entries, - // but hopefully avoids two store completions in one cycle from making - // the CPU tick twice. - cpu->wakeCPU(); - cpu->activityThisCycle(); - - /* We 'need' a copy here because we may clear the entry from the - * store queue. 
*/ - O3DynInstPtr store_inst = store_idx->instruction(); - if (store_idx == storeQueue.begin()) { - do { - storeQueue.front().clear(); - storeQueue.pop_front(); - --stores; - } while (storeQueue.front().completed() && - !storeQueue.empty()); - - iewStage->updateLSQNextCycle = true; - } - - DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " - "idx:%i\n", - store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1); - -#if TRACING_ON - if (Debug::O3PipeView) { - store_inst->storeTick = - curTick() - store_inst->fetchTick; - } -#endif - - if (isStalled() && - store_inst->seqNum == stallingStoreIsn) { - DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " - "load idx:%i\n", - stallingStoreIsn, stallingLoadIdx); - stalled = false; - stallingStoreIsn = 0; - iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction()); - } - - store_inst->setCompleted(); - - if (needsTSO) { - storeInFlight = false; - } - - // Tell the checker we've completed this instruction. Some stores - // may get reported twice to the checker, but the checker can - // handle that case. 
- // Store conditionals cannot be sent to the checker yet, they have - // to update the misc registers first which should take place - // when they commit - if (cpu->checker && !store_inst->isStoreConditional()) { - cpu->checker->verify(store_inst); - } -} - -template -bool -LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt) -{ - bool ret = true; - bool cache_got_blocked = false; - - auto state = dynamic_cast(data_pkt->senderState); - - if (!lsq->cacheBlocked() && - lsq->cachePortAvailable(isLoad)) { - if (!dcachePort->sendTimingReq(data_pkt)) { - ret = false; - cache_got_blocked = true; - } - } else { - ret = false; - } - - if (ret) { - if (!isLoad) { - isStoreBlocked = false; - } - lsq->cachePortBusy(isLoad); - state->outstanding++; - state->request()->packetSent(); - } else { - if (cache_got_blocked) { - lsq->cacheBlocked(true); - ++stats.blockedByCache; - } - if (!isLoad) { - assert(state->request() == storeWBIt->request()); - isStoreBlocked = true; - } - state->request()->packetNotSent(); - } - return ret; -} - -template -void -LSQUnit::recvRetry() -{ - if (isStoreBlocked) { - DPRINTF(LSQUnit, "Receiving retry: blocked store\n"); - writebackBlockedStore(); - } -} - -template -void -LSQUnit::dumpInsts() const -{ - cprintf("Load store queue: Dumping instructions.\n"); - cprintf("Load queue size: %i\n", loads); - cprintf("Load queue: "); - - for (const auto& e: loadQueue) { - const O3DynInstPtr &inst(e.instruction()); - cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum); - } - cprintf("\n"); - - cprintf("Store queue size: %i\n", stores); - cprintf("Store queue: "); - - for (const auto& e: storeQueue) { - const O3DynInstPtr &inst(e.instruction()); - cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum); - } - - cprintf("\n"); -} - -template -unsigned int -LSQUnit::cacheLineSize() -{ - return cpu->cacheLineSize(); -} - -template -Fault -LSQUnit::read(LSQRequest *req, int load_idx) -{ - LQEntry& load_req = loadQueue[load_idx]; - const O3DynInstPtr& 
load_inst = load_req.instruction(); - - load_req.setRequest(req); - assert(load_inst); - - assert(!load_inst->isExecuted()); - - // Make sure this isn't a strictly ordered load - // A bit of a hackish way to get strictly ordered accesses to work - // only if they're at the head of the LSQ and are ready to commit - // (at the head of the ROB too). - - if (req->mainRequest()->isStrictlyOrdered() && - (load_idx != loadQueue.head() || !load_inst->isAtCommit())) { - // Tell IQ/mem dep unit that this instruction will need to be - // rescheduled eventually - iewStage->rescheduleMemInst(load_inst); - load_inst->clearIssued(); - load_inst->effAddrValid(false); - ++stats.rescheduledLoads; - DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", - load_inst->seqNum, load_inst->pcState()); - - // Must delete request now that it wasn't handed off to - // memory. This is quite ugly. @todo: Figure out the proper - // place to really handle request deletes. - load_req.setRequest(nullptr); - req->discard(); - return std::make_shared( - "Strictly ordered load [sn:%llx] PC %s\n", - load_inst->seqNum, load_inst->pcState()); - } - - DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, " - "storeHead: %i addr: %#x%s\n", - load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1, - req->mainRequest()->getPaddr(), req->isSplit() ? " split" : ""); - - if (req->mainRequest()->isLLSC()) { - // Disable recording the result temporarily. Writing to misc - // regs normally updates the result, but this is not the - // desired behavior when handling store conditionals. 
- load_inst->recordResult(false); - TheISA::handleLockedRead(load_inst.get(), req->mainRequest()); - load_inst->recordResult(true); - } - - if (req->mainRequest()->isLocalAccess()) { - assert(!load_inst->memData); - assert(!load_inst->inHtmTransactionalState()); - load_inst->memData = new uint8_t[MaxDataBytes]; - - ThreadContext *thread = cpu->tcBase(lsqID); - PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq); - - main_pkt->dataStatic(load_inst->memData); - - Cycles delay = req->mainRequest()->localAccessor(thread, main_pkt); - - WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); - cpu->schedule(wb, cpu->clockEdge(delay)); - return NoFault; - } - - // hardware transactional memory - if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit()) - { - // don't want to send nested transactionStarts and - // transactionStops outside of core, e.g. to Ruby - if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) { - Cycles delay(0); - PacketPtr data_pkt = - new Packet(req->mainRequest(), MemCmd::ReadReq); - - // Allocate memory if this is the first time a load is issued. 
- if (!load_inst->memData) { - load_inst->memData = - new uint8_t[req->mainRequest()->getSize()]; - // sanity checks espect zero in request's data - memset(load_inst->memData, 0, req->mainRequest()->getSize()); - } - - data_pkt->dataStatic(load_inst->memData); - if (load_inst->inHtmTransactionalState()) { - data_pkt->setHtmTransactional( - load_inst->getHtmTransactionUid()); - } - data_pkt->makeResponse(); - - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); - cpu->schedule(wb, cpu->clockEdge(delay)); - return NoFault; - } - } - - // Check the SQ for any previous stores that might lead to forwarding - auto store_it = load_inst->sqIt; - assert (store_it >= storeWBIt); - // End once we've reached the top of the LSQ - while (store_it != storeWBIt) { - // Move the index to one younger - store_it--; - assert(store_it->valid()); - assert(store_it->instruction()->seqNum < load_inst->seqNum); - int store_size = store_it->size(); - - // Cache maintenance instructions go down via the store - // path but they carry no data and they shouldn't be - // considered for forwarding - if (store_size != 0 && !store_it->instruction()->strictlyOrdered() && - !(store_it->request()->mainRequest() && - store_it->request()->mainRequest()->isCacheMaintenance())) { - assert(store_it->instruction()->effAddrValid()); - - // Check if the store data is within the lower and upper bounds of - // addresses that the request needs. 
- auto req_s = req->mainRequest()->getVaddr(); - auto req_e = req_s + req->mainRequest()->getSize(); - auto st_s = store_it->instruction()->effAddr; - auto st_e = st_s + store_size; - - bool store_has_lower_limit = req_s >= st_s; - bool store_has_upper_limit = req_e <= st_e; - bool lower_load_has_store_part = req_s < st_e; - bool upper_load_has_store_part = req_e > st_s; - - auto coverage = AddrRangeCoverage::NoAddrRangeCoverage; - - // If the store entry is not atomic (atomic does not have valid - // data), the store has all of the data needed, and - // the load is not LLSC, then - // we can forward data from the store to the load - if (!store_it->instruction()->isAtomic() && - store_has_lower_limit && store_has_upper_limit && - !req->mainRequest()->isLLSC()) { - - const auto& store_req = store_it->request()->mainRequest(); - coverage = store_req->isMasked() ? - AddrRangeCoverage::PartialAddrRangeCoverage : - AddrRangeCoverage::FullAddrRangeCoverage; - } else if ( - // This is the partial store-load forwarding case where a store - // has only part of the load's data and the load isn't LLSC - (!req->mainRequest()->isLLSC() && - ((store_has_lower_limit && lower_load_has_store_part) || - (store_has_upper_limit && upper_load_has_store_part) || - (lower_load_has_store_part && upper_load_has_store_part))) || - // The load is LLSC, and the store has all or part of the - // load's data - (req->mainRequest()->isLLSC() && - ((store_has_lower_limit || upper_load_has_store_part) && - (store_has_upper_limit || lower_load_has_store_part))) || - // The store entry is atomic and has all or part of the load's - // data - (store_it->instruction()->isAtomic() && - ((store_has_lower_limit || upper_load_has_store_part) && - (store_has_upper_limit || lower_load_has_store_part)))) { - - coverage = AddrRangeCoverage::PartialAddrRangeCoverage; - } - - if (coverage == AddrRangeCoverage::FullAddrRangeCoverage) { - // Get shift amount for offset into the store's data. 
- int shift_amt = req->mainRequest()->getVaddr() - - store_it->instruction()->effAddr; - - // Allocate memory if this is the first time a load is issued. - if (!load_inst->memData) { - load_inst->memData = - new uint8_t[req->mainRequest()->getSize()]; - } - if (store_it->isAllZeros()) - memset(load_inst->memData, 0, - req->mainRequest()->getSize()); - else - memcpy(load_inst->memData, - store_it->data() + shift_amt, - req->mainRequest()->getSize()); - - DPRINTF(LSQUnit, "Forwarding from store idx %i to load to " - "addr %#x\n", store_it._idx, - req->mainRequest()->getVaddr()); - - PacketPtr data_pkt = new Packet(req->mainRequest(), - MemCmd::ReadReq); - data_pkt->dataStatic(load_inst->memData); - - // hardware transactional memory - // Store to load forwarding within a transaction - // This should be okay because the store will be sent to - // the memory subsystem and subsequently get added to the - // write set of the transaction. The write set has a stronger - // property than the read set, so the load doesn't necessarily - // have to be there. - assert(!req->mainRequest()->isHTMCmd()); - if (load_inst->inHtmTransactionalState()) { - assert (!storeQueue[store_it._idx].completed()); - assert ( - storeQueue[store_it._idx].instruction()-> - inHtmTransactionalState()); - assert ( - load_inst->getHtmTransactionUid() == - storeQueue[store_it._idx].instruction()-> - getHtmTransactionUid()); - data_pkt->setHtmTransactional( - load_inst->getHtmTransactionUid()); - DPRINTF(HtmCpu, "HTM LD (ST2LDF) " - "pc=0x%lx - vaddr=0x%lx - " - "paddr=0x%lx - htmUid=%u\n", - load_inst->instAddr(), - data_pkt->req->hasVaddr() ? - data_pkt->req->getVaddr() : 0lu, - data_pkt->getAddr(), - load_inst->getHtmTransactionUid()); - } - - if (req->isAnyOutstandingRequest()) { - assert(req->_numOutstandingPackets > 0); - // There are memory requests packets in flight already. - // This may happen if the store was not complete the - // first time this load got executed. 
Signal the senderSate - // that response packets should be discarded. - req->discardSenderState(); - } - - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - - // We'll say this has a 1 cycle load-store forwarding latency - // for now. - // @todo: Need to make this a parameter. - cpu->schedule(wb, curTick()); - - // Don't need to do anything special for split loads. - ++stats.forwLoads; - - return NoFault; - } else if ( - coverage == AddrRangeCoverage::PartialAddrRangeCoverage) { - // If it's already been written back, then don't worry about - // stalling on it. - if (store_it->completed()) { - panic("Should not check one of these"); - continue; - } - - // Must stall load and force it to retry, so long as it's the - // oldest load that needs to do so. - if (!stalled || - (stalled && - load_inst->seqNum < - loadQueue[stallingLoadIdx].instruction()->seqNum)) { - stalled = true; - stallingStoreIsn = store_it->instruction()->seqNum; - stallingLoadIdx = load_idx; - } - - // Tell IQ/mem dep unit that this instruction will need to be - // rescheduled eventually - iewStage->rescheduleMemInst(load_inst); - load_inst->clearIssued(); - load_inst->effAddrValid(false); - ++stats.rescheduledLoads; - - // Do not generate a writeback event as this instruction is not - // complete. - DPRINTF(LSQUnit, "Load-store forwarding mis-match. " - "Store idx %i to load addr %#x\n", - store_it._idx, req->mainRequest()->getVaddr()); - - // Must discard the request. - req->discard(); - load_req.setRequest(nullptr); - return NoFault; - } - } - } - - // If there's no forwarding case, then go access memory - DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n", - load_inst->seqNum, load_inst->pcState()); - - // Allocate memory if this is the first time a load is issued. 
- if (!load_inst->memData) { - load_inst->memData = new uint8_t[req->mainRequest()->getSize()]; - } - - - // hardware transactional memory - if (req->mainRequest()->isHTMCmd()) { - // this is a simple sanity check - // the Ruby cache controller will set - // memData to 0x0ul if successful. - *load_inst->memData = (uint64_t) 0x1ull; - } - - // For now, load throughput is constrained by the number of - // load FUs only, and loads do not consume a cache port (only - // stores do). - // @todo We should account for cache port contention - // and arbitrate between loads and stores. - - // if we the cache is not blocked, do cache access - if (req->senderState() == nullptr) { - LQSenderState *state = new LQSenderState( - loadQueue.getIterator(load_idx)); - state->isLoad = true; - state->inst = load_inst; - state->isSplit = req->isSplit(); - req->senderState(state); - } - req->buildPackets(); - req->sendPacketToCache(); - if (!req->isSent()) - iewStage->blockMemInst(load_inst); - - return NoFault; -} - -template -Fault -LSQUnit::write(LSQRequest *req, uint8_t *data, int store_idx) -{ - assert(storeQueue[store_idx].valid()); - - DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i " - "[sn:%llu]\n", - store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1, - storeQueue[store_idx].instruction()->seqNum); - - storeQueue[store_idx].setRequest(req); - unsigned size = req->_size; - storeQueue[store_idx].size() = size; - bool store_no_data = - req->mainRequest()->getFlags() & Request::STORE_NO_DATA; - storeQueue[store_idx].isAllZeros() = store_no_data; - assert(size <= SQEntry::DataSize || store_no_data); - - // copy data into the storeQueue only if the store request has valid data - if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) && - !req->request()->isCacheMaintenance() && - !req->request()->isAtomic()) - memcpy(storeQueue[store_idx].data(), data, size); - - // This function only writes the data to the store queue, so no fault - // can 
happen here. - return NoFault; -} - -#endif//__CPU_O3_LSQ_UNIT_IMPL_HH__