cpu: Add first-/non-faulting load support to Minor and O3

Some architectures allow masking faults of memory load instructions in some specific circumstances (e.g. first-faulting and non-faulting loads in Arm SVE). This patch adds support for such loads in the Minor and O3 CPU models. Change-Id: I264a81a078f049127779aa834e89f0e693ba0bea Signed-off-by: Gabor Dozsa <gabor.dozsa@arm.com> Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19178 Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com> Maintainer: Andreas Sandberg <andreas.sandberg@arm.com> Tested-by: kokoro <noreply+kokoro@google.com>
2019-02-27 17:26:56 +00:00
parent 7652b2f12c
commit 46da8fb805
9 changed files with 188 additions and 65 deletions
--- a/src/cpu/minor/dyn_inst.cc
+++ b/src/cpu/minor/dyn_inst.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, 2016 ARM Limited
+ * Copyright (c) 2013-2014, 2016,2018 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
@@ -108,6 +108,8 @@ MinorDynInst::reportData(std::ostream &os) const
        os << "-";
    else if (isFault())
        os << "F;" << id;
    else if (translationFault != NoFault)
        os << "TF;" << id;
    else
        os << id;
 }
@@ -120,6 +122,8 @@ operator <<(std::ostream &os, const MinorDynInst &inst)
    if (inst.isFault())
        os << "fault: \"" << inst.fault->name() << '"';
    else if (inst.translationFault != NoFault)
        os << "translation fault: \"" << inst.translationFault->name() << '"';
    else if (inst.staticInst)
        os << inst.staticInst->getName();
    else
--- a/src/cpu/minor/dyn_inst.hh
+++ b/src/cpu/minor/dyn_inst.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014,2018 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
@@ -194,6 +194,9 @@ class MinorDynInst : public RefCounted
    /** This instruction is in the LSQ, not a functional unit */
    bool inLSQ;
    /** Translation fault in case of a mem ref */
    Fault translationFault;
    /** The instruction has been sent to the store buffer */
    bool inStoreBuffer;
@@ -233,9 +236,9 @@ class MinorDynInst : public RefCounted
        staticInst(NULL), id(id_), traceData(NULL),
        pc(TheISA::PCState(0)), fault(fault_),
        triedToPredict(false), predictedTaken(false),
-        fuIndex(0), inLSQ(false), inStoreBuffer(false),
+        fuIndex(0), inLSQ(false), translationFault(NoFault),
-        canEarlyIssue(false), predicate(true), memAccPredicate(true),
+        inStoreBuffer(false), canEarlyIssue(false), predicate(true),
-        instToWaitFor(0), extraCommitDelay(Cycles(0)),
+        memAccPredicate(true), instToWaitFor(0), extraCommitDelay(Cycles(0)),
        extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0))
    { }
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -116,9 +116,8 @@ class ExecContext : public ::ExecContext
                    const std::vector<bool>& byteEnable = std::vector<bool>())
        override
    {
-        execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
+        return execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
            size, addr, flags, nullptr, nullptr, byteEnable);
        return NoFault;
    }
    Fault
@@ -128,9 +127,8 @@ class ExecContext : public ::ExecContext
        override
    {
        assert(byteEnable.empty() || byteEnable.size() == size);
-        execute.getLSQ().pushRequest(inst, false /* store */, data,
+        return execute.getLSQ().pushRequest(inst, false /* store */, data,
            size, addr, flags, res, nullptr, byteEnable);
        return NoFault;
    }
    Fault
@@ -138,9 +136,8 @@ class ExecContext : public ::ExecContext
                   AtomicOpFunctor *amo_op) override
    {
        // AMO requests are pushed through the store path
-        execute.getLSQ().pushRequest(inst, false /* amo */, nullptr,
+        return execute.getLSQ().pushRequest(inst, false /* amo */, nullptr,
            size, addr, flags, nullptr, amo_op);
        return NoFault;
    }
    RegVal
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -337,19 +337,19 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
     *  context predicate, otherwise, it will be set to false */
    bool use_context_predicate = true;
-    if (response->fault != NoFault) {
+    if (inst->translationFault != NoFault) {
        /* Invoke memory faults. */
        DPRINTF(MinorMem, "Completing fault from DTLB access: %s\n",
-            response->fault->name());
+            inst->translationFault->name());
        if (inst->staticInst->isPrefetch()) {
            DPRINTF(MinorMem, "Not taking fault on prefetch: %s\n",
-                response->fault->name());
+                inst->translationFault->name());
            /* Don't assign to fault */
        } else {
            /* Take the fault raised during the TLB/memory access */
-            fault = response->fault;
+            fault = inst->translationFault;
            fault->invoke(thread, inst->staticInst);
        }
@@ -469,6 +469,18 @@ Execute::executeMemRefInst(MinorDynInstPtr inst, BranchData &branch,
        Fault init_fault = inst->staticInst->initiateAcc(&context,
            inst->traceData);
        if (inst->inLSQ) {
            if (init_fault != NoFault) {
                assert(inst->translationFault != NoFault);
                // Translation faults are dealt with in handleMemResponse()
                init_fault = NoFault;
            } else {
                // If we have a translation fault then it got suppressed  by
                // initateAcc()
                inst->translationFault = NoFault;
            }
        }
        if (init_fault != NoFault) {
            DPRINTF(MinorExecute, "Fault on memory inst: %s"
                " initiateAcc: %s\n", *inst, init_fault->name());
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -65,15 +65,51 @@ LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
    data(data_),
    packet(NULL),
    request(),
    fault(NoFault),
    res(res_),
    skipped(false),
    issuedToMemory(false),
    isTranslationDelayed(false),
    state(NotIssued)
 {
    request = std::make_shared<Request>();
 }
 void
 LSQ::LSQRequest::tryToSuppressFault()
 {
    SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
    TheISA::PCState old_pc = thread.pcState();
    ExecContext context(port.cpu, thread, port.execute, inst);
    Fault M5_VAR_USED fault = inst->translationFault;
    // Give the instruction a chance to suppress a translation fault
    inst->translationFault = inst->staticInst->initiateAcc(&context, nullptr);
    if (inst->translationFault == NoFault) {
        DPRINTFS(MinorMem, (&port),
                 "Translation fault suppressed for inst:%s\n", *inst);
    } else {
        assert(inst->translationFault == fault);
    }
    thread.pcState(old_pc);
 }
 void
 LSQ::LSQRequest::completeDisabledMemAccess()
 {
    DPRINTFS(MinorMem, (&port), "Complete disabled mem access for inst:%s\n",
             *inst);
    SimpleThread &thread = *port.cpu.threads[inst->id.threadId];
    TheISA::PCState old_pc = thread.pcState();
    ExecContext context(port.cpu, thread, port.execute, inst);
    context.setMemAccPredicate(false);
    inst->staticInst->completeAcc(nullptr, &context, inst->traceData);
    thread.pcState(old_pc);
 }
 void
 LSQ::LSQRequest::disableMemAccess()
 {
@@ -227,16 +263,26 @@ void
 LSQ::SingleDataRequest::finish(const Fault &fault_, const RequestPtr &request_,
                               ThreadContext *tc, BaseTLB::Mode mode)
 {
    fault = fault_;
    port.numAccessesInDTLB--;
    DPRINTFS(MinorMem, (&port), "Received translation response for"
-        " request: %s\n", *inst);
+             " request: %s delayed:%d %s\n", *inst, isTranslationDelayed,
             fault_ != NoFault ? fault_->name() : "");
-    makePacket();
+    if (fault_ != NoFault) {
-
+        inst->translationFault = fault_;
-    setState(Translated);
+        if (isTranslationDelayed) {
            tryToSuppressFault();
            if (inst->translationFault == NoFault) {
                completeDisabledMemAccess();
                setState(Complete);
            }
        }
        setState(Translated);
    } else {
        setState(Translated);
        makePacket();
    }
    port.tryToSendToTransfers(this);
    /* Let's try and wake up the processor for the next cycle */
@@ -281,8 +327,6 @@ void
 LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_,
                              ThreadContext *tc, BaseTLB::Mode mode)
 {
    fault = fault_;
    port.numAccessesInDTLB--;
    unsigned int M5_VAR_USED expected_fragment_index =
@@ -292,7 +336,9 @@ LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_,
    numTranslatedFragments++;
    DPRINTFS(MinorMem, (&port), "Received translation response for fragment"
-        " %d of request: %s\n", expected_fragment_index, *inst);
+             " %d of request: %s delayed:%d %s\n", expected_fragment_index,
             *inst, isTranslationDelayed,
             fault_ != NoFault ? fault_->name() : "");
    assert(request_ == fragmentRequests[expected_fragment_index]);
@@ -300,18 +346,33 @@ LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_,
     *  tryToSendToTransfers does take */
    port.cpu.wakeupOnEvent(Pipeline::ExecuteStageId);
-    if (fault != NoFault) {
+    if (fault_ != NoFault) {
        /* tryToSendToTransfers will handle the fault */
        inst->translationFault = fault_;
        DPRINTFS(MinorMem, (&port), "Faulting translation for fragment:"
            " %d of request: %s\n",
            expected_fragment_index, *inst);
-        setState(Translated);
+        if (expected_fragment_index > 0 || isTranslationDelayed)
            tryToSuppressFault();
        if (expected_fragment_index == 0) {
            if (isTranslationDelayed && inst->translationFault == NoFault) {
                completeDisabledMemAccess();
                setState(Complete);
            } else {
                setState(Translated);
            }
        } else if (inst->translationFault == NoFault) {
            setState(Translated);
            numTranslatedFragments--;
            makeFragmentPackets();
        } else {
            setState(Translated);
        }
        port.tryToSendToTransfers(this);
    } else if (numTranslatedFragments == numFragments) {
        makeFragmentPackets();
        setState(Translated);
        port.tryToSendToTransfers(this);
    } else {
@@ -562,6 +623,7 @@ LSQ::SplitDataRequest::stepToNextPacket()
 void
 LSQ::SplitDataRequest::retireResponse(PacketPtr response)
 {
    assert(inst->translationFault == NoFault);
    assert(numRetiredFragments < numTranslatedFragments);
    DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d"
@@ -950,7 +1012,7 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request)
        return;
    }
-    if (request->fault != NoFault) {
+    if (request->inst->translationFault != NoFault) {
        if (request->inst->staticInst->isPrefetch()) {
            DPRINTF(MinorMem, "Not signalling fault for faulting prefetch\n");
        }
@@ -1508,12 +1570,18 @@ LSQ::needsToTick()
    return ret;
 }
-void
+Fault
 LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                 unsigned int size, Addr addr, Request::Flags flags,
                 uint64_t *res, AtomicOpFunctor *amo_op,
                 const std::vector<bool>& byteEnable)
 {
    assert(inst->translationFault == NoFault || inst->inLSQ);
    if (inst->inLSQ) {
        return inst->translationFault;
    }
    bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
    if (needs_burst && inst->staticInst->isAtomic()) {
@@ -1568,12 +1636,13 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
        addr, size, flags, cpu.dataMasterId(),
        /* I've no idea why we need the PC, but give it */
        inst->pc.instAddr(), amo_op);
-    if (!byteEnable.empty()) {
+    request->request->setByteEnable(byteEnable);
        request->request->setByteEnable(byteEnable);
    }
    requests.push(request);
    inst->inLSQ = true;
    request->startAddrTranslation();
    return inst->translationFault;
 }
 void
@@ -1642,16 +1711,12 @@ LSQ::issuedMemBarrierInst(MinorDynInstPtr inst)
 void
 LSQ::LSQRequest::makePacket()
 {
    assert(inst->translationFault == NoFault);
    /* Make the function idempotent */
    if (packet)
        return;
    // if the translation faulted, do not create a packet
    if (fault != NoFault) {
        assert(packet == NULL);
        return;
    }
    packet = makePacketForRequest(request, isLoad, this, data);
    /* Null the ret data so we know not to deallocate it when the
     * ret is destroyed.  The data now belongs to the ret and
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -145,9 +145,6 @@ class LSQ : public Named
        /** The underlying request of this LSQRequest */
        RequestPtr request;
        /** Fault generated performing this request */
        Fault fault;
        /** Res from pushRequest */
        uint64_t *res;
@@ -160,6 +157,9 @@ class LSQ : public Named
         *  that's visited the memory system */
        bool issuedToMemory;
        /** Address translation is delayed due to table walk */
        bool isTranslationDelayed;
        enum LSQRequestState
        {
            NotIssued, /* Newly created */
@@ -186,9 +186,14 @@ class LSQ : public Named
      protected:
        /** BaseTLB::Translation interface */
-        void markDelayed() { }
+        void markDelayed() { isTranslationDelayed = true; }
        /** Instructions may want to suppress translation faults (e.g.
         *  non-faulting vector loads).*/
        void tryToSuppressFault();
        void disableMemAccess();
        void completeDisabledMemAccess();
      public:
        LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
@@ -701,11 +706,11 @@ class LSQ : public Named
    /** Single interface for readMem/writeMem/amoMem to issue requests into
     *  the LSQ */
-    void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
+    Fault pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
-                     unsigned int size, Addr addr, Request::Flags flags,
+                      unsigned int size, Addr addr, Request::Flags flags,
-                     uint64_t *res, AtomicOpFunctor *amo_op,
+                      uint64_t *res, AtomicOpFunctor *amo_op,
-                     const std::vector<bool>& byteEnable =
+                      const std::vector<bool>& byteEnable =
-                         std::vector<bool>());
+                          std::vector<bool>());
    /** Push a predicate failed-representing request into the queues just
     *  to maintain commit order */
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -226,6 +226,7 @@ class LSQ
            Complete,
            Squashed,
            Fault,
            PartialFault,
        };
        State _state;
        LSQSenderState* _senderState;
@@ -564,6 +565,19 @@ class LSQ
            return flags.isSet(Flag::Sent);
        }
        bool
        isPartialFault()
        {
            return _state == State::PartialFault;
        }
        bool
        isMemAccessRequired()
        {
            return (_state == State::Request ||
                    (isPartialFault() && isLoad()));
        }
        /**
         * The LSQ entry is cleared
         */
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -733,7 +733,7 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
    /* This is the place were instructions get the effAddr. */
    if (req->isTranslationComplete()) {
-        if (inst->getFault() == NoFault) {
+        if (req->isMemAccessRequired()) {
            inst->effAddr = req->getVaddr();
            inst->effSize = size;
            inst->effAddrValid(true);
@@ -741,10 +741,17 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
            if (cpu->checker) {
                inst->reqToVerify = std::make_shared<Request>(*req->request());
            }
            Fault fault;
            if (isLoad)
-                inst->getFault() = cpu->read(req, inst->lqIdx);
+                fault = cpu->read(req, inst->lqIdx);
            else
-                inst->getFault() = cpu->write(req, data, inst->sqIdx);
+                fault = cpu->write(req, data, inst->sqIdx);
            // inst->getFault() may have the first-fault of a
            // multi-access split request at this point.
            // Overwrite that only if we got another type of fault
            // (e.g. re-exec).
            if (fault != NoFault)
                inst->getFault() = fault;
        } else if (isLoad) {
            inst->setMemAccPredicate(false);
            // Commit will have to clean up whatever happened.  Set this
@@ -797,13 +804,16 @@ void
 LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
        ThreadContext* tc, BaseTLB::Mode mode)
 {
-    _fault.push_back(fault);
+    int i;
-    assert(req == _requests[numTranslatedFragments] || this->isDelayed());
+    for (i = 0; i < _requests.size() && _requests[i] != req; i++);
    assert(i < _requests.size());
    _fault[i] = fault;
    numInTranslationFragments--;
    numTranslatedFragments++;
-    mainReq->setFlags(req->getFlags());
+    if (fault == NoFault)
        mainReq->setFlags(req->getFlags());
    if (numTranslatedFragments == _requests.size()) {
        if (_inst->isSquashed()) {
@@ -811,27 +821,30 @@ LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
        } else {
            _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
            flags.set(Flag::TranslationFinished);
-            auto fault_it = _fault.begin();
+            _inst->translationCompleted(true);
            /* Ffwd to the first NoFault. */
            while (fault_it != _fault.end() && *fault_it == NoFault)
                fault_it++;
            /* If none of the fragments faulted: */
            if (fault_it == _fault.end()) {
                _inst->physEffAddr = request(0)->getPaddr();
            for (i = 0; i < _fault.size() && _fault[i] == NoFault; i++);
            if (i > 0) {
                _inst->physEffAddr = request(0)->getPaddr();
                _inst->memReqFlags = mainReq->getFlags();
                if (mainReq->isCondSwap()) {
                    assert (i == _fault.size());
                    assert(_res);
                    mainReq->setExtraData(*_res);
                }
-                setState(State::Request);
+                if (i == _fault.size()) {
-                _inst->fault = NoFault;
+                    _inst->fault = NoFault;
                    setState(State::Request);
                } else {
                  _inst->fault = _fault[i];
                  setState(State::PartialFault);
                }
            } else {
                _inst->fault = _fault[0];
                setState(State::Fault);
                _inst->fault = *fault_it;
            }
            _inst->translationCompleted(true);
        }
    }
 }
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -554,6 +554,16 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
    if (inst->isTranslationDelayed() && load_fault == NoFault)
        return load_fault;
    if (load_fault != NoFault && inst->translationCompleted() &&
        inst->savedReq->isPartialFault() && !inst->savedReq->isComplete()) {
        assert(inst->savedReq->isSplit());
        // If we have a partial fault where the mem access is not complete yet
        // then the cache must have been blocked. This load will be re-executed
        // when the cache gets unblocked. We will handle the fault when the
        // mem access is complete.
        return NoFault;
    }
    // If the instruction faulted or predicated false, then we need to send it
    // along to commit without the instruction completing.
    if (load_fault != NoFault || !inst->readPredicate()) {