cpu-o3: O3 LSQ Generalisation

This patch makes a large modification to the LSQ in the O3 model. The
main goal of the patch is to remove the assumption that an operation
can be served with at most two memory requests, which is present in
the LSQ and in the instruction via the req, reqLow, reqHigh triplet,
and to generalise it to operations that can be served with a single
request and operations that require many requests, embodied in the
SingleDataRequest and the SplitDataRequest respectively.

This modification has been done mimicking the minor model to an extent,
shifting the responsibilities of dealing with VtoP translation and
tracking the status and resources from the DynInst to the LSQ via the
LSQRequest. The LSQRequest models the information concerning the
operation, handles the creation of fragments for translation and request
as well as assembling/splitting the data accordingly.

With these modifications, the implementation of vector ISAs,
particularly on the memory side, becomes richer, as the new model
permits dissociating ISA characteristics, such as vector length, from
the microarchitectural characteristics that govern how contiguous
loads are executed, allowing exploration of different LSQ-to-DL1 bus
widths to understand the tradeoffs in complexity and performance.

Part of the complexity introduced stems from the fact that gem5 keeps
a large amount of metadata regarding, in particular, memory
operations. Thus, when an instruction is squashed while some operation
such as a TLB lookup or cache access is ongoing, and the relevant
structure later communicates to the LSQ that the operation is over, it
tries to access pieces of data that should have died when the
instruction was squashed, leading to asserts, panics, or memory
corruption. To ensure correct behaviour, LSQRequests rely on assessing
who their owner is, and self-destruct if they detect that their owner
is done with the request and there will be no subsequent action. For
example, in the case of an instruction squashed while the TLB is doing
a walk to serve the translation, when the translation is served by the
TLB, the LSQRequest detects that the instruction was squashed, and, as
the translation is done, no one else expects to access its
information, and therefore it self-destructs. Destroying the
LSQRequest earlier would lead to wrong behaviour, as the TLB walk may
access some of its fields.

Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>

Change-Id: I9578a1a3f6b899c390cdd886856a24db68ff7d0c
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/13516
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
This commit is contained in:
Rekai Gonzalez-Alberquilla
2017-02-13 09:41:44 +00:00
committed by Giacomo Travaglini
parent 6379bebd41
commit 51becd2475
12 changed files with 1919 additions and 1281 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017 ARM Limited
* Copyright (c) 2017-2018 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011,2013,2016 ARM Limited
* Copyright (c) 2011, 2013, 2016-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved.
*
@@ -84,6 +84,10 @@ class BaseDynInst : public ExecContext, public RefCounted
typedef typename ImplCPU::ImplState ImplState;
using VecRegContainer = TheISA::VecRegContainer;
using LSQRequestPtr = typename Impl::CPUPol::LSQ::LSQRequest*;
using LQIterator = typename Impl::CPUPol::LSQUnit::LQIterator;
using SQIterator = typename Impl::CPUPol::LSQUnit::SQIterator;
// The DynInstPtr type.
typedef typename Impl::DynInstPtr DynInstPtr;
typedef RefCountingPtr<BaseDynInst<Impl> > BaseDynInstPtr;
@@ -203,12 +207,7 @@ class BaseDynInst : public ExecContext, public RefCounted
Addr effAddr;
/** The effective physical address. */
Addr physEffAddrLow;
/** The effective physical address
* of the second request for a split request
*/
Addr physEffAddrHigh;
Addr physEffAddr;
/** The memory request flags (from translation). */
unsigned memReqFlags;
@@ -224,19 +223,19 @@ class BaseDynInst : public ExecContext, public RefCounted
/** Load queue index. */
int16_t lqIdx;
LQIterator lqIt;
/** Store queue index. */
int16_t sqIdx;
SQIterator sqIt;
/////////////////////// TLB Miss //////////////////////
/**
* Saved memory requests (needed when the DTB address translation is
* Saved memory request (needed when the DTB address translation is
* delayed due to a hw page table walk).
*/
RequestPtr savedReq;
RequestPtr savedSreqLow;
RequestPtr savedSreqHigh;
LSQRequestPtr savedReq;
/////////////////////// Checker //////////////////////
// Need a copy of main request pointer to verify on writes.
@@ -270,6 +269,7 @@ class BaseDynInst : public ExecContext, public RefCounted
/** Is the effective virtual address valid. */
bool effAddrValid() const { return instFlags[EffAddrValid]; }
void effAddrValid(bool b) { instFlags[EffAddrValid] = b; }
/** Whether or not the memory operation is done. */
bool memOpDone() const { return instFlags[MemOpDone]; }
@@ -303,18 +303,6 @@ class BaseDynInst : public ExecContext, public RefCounted
Fault writeMem(uint8_t *data, unsigned size, Addr addr,
Request::Flags flags, uint64_t *res);
/** Splits a request in two if it crosses a dcache block. */
void splitRequest(const RequestPtr &req, RequestPtr &sreqLow,
RequestPtr &sreqHigh);
/** Initiate a DTB address translation. */
void initiateTranslation(const RequestPtr &req, const RequestPtr &sreqLow,
const RequestPtr &sreqHigh, uint64_t *res,
BaseTLB::Mode mode);
/** Finish a DTB address translation. */
void finishTranslation(WholeTranslationState *state);
/** True if the DTB address translation has started. */
bool translationStarted() const { return instFlags[TranslationStarted]; }
void translationStarted(bool f) { instFlags[TranslationStarted] = f; }
@@ -454,6 +442,9 @@ class BaseDynInst : public ExecContext, public RefCounted
/** Returns the fault type. */
Fault getFault() const { return fault; }
/** TODO: This I added for the LSQRequest side to be able to modify the
* fault. There should be a better mechanism in place. */
Fault& getFault() { return fault; }
/** Checks whether or not this instruction has had its branch target
* calculated yet. For now it is not utilized and is hacked to be
@@ -589,7 +580,8 @@ class BaseDynInst : public ExecContext, public RefCounted
int8_t numIntDestRegs() const { return staticInst->numIntDestRegs(); }
int8_t numCCDestRegs() const { return staticInst->numCCDestRegs(); }
int8_t numVecDestRegs() const { return staticInst->numVecDestRegs(); }
int8_t numVecElemDestRegs() const {
int8_t numVecElemDestRegs() const
{
return staticInst->numVecElemDestRegs();
}
@@ -837,6 +829,7 @@ class BaseDynInst : public ExecContext, public RefCounted
/** Sets the ASID. */
void setASID(short addr_space_id) { asid = addr_space_id; }
short getASID() { return asid; }
/** Sets the thread id. */
void setTid(ThreadID tid) { threadNumber = tid; }
@@ -853,9 +846,12 @@ class BaseDynInst : public ExecContext, public RefCounted
/** Is this instruction's memory access strictly ordered? */
bool strictlyOrdered() const { return instFlags[IsStrictlyOrdered]; }
void strictlyOrdered(bool so) { instFlags[IsStrictlyOrdered] = so; }
/** Has this instruction generated a memory request. */
bool hasRequest() const { return instFlags[ReqMade]; }
/** Assert this instruction has generated a memory request. */
void setRequest() { instFlags[ReqMade] = true; }
/** Returns iterator to this instruction in the list of all insts. */
ListIt &getInstListIt() { return instListIt; }
@@ -887,50 +883,9 @@ Fault
BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
Request::Flags flags)
{
instFlags[ReqMade] = true;
RequestPtr req = NULL;
RequestPtr sreqLow = NULL;
RequestPtr sreqHigh = NULL;
if (instFlags[ReqMade] && translationStarted()) {
req = savedReq;
sreqLow = savedSreqLow;
sreqHigh = savedSreqHigh;
} else {
req = std::make_shared<Request>(
asid, addr, size, flags, masterId(),
this->pc.instAddr(), thread->contextId());
req->taskId(cpu->taskId());
// Only split the request if the ISA supports unaligned accesses.
if (TheISA::HasUnalignedMemAcc) {
splitRequest(req, sreqLow, sreqHigh);
}
initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read);
}
if (translationCompleted()) {
if (fault == NoFault) {
effAddr = req->getVaddr();
effSize = size;
instFlags[EffAddrValid] = true;
if (cpu->checker) {
reqToVerify = std::make_shared<Request>(*req);
}
fault = cpu->read(req, sreqLow, sreqHigh, lqIdx);
} else {
// Commit will have to clean up whatever happened. Set this
// instruction as executed.
this->setExecuted();
}
}
if (traceData)
traceData->setMem(addr, size, flags);
return fault;
return cpu->pushRequest(
dynamic_cast<typename DynInstPtr::PtrType>(this),
/* ld */ true, nullptr, size, addr, flags, nullptr);
}
template<class Impl>
@@ -938,154 +893,9 @@ Fault
BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr,
Request::Flags flags, uint64_t *res)
{
if (traceData)
traceData->setMem(addr, size, flags);
instFlags[ReqMade] = true;
RequestPtr req = NULL;
RequestPtr sreqLow = NULL;
RequestPtr sreqHigh = NULL;
if (instFlags[ReqMade] && translationStarted()) {
req = savedReq;
sreqLow = savedSreqLow;
sreqHigh = savedSreqHigh;
} else {
req = std::make_shared<Request>(
asid, addr, size, flags, masterId(),
this->pc.instAddr(), thread->contextId());
req->taskId(cpu->taskId());
// Only split the request if the ISA supports unaligned accesses.
if (TheISA::HasUnalignedMemAcc) {
splitRequest(req, sreqLow, sreqHigh);
}
initiateTranslation(req, sreqLow, sreqHigh, res, BaseTLB::Write);
}
if (fault == NoFault && translationCompleted()) {
effAddr = req->getVaddr();
effSize = size;
instFlags[EffAddrValid] = true;
if (cpu->checker) {
reqToVerify = std::make_shared<Request>(*req);
}
fault = cpu->write(req, sreqLow, sreqHigh, data, sqIdx);
}
return fault;
}
template<class Impl>
inline void
BaseDynInst<Impl>::splitRequest(const RequestPtr &req, RequestPtr &sreqLow,
RequestPtr &sreqHigh)
{
// Check to see if the request crosses the next level block boundary.
unsigned block_size = cpu->cacheLineSize();
Addr addr = req->getVaddr();
Addr split_addr = roundDown(addr + req->getSize() - 1, block_size);
assert(split_addr <= addr || split_addr - addr < block_size);
// Spans two blocks.
if (split_addr > addr) {
req->splitOnVaddr(split_addr, sreqLow, sreqHigh);
}
}
template<class Impl>
inline void
BaseDynInst<Impl>::initiateTranslation(const RequestPtr &req,
const RequestPtr &sreqLow,
const RequestPtr &sreqHigh,
uint64_t *res,
BaseTLB::Mode mode)
{
translationStarted(true);
if (!TheISA::HasUnalignedMemAcc || sreqLow == NULL) {
WholeTranslationState *state =
new WholeTranslationState(req, NULL, res, mode);
// One translation if the request isn't split.
DataTranslation<BaseDynInstPtr> *trans =
new DataTranslation<BaseDynInstPtr>(this, state);
cpu->dtb->translateTiming(req, thread->getTC(), trans, mode);
if (!translationCompleted()) {
// The translation isn't yet complete, so we can't possibly have a
// fault. Overwrite any existing fault we might have from a previous
// execution of this instruction (e.g. an uncachable load that
// couldn't execute because it wasn't at the head of the ROB).
fault = NoFault;
// Save memory requests.
savedReq = state->mainReq;
savedSreqLow = state->sreqLow;
savedSreqHigh = state->sreqHigh;
}
} else {
WholeTranslationState *state =
new WholeTranslationState(req, sreqLow, sreqHigh, NULL, res, mode);
// Two translations when the request is split.
DataTranslation<BaseDynInstPtr> *stransLow =
new DataTranslation<BaseDynInstPtr>(this, state, 0);
DataTranslation<BaseDynInstPtr> *stransHigh =
new DataTranslation<BaseDynInstPtr>(this, state, 1);
cpu->dtb->translateTiming(sreqLow, thread->getTC(), stransLow, mode);
cpu->dtb->translateTiming(sreqHigh, thread->getTC(), stransHigh, mode);
if (!translationCompleted()) {
// The translation isn't yet complete, so we can't possibly have a
// fault. Overwrite any existing fault we might have from a previous
// execution of this instruction (e.g. an uncachable load that
// couldn't execute because it wasn't at the head of the ROB).
fault = NoFault;
// Save memory requests.
savedReq = state->mainReq;
savedSreqLow = state->sreqLow;
savedSreqHigh = state->sreqHigh;
}
}
}
template<class Impl>
inline void
BaseDynInst<Impl>::finishTranslation(WholeTranslationState *state)
{
fault = state->getFault();
instFlags[IsStrictlyOrdered] = state->isStrictlyOrdered();
if (fault == NoFault) {
// save Paddr for a single req
physEffAddrLow = state->getPaddr();
// case for the request that has been split
if (state->isSplit) {
physEffAddrLow = state->sreqLow->getPaddr();
physEffAddrHigh = state->sreqHigh->getPaddr();
}
memReqFlags = state->getFlags();
if (state->mainReq->isCondSwap()) {
assert(state->res);
state->mainReq->setExtraData(*state->res);
}
} else {
state->deleteReqs();
}
delete state;
translationCompleted(true);
return cpu->pushRequest(
dynamic_cast<typename DynInstPtr::PtrType>(this),
/* st */ false, data, size, addr, flags, res);
}
#endif // __CPU_BASE_DYN_INST_HH__

View File

@@ -69,8 +69,6 @@ BaseDynInst<Impl>::BaseDynInst(const StaticInstPtr &_staticInst,
macroop(_macroop),
memData(nullptr),
savedReq(nullptr),
savedSreqLow(nullptr),
savedSreqHigh(nullptr),
reqToVerify(nullptr)
{
seqNum = seq_num;
@@ -96,8 +94,7 @@ BaseDynInst<Impl>::initVars()
{
memData = NULL;
effAddr = 0;
physEffAddrLow = 0;
physEffAddrHigh = 0;
physEffAddr = 0;
readyRegs = 0;
memReqFlags = 0;

View File

@@ -850,7 +850,6 @@ FullO3CPU<Impl>::insertThread(ThreadID tid)
//Reset ROB/IQ/LSQ Entries
commit.rob->resetEntries();
iew.resetEntries();
}
template <class Impl>

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2013, 2016 ARM Limited
* Copyright (c) 2011-2013, 2016-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -125,6 +125,7 @@ class FullO3CPU : public BaseO3CPU
BaseTLB *itb;
BaseTLB *dtb;
using LSQRequest = typename LSQ<Impl>::LSQRequest;
/** Overall CPU status. */
Status _status;
@@ -733,21 +734,25 @@ class FullO3CPU : public BaseO3CPU
/** Available thread ids in the cpu*/
std::vector<ThreadID> tids;
/** CPU read function, forwards read to LSQ. */
Fault read(const RequestPtr &req,
RequestPtr &sreqLow, RequestPtr &sreqHigh,
int load_idx)
/** CPU pushRequest function, forwards request to LSQ. */
Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res)
{
return this->iew.ldstQueue.read(req, sreqLow, sreqHigh, load_idx);
return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr,
flags, res);
}
/** CPU read function, forwards read to LSQ. */
Fault read(LSQRequest* req, int load_idx)
{
return this->iew.ldstQueue.read(req, load_idx);
}
/** CPU write function, forwards write to LSQ. */
Fault write(const RequestPtr &req,
const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
uint8_t *data, int store_idx)
Fault write(LSQRequest* req, uint8_t *data, int store_idx)
{
return this->iew.ldstQueue.write(req, sreqLow, sreqHigh,
data, store_idx);
return this->iew.ldstQueue.write(req, data, store_idx);
}
/** Used by the fetch unit to get a hold of the instruction port. */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010-2013 ARM Limited
* Copyright (c) 2010-2013, 2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved.
*
@@ -743,14 +743,6 @@ DefaultIEW<Impl>::updateStatus()
}
}
template <class Impl>
void
DefaultIEW<Impl>::resetEntries()
{
instQueue.resetEntries();
ldstQueue.resetEntries();
}
template <class Impl>
bool
DefaultIEW<Impl>::checkStall(ThreadID tid)
@@ -1353,7 +1345,7 @@ DefaultIEW<Impl>::executeInsts()
DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s "
"[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n",
violator->pcState(), violator->seqNum,
inst->pcState(), inst->seqNum, inst->physEffAddrLow);
inst->pcState(), inst->seqNum, inst->physEffAddr);
fetchRedirect[tid] = true;
@@ -1376,7 +1368,7 @@ DefaultIEW<Impl>::executeInsts()
DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: "
"%s, inst PC: %s. Addr is: %#x.\n",
violator->pcState(), inst->pcState(),
inst->physEffAddrLow);
inst->physEffAddr);
DPRINTF(IEW, "Violation will not be handled because "
"already squashing\n");
@@ -1460,6 +1452,8 @@ DefaultIEW<Impl>::tick()
wroteToTimeBuffer = false;
updatedQueues = false;
ldstQueue.tick();
sortInsts();
// Free function units marked as being freed this cycle.

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2014 ARM Limited
* Copyright (c) 2011-2014, 2017-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved.
*
@@ -1140,9 +1140,6 @@ template <class Impl>
void
InstructionQueue<Impl>::blockMemInst(const DynInstPtr &blocked_inst)
{
blocked_inst->translationStarted(false);
blocked_inst->translationCompleted(false);
blocked_inst->clearIssued();
blocked_inst->clearCanIssue();
blockedMemInsts.push_back(blocked_inst);
@@ -1285,9 +1282,9 @@ InstructionQueue<Impl>::doSquash(ThreadID tid)
squashed_inst);
}
++iqSquashedOperandsExamined;
}
} else if (!squashed_inst->isStoreConditional() ||
!squashed_inst->isCompleted()) {
NonSpecMapIt ns_inst_it =

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2012, 2014 ARM Limited
* Copyright (c) 2011-2012, 2014, 2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -47,8 +47,9 @@
#include <map>
#include <queue>
#include "cpu/o3/lsq_unit.hh"
#include "arch/generic/tlb.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/lsq_unit.hh"
#include "enums/SMTQueuePolicy.hh"
#include "mem/port.hh"
#include "sim/sim_object.hh"
@@ -56,13 +57,659 @@
struct DerivO3CPUParams;
template <class Impl>
class LSQ {
class LSQ
{
public:
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::CPUPol::IEW IEW;
typedef typename Impl::CPUPol::LSQUnit LSQUnit;
class LSQRequest;
/** Derived class to hold any sender state the LSQ needs. */
class LSQSenderState : public Packet::SenderState
{
protected:
/** The senderState needs to know the LSQRequest who owns it. */
LSQRequest* _request;
/** Default constructor. */
LSQSenderState(LSQRequest* request, bool isLoad_)
: _request(request), mainPkt(nullptr), pendingPacket(nullptr),
outstanding(0), isLoad(isLoad_), needWB(isLoad_), isSplit(false),
pktToSend(false), deleted(false)
{ }
public:
/** Instruction which initiated the access to memory. */
DynInstPtr inst;
/** The main packet from a split load, used during writeback. */
PacketPtr mainPkt;
/** A second packet from a split store that needs sending. */
PacketPtr pendingPacket;
/** Number of outstanding packets to complete. */
uint8_t outstanding;
/** Whether or not it is a load. */
bool isLoad;
/** Whether or not the instruction will need to writeback. */
bool needWB;
/** Whether or not this access is split in two. */
bool isSplit;
/** Whether or not there is a packet that needs sending. */
bool pktToSend;
/** Has the request been deleted?
* LSQ entries can be squashed before the response comes back. in that
* case the SenderState knows.
*/
bool deleted;
ContextID contextId() { return inst->contextId(); }
/** Completes a packet and returns whether the access is finished. */
inline bool isComplete() { return outstanding == 0; }
inline void deleteRequest() { deleted = true; }
inline bool alive() { return !deleted; }
LSQRequest* request() { return _request; }
virtual void complete() = 0;
void writebackDone() { _request->writebackDone(); }
};
/** Memory operation metadata.
* This class holds the information about a memory operation. It lives
* from initiateAcc to resource deallocation at commit or squash.
* LSQRequest objects are owned by the LQ/SQ Entry in the LSQUnit that
* holds the operation. It is also used by the LSQSenderState. In addition,
* the LSQRequest is a TranslationState, therefore, upon squash, there must
* be a defined ownership transferal in case the LSQ resources are
* deallocated before the TLB is done using the TranslationState. If that
* happens, the LSQRequest will be self-owned, and responsible to detect
* that its services are no longer required and self-destruct.
*
* Lifetime of a LSQRequest:
* +--------------------+
* |LSQ creates and owns|
* +--------------------+
* |
* +--------------------+
* | Initate translation|
* +--------------------+
* |
* ___^___
* ___/ \___
* ______/ Squashed? \
* | \___ ___/
* | \___ ___/
* | v
* | |
* | +--------------------+
* | | Translation done |
* | +--------------------+
* | |
* | +--------------------+
* | | Send packet |<------+
* | +--------------------+ |
* | | |
* | ___^___ |
* | ___/ \___ |
* | ____/ Squashed? \ |
* | | \___ ___/ |
* | | \___ ___/ |
* | | v |
* | | | |
* | | ___^___ |
* | | ___/ \___ |
* | | / Done? \__________|
* | | \___ ___/
* | | \___ ___/
* | | v
* | | |
* | | +--------------------+
* | | | Manage stuff |
* | | | Free resources |
* | | +--------------------+
* | |
* | | +--------------------+
* | | | senderState owns |
* | +->| onRecvTimingResp |
* | | free resources |
* | +--------------------+
* |
* | +----------------------+
* | | self owned (Trans) |
* +-->| on TranslationFinish |
* | free resources |
* +----------------------+
*
*
*/
class LSQRequest : public BaseTLB::Translation
{
protected:
typedef uint32_t FlagsStorage;
typedef ::Flags<FlagsStorage> FlagsType;
enum Flag : FlagsStorage
{
IsLoad = 0x00000001,
/** True if this is a store that writes registers (SC). */
WbStore = 0x00000002,
Delayed = 0x00000004,
IsSplit = 0x00000008,
/** True if any translation has been sent to TLB. */
TranslationStarted = 0x00000010,
/** True if there are un-replied outbound translations.. */
TranslationFinished = 0x00000020,
Sent = 0x00000040,
Retry = 0x00000080,
Complete = 0x00000100,
/** Ownership tracking flags. */
/** Translation squashed. */
TranslationSquashed = 0x00000200,
/** Request discarded */
Discarded = 0x00000400,
/** LSQ resources freed. */
LSQEntryFreed = 0x00000800,
/** Store written back. */
WritebackScheduled = 0x00001000,
WritebackDone = 0x00002000
};
FlagsType flags;
enum class State
{
NotIssued,
Translation,
Request,
Complete,
Squashed,
Fault,
};
State _state;
LSQSenderState* _senderState;
void setState(const State& newState) { _state = newState; }
uint32_t numTranslatedFragments;
uint32_t numInTranslationFragments;
/** LQ/SQ entry idx. */
uint32_t _entryIdx;
void markDelayed() { flags.set(Flag::Delayed); }
bool isDelayed() { return flags.isSet(Flag::Delayed); }
public:
LSQUnit& _port;
const DynInstPtr _inst;
uint32_t _taskId;
PacketDataPtr _data;
std::vector<PacketPtr> _packets;
std::vector<RequestPtr> _requests;
std::vector<Fault> _fault;
uint64_t* _res;
const Addr _addr;
const uint32_t _size;
const Request::Flags _flags;
uint32_t _numOutstandingPackets;
protected:
LSQUnit* lsqUnit() { return &_port; }
LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad) :
_state(State::NotIssued), _senderState(nullptr),
_port(*port), _inst(inst), _data(nullptr),
_res(nullptr), _addr(0), _size(0), _flags(0),
_numOutstandingPackets(0)
{
flags.set(Flag::IsLoad, isLoad);
flags.set(Flag::WbStore, _inst->isStoreConditional());
install();
}
LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
const Addr& addr, const uint32_t& size,
const Request::Flags& flags_,
PacketDataPtr data = nullptr, uint64_t* res = nullptr)
: _state(State::NotIssued), _senderState(nullptr),
numTranslatedFragments(0),
numInTranslationFragments(0),
_port(*port), _inst(inst), _data(data),
_res(res), _addr(addr), _size(size),
_flags(flags_),
_numOutstandingPackets(0)
{
flags.set(Flag::IsLoad, isLoad);
flags.set(Flag::WbStore, _inst->isStoreConditional());
install();
}
bool
isLoad() const
{
return flags.isSet(Flag::IsLoad);
}
/** Install the request in the LQ/SQ. */
void install()
{
if (isLoad()) {
_port.loadQueue[_inst->lqIdx].setRequest(this);
} else {
_port.storeQueue[_inst->sqIdx].setRequest(this);
}
}
virtual bool
squashed() const override
{
return _inst->isSquashed();
}
/**
* Test if the LSQRequest has been released, i.e. self-owned.
* An LSQRequest manages itself when the resources on the LSQ are freed
* but the translation is still going on and the LSQEntry was freed.
*/
bool
isReleased()
{
return flags.isSet(Flag::LSQEntryFreed) ||
flags.isSet(Flag::Discarded);
}
/** Release the LSQRequest.
* Notify the sender state that the request it points to is not valid
* anymore. Understand if the request is orphan (self-managed) and if
* so, mark it as freed, else destroy it, as this means
* the end of its life cycle.
* An LSQRequest is orphan when its resources are released
* but there is any in-flight translation request to the TLB or access
* request to the memory.
*/
void release(Flag reason)
{
assert(reason == Flag::LSQEntryFreed || reason == Flag::Discarded);
if (!isAnyOutstandingRequest()) {
delete this;
} else {
if (_senderState) {
_senderState->deleteRequest();
}
flags.set(reason);
}
}
/** Destructor.
* The LSQRequest owns the request. If the packet has already been
* sent, the sender state will be deleted upon receiving the reply.
*/
virtual ~LSQRequest()
{
assert(!isAnyOutstandingRequest());
_inst->savedReq = nullptr;
if (_senderState)
delete _senderState;
for (auto r: _packets)
delete r;
};
public:
/** Convenience getters/setters. */
/** @{ */
/** Set up Context numbers. */
void
setContext(const ContextID& context_id)
{
request()->setContext(context_id);
}
const DynInstPtr&
instruction()
{
return _inst;
}
/** Set up virtual request.
* For a previously allocated Request objects.
*/
void
setVirt(int asid, Addr vaddr, unsigned size, Request::Flags flags_,
MasterID mid, Addr pc)
{
request()->setVirt(asid, vaddr, size, flags_, mid, pc);
}
void
taskId(const uint32_t& v)
{
_taskId = v;
for (auto& r: _requests)
r->taskId(v);
}
uint32_t taskId() const { return _taskId; }
RequestPtr request(int idx = 0) { return _requests.at(idx); }
const RequestPtr
request(int idx = 0) const
{
return _requests.at(idx);
}
Addr getVaddr(int idx = 0) const { return request(idx)->getVaddr(); }
virtual void initiateTranslation() = 0;
PacketPtr packet(int idx = 0) { return _packets.at(idx); }
virtual PacketPtr
mainPacket()
{
assert (_packets.size() == 1);
return packet();
}
virtual RequestPtr
mainRequest()
{
assert (_requests.size() == 1);
return request();
}
void
senderState(LSQSenderState* st)
{
_senderState = st;
for (auto& pkt: _packets) {
if (pkt)
pkt->senderState = st;
}
}
const LSQSenderState*
senderState() const
{
return _senderState;
}
/**
* Mark senderState as discarded. This will cause to discard response
* packets from the cache.
*/
void
discardSenderState()
{
assert(_senderState);
_senderState->deleteRequest();
}
/**
* Test if there is any in-flight translation or mem access request
*/
bool
isAnyOutstandingRequest()
{
return numInTranslationFragments > 0 ||
_numOutstandingPackets > 0 ||
(flags.isSet(Flag::WritebackScheduled) &&
!flags.isSet(Flag::WritebackDone));
}
bool
isSplit() const
{
return flags.isSet(Flag::IsSplit);
}
/** @} */
virtual bool recvTimingResp(PacketPtr pkt) = 0;
virtual void sendPacketToCache() = 0;
virtual void buildPackets() = 0;
/**
* Memory mapped IPR accesses
*/
virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt) = 0;
virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt) = 0;
/**
* Test if the request accesses a particular cache line.
*/
virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0;
/** Update the status to reflect that a packet was sent. */
void
packetSent()
{
flags.set(Flag::Sent);
}
/** Update the status to reflect that a packet was not sent.
* When a packet fails to be sent, we mark the request as needing a
* retry. Note that Retry flag is sticky.
*/
void
packetNotSent()
{
flags.set(Flag::Retry);
flags.clear(Flag::Sent);
}
void sendFragmentToTranslation(int i);
bool
isComplete()
{
return flags.isSet(Flag::Complete);
}
bool
isInTranslation()
{
return _state == State::Translation;
}
bool
isTranslationComplete()
{
return flags.isSet(Flag::TranslationStarted) &&
!isInTranslation();
}
bool
isTranslationBlocked()
{
return _state == State::Translation &&
flags.isSet(Flag::TranslationStarted) &&
!flags.isSet(Flag::TranslationFinished);
}
bool
isSent()
{
return flags.isSet(Flag::Sent);
}
/**
* The LSQ entry is cleared
*/
void
freeLSQEntry()
{
release(Flag::LSQEntryFreed);
}
/**
* The request is discarded (e.g. partial store-load forwarding)
*/
void
discard()
{
release(Flag::Discarded);
}
void
packetReplied()
{
assert(_numOutstandingPackets > 0);
_numOutstandingPackets--;
if (_numOutstandingPackets == 0 && isReleased())
delete this;
}
void
writebackScheduled()
{
assert(!flags.isSet(Flag::WritebackScheduled));
flags.set(Flag::WritebackScheduled);
}
void
writebackDone()
{
flags.set(Flag::WritebackDone);
/* If the lsq resources are already free */
if (isReleased()) {
delete this;
}
}
void
squashTranslation()
{
assert(numInTranslationFragments == 0);
flags.set(Flag::TranslationSquashed);
/* If we are on our own, self-destruct. */
if (isReleased()) {
delete this;
}
}
void
complete()
{
flags.set(Flag::Complete);
}
};
class SingleDataRequest : public LSQRequest
{
protected:
/* Given that we are inside templates, children need explicit
* declaration of the names in the parent class. */
using Flag = typename LSQRequest::Flag;
using State = typename LSQRequest::State;
using LSQRequest::_fault;
using LSQRequest::_inst;
using LSQRequest::_packets;
using LSQRequest::_port;
using LSQRequest::_res;
using LSQRequest::_senderState;
using LSQRequest::_state;
using LSQRequest::flags;
using LSQRequest::isLoad;
using LSQRequest::isTranslationComplete;
using LSQRequest::lsqUnit;
using LSQRequest::request;
using LSQRequest::sendFragmentToTranslation;
using LSQRequest::setState;
using LSQRequest::numInTranslationFragments;
using LSQRequest::numTranslatedFragments;
using LSQRequest::_numOutstandingPackets;
public:
SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
const Addr& addr, const uint32_t& size,
const Request::Flags& flags_,
PacketDataPtr data = nullptr,
uint64_t* res = nullptr) :
LSQRequest(port, inst, isLoad, addr, size, flags_, data, res)
{
LSQRequest::_requests.push_back(
std::make_shared<Request>(inst->getASID(), addr, size, flags_,
inst->masterId(), inst->instAddr(), inst->contextId()));
LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum);
}
inline virtual ~SingleDataRequest() {}
virtual void initiateTranslation();
virtual void finish(const Fault &fault, const RequestPtr &req,
ThreadContext* tc, BaseTLB::Mode mode);
virtual bool recvTimingResp(PacketPtr pkt);
virtual void sendPacketToCache();
virtual void buildPackets();
virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt);
virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt);
virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
};
/** An LSQRequest for operations that cross a cache-line boundary and
 *  therefore need several memory requests (fragments). The fragments
 *  are built in initiateTranslation(), translated independently, and
 *  their responses are reassembled into a single "main" packet that is
 *  presented to the rest of the pipeline as if it were one access. */
class SplitDataRequest : public LSQRequest
{
  protected:
    /* Given that we are inside templates, children need explicit
     * declaration of the names in the parent class. */
    using Flag = typename LSQRequest::Flag;
    using State = typename LSQRequest::State;
    using LSQRequest::_addr;
    using LSQRequest::_data;
    using LSQRequest::_fault;
    using LSQRequest::_flags;
    using LSQRequest::_inst;
    using LSQRequest::_packets;
    using LSQRequest::_port;
    using LSQRequest::_requests;
    using LSQRequest::_res;
    using LSQRequest::_senderState;
    using LSQRequest::_size;
    using LSQRequest::_state;
    using LSQRequest::_taskId;
    using LSQRequest::flags;
    using LSQRequest::isLoad;
    using LSQRequest::isTranslationComplete;
    using LSQRequest::lsqUnit;
    using LSQRequest::numInTranslationFragments;
    using LSQRequest::numTranslatedFragments;
    using LSQRequest::request;
    using LSQRequest::sendFragmentToTranslation;
    using LSQRequest::setState;
    using LSQRequest::_numOutstandingPackets;

    /** Number of fragments the operation was split into. */
    uint32_t numFragments;
    /** Number of fragment responses received back from the cache. */
    uint32_t numReceivedPackets;
    /** Request spanning the whole virtual range; carries the merged
     *  metadata (flags) of the fragments rather than a real paddr. */
    RequestPtr mainReq;
    /** Raw packet presented to the pipeline once all fragments have
     *  replied; owned by this object. */
    PacketPtr _mainPacket;

  public:
    SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad,
                     const Addr& addr, const uint32_t& size,
                     const Request::Flags & flags_,
                     PacketDataPtr data = nullptr,
                     uint64_t* res = nullptr) :
        LSQRequest(port, inst, isLoad, addr, size, flags_, data, res),
        numFragments(0),
        numReceivedPackets(0),
        mainReq(nullptr),
        _mainPacket(nullptr)
    {
        flags.set(Flag::IsSplit);
    }

    virtual ~SplitDataRequest()
    {
        // mainReq is a shared_ptr and releases itself; only the raw
        // main packet needs explicit deletion (delete on nullptr is a
        // well-defined no-op, so no guard is needed).
        delete _mainPacket;
    }

    virtual void finish(const Fault &fault, const RequestPtr &req,
            ThreadContext* tc, BaseTLB::Mode mode);
    virtual bool recvTimingResp(PacketPtr pkt);
    virtual void initiateTranslation();
    virtual void sendPacketToCache();
    virtual void buildPackets();

    virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt);
    virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt);
    virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);

    virtual RequestPtr mainRequest();
    virtual PacketPtr mainPacket();
};
/** Constructs an LSQ with the given parameters. */
LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params);
~LSQ() { }
@@ -85,17 +732,9 @@ class LSQ {
/** Number of entries needed for the given amount of threads.*/
int entryAmount(ThreadID num_threads);
void removeEntries(ThreadID tid);
/** Reset the max entries for each thread. */
void resetEntries();
/** Resize the max entries for a thread. */
void resizeEntries(unsigned size, ThreadID tid);
/** Ticks the LSQ. */
void tick();
/** Ticks a specific LSQ Unit. */
void tick(ThreadID tid)
{ thread[tid].tick(); }
void tick() { usedStorePorts = 0; }
/** Inserts a load into the LSQ. */
void insertLoad(const DynInstPtr &load_inst);
@@ -112,13 +751,13 @@ class LSQ {
* Commits loads up until the given sequence number for a specific thread.
*/
void commitLoads(InstSeqNum &youngest_inst, ThreadID tid)
{ thread[tid].commitLoads(youngest_inst); }
{ thread.at(tid).commitLoads(youngest_inst); }
/**
* Commits stores up until the given sequence number for a specific thread.
*/
void commitStores(InstSeqNum &youngest_inst, ThreadID tid)
{ thread[tid].commitStores(youngest_inst); }
{ thread.at(tid).commitStores(youngest_inst); }
/**
* Attempts to write back stores until all cache ports are used or the
@@ -131,8 +770,11 @@ class LSQ {
/**
* Squash instructions from a thread until the specified sequence number.
*/
void squash(const InstSeqNum &squashed_num, ThreadID tid)
{ thread[tid].squash(squashed_num); }
void
squash(const InstSeqNum &squashed_num, ThreadID tid)
{
thread.at(tid).squash(squashed_num);
}
/** Returns whether or not there was a memory ordering violation. */
bool violation();
@@ -140,50 +782,49 @@ class LSQ {
* Returns whether or not there was a memory ordering violation for a
* specific thread.
*/
bool violation(ThreadID tid)
{ return thread[tid].violation(); }
bool violation(ThreadID tid) { return thread.at(tid).violation(); }
/** Gets the instruction that caused the memory ordering violation. */
DynInstPtr getMemDepViolator(ThreadID tid)
{ return thread[tid].getMemDepViolator(); }
DynInstPtr
getMemDepViolator(ThreadID tid)
{
return thread.at(tid).getMemDepViolator();
}
/** Returns the head index of the load queue for a specific thread. */
int getLoadHead(ThreadID tid)
{ return thread[tid].getLoadHead(); }
int getLoadHead(ThreadID tid) { return thread.at(tid).getLoadHead(); }
/** Returns the sequence number of the head of the load queue. */
InstSeqNum getLoadHeadSeqNum(ThreadID tid)
InstSeqNum
getLoadHeadSeqNum(ThreadID tid)
{
return thread[tid].getLoadHeadSeqNum();
return thread.at(tid).getLoadHeadSeqNum();
}
/** Returns the head index of the store queue. */
int getStoreHead(ThreadID tid)
{ return thread[tid].getStoreHead(); }
int getStoreHead(ThreadID tid) { return thread.at(tid).getStoreHead(); }
/** Returns the sequence number of the head of the store queue. */
InstSeqNum getStoreHeadSeqNum(ThreadID tid)
InstSeqNum
getStoreHeadSeqNum(ThreadID tid)
{
return thread[tid].getStoreHeadSeqNum();
return thread.at(tid).getStoreHeadSeqNum();
}
/** Returns the number of instructions in all of the queues. */
int getCount();
/** Returns the number of instructions in the queues of one thread. */
int getCount(ThreadID tid)
{ return thread[tid].getCount(); }
int getCount(ThreadID tid) { return thread.at(tid).getCount(); }
/** Returns the total number of loads in the load queue. */
int numLoads();
/** Returns the total number of loads for a single thread. */
int numLoads(ThreadID tid)
{ return thread[tid].numLoads(); }
int numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }
/** Returns the total number of stores in the store queue. */
int numStores();
/** Returns the total number of stores for a single thread. */
int numStores(ThreadID tid)
{ return thread[tid].numStores(); }
int numStores(ThreadID tid) { return thread.at(tid).numStores(); }
/** Returns the number of free load entries. */
unsigned numFreeLoadEntries();
@@ -242,46 +883,39 @@ class LSQ {
/** Returns whether or not a specific thread has any stores to write back
* to memory.
*/
bool hasStoresToWB(ThreadID tid)
{ return thread[tid].hasStoresToWB(); }
bool hasStoresToWB(ThreadID tid) { return thread.at(tid).hasStoresToWB(); }
/** Returns the number of stores a specific thread has to write back. */
int numStoresToWB(ThreadID tid)
{ return thread[tid].numStoresToWB(); }
int numStoresToWB(ThreadID tid) { return thread.at(tid).numStoresToWB(); }
/** Returns if the LSQ will write back to memory this cycle. */
bool willWB();
/** Returns if the LSQ of a specific thread will write back to memory this
* cycle.
*/
bool willWB(ThreadID tid)
{ return thread[tid].willWB(); }
bool willWB(ThreadID tid) { return thread.at(tid).willWB(); }
/** Debugging function to print out all instructions. */
void dumpInsts() const;
/** Debugging function to print out instructions from a specific thread. */
void dumpInsts(ThreadID tid) const
{ thread[tid].dumpInsts(); }
void dumpInsts(ThreadID tid) const { thread.at(tid).dumpInsts(); }
/** Executes a read operation, using the load specified at the load
* index.
*/
Fault read(const RequestPtr &req,
RequestPtr &sreqLow, RequestPtr &sreqHigh,
int load_idx);
Fault read(LSQRequest* req, int load_idx);
/** Executes a store operation, using the store specified at the store
* index.
*/
Fault write(const RequestPtr &req,
const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
uint8_t *data, int store_idx);
Fault write(LSQRequest* req, uint8_t *data, int store_idx);
/**
* Retry the previous send that failed.
*/
void recvReqRetry();
void completeDataAccess(PacketPtr pkt);
/**
* Handles writing back and completing the load or store that has
* returned from memory.
@@ -292,13 +926,34 @@ class LSQ {
void recvTimingSnoopReq(PacketPtr pkt);
Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res);
/** The CPU pointer. */
O3CPU *cpu;
/** The IEW stage pointer. */
IEW *iewStage;
/** Is D-cache blocked? */
bool cacheBlocked() const;
/** Set D-cache blocked status */
void cacheBlocked(bool v);
/** Is any store port available to use? */
bool storePortAvailable() const;
/** Another store port is in use */
void storePortBusy();
protected:
/** D-cache is blocked */
bool _cacheBlocked;
/** The number of cache ports available each cycle (stores only). */
int cacheStorePorts;
/** The number of used cache ports in this cycle by stores. */
int usedStorePorts;
/** The LSQ policy for SMT mode. */
SMTQueuePolicy lsqPolicy;
@@ -307,8 +962,10 @@ class LSQ {
* and threshold, this function calculates how many resources each thread
* can occupy at most.
*/
static uint32_t maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
uint32_t numThreads, uint32_t SMTThreshold) {
static uint32_t
maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
uint32_t numThreads, uint32_t SMTThreshold)
{
if (pol == SMTQueuePolicy::Dynamic) {
return entries;
} else if (pol == SMTQueuePolicy::Partitioned) {
@@ -346,24 +1003,20 @@ class LSQ {
template <class Impl>
Fault
LSQ<Impl>::read(const RequestPtr &req,
RequestPtr &sreqLow, RequestPtr &sreqHigh,
int load_idx)
LSQ<Impl>::read(LSQRequest* req, int load_idx)
{
ThreadID tid = cpu->contextToThread(req->contextId());
ThreadID tid = cpu->contextToThread(req->request()->contextId());
return thread[tid].read(req, sreqLow, sreqHigh, load_idx);
return thread.at(tid).read(req, load_idx);
}
template <class Impl>
Fault
LSQ<Impl>::write(const RequestPtr &req,
const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
uint8_t *data, int store_idx)
LSQ<Impl>::write(LSQRequest* req, uint8_t *data, int store_idx)
{
ThreadID tid = cpu->contextToThread(req->contextId());
ThreadID tid = cpu->contextToThread(req->request()->contextId());
return thread[tid].write(req, sreqLow, sreqHigh, data, store_idx);
return thread.at(tid).write(req, data, store_idx);
}
#endif // __CPU_O3_LSQ_HH__

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2012, 2014 ARM Limited
* Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -61,6 +61,8 @@ using namespace std;
template <class Impl>
LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
: cpu(cpu_ptr), iewStage(iew_ptr),
_cacheBlocked(false),
cacheStorePorts(params->cacheStorePorts), usedStorePorts(0),
lsqPolicy(params->smtLSQPolicy),
LQEntries(params->LQEntries),
SQEntries(params->SQEntries),
@@ -76,8 +78,8 @@ LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
//************ Handle SMT Parameters ***********/
//**********************************************/
//Figure out fetch policy
if (lsqPolicy == SMTQueuePolicy::Dynamic) {
/* Run SMT olicy checks. */
if (lsqPolicy == SMTQueuePolicy::Dynamic) {
DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
} else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: "
@@ -85,8 +87,8 @@ LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params)
maxLQEntries,maxSQEntries);
} else if (lsqPolicy == SMTQueuePolicy::Threshold) {
assert(params->smtLSQThreshold > LQEntries);
assert(params->smtLSQThreshold > SQEntries);
assert(params->smtLSQThreshold > params->LQEntries);
assert(params->smtLSQThreshold > params->SQEntries);
DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
"%i entries per LQ | %i entries per SQ\n",
@@ -163,79 +165,41 @@ template <class Impl>
void
LSQ<Impl>::takeOverFrom()
{
usedStorePorts = 0;
_cacheBlocked = false;
for (ThreadID tid = 0; tid < numThreads; tid++) {
thread[tid].takeOverFrom();
}
}
template <class Impl>
int
LSQ<Impl>::entryAmount(ThreadID num_threads)
template<class Impl>
bool
LSQ<Impl>::cacheBlocked() const
{
if (lsqPolicy == SMTQueuePolicy::Partitioned) {
return LQEntries / num_threads;
} else {
return 0;
}
}
template <class Impl>
void
LSQ<Impl>::resetEntries()
{
if (lsqPolicy != SMTQueuePolicy::Dynamic || numThreads > 1) {
int active_threads = activeThreads->size();
int maxEntries;
if (lsqPolicy == SMTQueuePolicy::Partitioned) {
maxEntries = LQEntries / active_threads;
} else if (lsqPolicy == SMTQueuePolicy::Threshold &&
active_threads == 1) {
maxEntries = LQEntries;
} else {
maxEntries = LQEntries;
}
list<ThreadID>::iterator threads = activeThreads->begin();
list<ThreadID>::iterator end = activeThreads->end();
while (threads != end) {
ThreadID tid = *threads++;
resizeEntries(maxEntries, tid);
}
}
return _cacheBlocked;
}
template<class Impl>
void
LSQ<Impl>::removeEntries(ThreadID tid)
LSQ<Impl>::cacheBlocked(bool v)
{
thread[tid].clearLQ();
thread[tid].clearSQ();
_cacheBlocked = v;
}
template<class Impl>
bool
LSQ<Impl>::storePortAvailable() const
{
return usedStorePorts < cacheStorePorts;
}
template<class Impl>
void
LSQ<Impl>::resizeEntries(unsigned size, ThreadID tid)
LSQ<Impl>::storePortBusy()
{
thread[tid].resizeLQ(size);
thread[tid].resizeSQ(size);
}
template<class Impl>
void
LSQ<Impl>::tick()
{
list<ThreadID>::iterator threads = activeThreads->begin();
list<ThreadID>::iterator end = activeThreads->end();
while (threads != end) {
ThreadID tid = *threads++;
thread[tid].tick();
}
usedStorePorts++;
assert(usedStorePorts <= cacheStorePorts);
}
template<class Impl>
@@ -316,12 +280,22 @@ void
LSQ<Impl>::recvReqRetry()
{
iewStage->cacheUnblocked();
cacheBlocked(false);
for (ThreadID tid : *activeThreads) {
thread[tid].recvRetry();
}
}
template <class Impl>
void
LSQ<Impl>::completeDataAccess(PacketPtr pkt)
{
auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
thread[cpu->contextToThread(senderState->contextId())]
.completeDataAccess(pkt);
}
template <class Impl>
bool
LSQ<Impl>::recvTimingResp(PacketPtr pkt)
@@ -330,8 +304,10 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt)
DPRINTF(LSQ, "Got error packet back for address: %#X\n",
pkt->getAddr());
thread[cpu->contextToThread(pkt->req->contextId())]
.completeDataAccess(pkt);
auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
panic_if(!senderState, "Got packet back with unknown sender state\n");
thread[cpu->contextToThread(senderState->contextId())].recvTimingResp(pkt);
if (pkt->isInvalidate()) {
// This response also contains an invalidate; e.g. this can be the case
@@ -352,8 +328,9 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt)
thread[tid].checkSnoop(pkt);
}
}
// Update the LSQRequest state (this may delete the request)
senderState->request()->packetReplied();
delete pkt;
return true;
}
@@ -681,4 +658,442 @@ LSQ<Impl>::dumpInsts() const
}
}
/**
 * Byte offset of addr within its containing block.
 * Uses mask arithmetic, so block_size must be a power of two.
 */
static Addr
addrBlockOffset(Addr addr, unsigned int block_size)
{
    const Addr offset_mask = block_size - 1;
    return addr & offset_mask;
}
/**
 * Round addr down to the start of its containing block.
 * Uses mask arithmetic, so block_size must be a power of two.
 */
static Addr
addrBlockAlign(Addr addr, uint64_t block_size)
{
    const uint64_t offset_mask = block_size - 1;
    return addr & ~offset_mask;
}
/**
 * True when a transfer of the given size starting at addr crosses a
 * block boundary and must therefore be split into several requests.
 */
static bool
transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size)
{
    const Addr offset = addrBlockOffset(addr, block_size);
    return offset + size > block_size;
}
/**
 * Entry point for a load/store coming from the execute stage. Creates
 * the appropriate LSQRequest (split when the access crosses a cache
 * line), starts translation, and — once translation has completed —
 * records the effective address on the instruction and forwards the
 * access to the per-thread LSQ unit via cpu->read()/cpu->write().
 * Returns the instruction's current fault state.
 */
template<class Impl>
Fault
LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
                       unsigned int size, Addr addr, Request::Flags flags,
                       uint64_t *res)
{
    ThreadID tid = cpu->contextToThread(inst->contextId());
    auto cacheLineSize = cpu->cacheLineSize();
    bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize);
    LSQRequest* req = nullptr;

    if (inst->translationStarted()) {
        // Re-execution of an instruction that already built its
        // request: reuse the saved one instead of starting over.
        req = inst->savedReq;
        assert(req);
    } else {
        if (needs_burst) {
            req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
                    size, flags, data, res);
        } else {
            req = new SingleDataRequest(&thread[tid], inst, isLoad, addr,
                    size, flags, data, res);
        }
        assert(req);
        inst->setRequest();
        req->taskId(cpu->taskId());
        req->initiateTranslation();
    }

    /* This is the place where instructions get the effAddr. */
    if (req->isTranslationComplete()) {
        if (inst->getFault() == NoFault) {
            inst->effAddr = req->getVaddr();
            inst->effSize = size;
            inst->effAddrValid(true);
            if (cpu->checker) {
                inst->reqToVerify = std::make_shared<Request>(*req->request());
            }
            if (isLoad)
                inst->getFault() = cpu->read(req, inst->lqIdx);
            else
                inst->getFault() = cpu->write(req, data, inst->sqIdx);
        } else if (isLoad) {
            // Commit will have to clean up whatever happened. Set this
            // instruction as executed.
            inst->setExecuted();
        }
    }

    if (inst->traceData)
        inst->traceData->setMem(addr, size, flags);

    return inst->getFault();
}
/**
 * TLB callback for the single (and only) fragment. Records the fault,
 * copies the translated paddr and flags onto the instruction, and moves
 * the request to the Request (ready to access memory) or Fault state.
 */
template<class Impl>
void
LSQ<Impl>::SingleDataRequest::finish(const Fault &fault, const RequestPtr &req,
        ThreadContext* tc, BaseTLB::Mode mode)
{
    _fault.push_back(fault);
    numInTranslationFragments = 0;
    numTranslatedFragments = 1;
    /* If the instruction has been squashed, let the request know
     * as it may have to self-destruct. */
    if (_inst->isSquashed()) {
        this->squashTranslation();
    } else {
        _inst->strictlyOrdered(req->isStrictlyOrdered());
        flags.set(Flag::TranslationFinished);
        if (fault == NoFault) {
            // Translation succeeded: propagate physical address and
            // the (possibly updated) request flags to the instruction.
            _inst->physEffAddr = req->getPaddr();
            _inst->memReqFlags = req->getFlags();
            if (req->isCondSwap()) {
                // Conditional swaps carry their compare value in the
                // result slot; it must have been provided.
                assert(_res);
                req->setExtraData(*_res);
            }
            setState(State::Request);
        } else {
            setState(State::Fault);
        }

        LSQRequest::_inst->fault = fault;
        LSQRequest::_inst->translationCompleted(true);
    }
}
/**
 * TLB callback for one fragment of a split access. Accumulates the
 * fragment's fault and flags into mainReq; once the last fragment has
 * been translated, the whole request transitions to Request state (if
 * every fragment translated cleanly) or Fault state (first fault wins).
 */
template<class Impl>
void
LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
        ThreadContext* tc, BaseTLB::Mode mode)
{
    _fault.push_back(fault);
    // Fragments normally complete in order; a delayed translation is
    // the only case where the finishing request may not be the next one.
    assert(req == _requests[numTranslatedFragments] || this->isDelayed());

    numInTranslationFragments--;
    numTranslatedFragments++;

    // Merge this fragment's flags into the umbrella request.
    mainReq->setFlags(req->getFlags());

    if (numTranslatedFragments == _requests.size()) {
        /* All fragments are back from the TLB. */
        if (_inst->isSquashed()) {
            this->squashTranslation();
        } else {
            _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
            flags.set(Flag::TranslationFinished);
            auto fault_it = _fault.begin();
            /* Ffwd to the first NoFault. */
            while (fault_it != _fault.end() && *fault_it == NoFault)
                fault_it++;
            /* If none of the fragments faulted: */
            if (fault_it == _fault.end()) {
                // Instruction reports the paddr of the first fragment.
                _inst->physEffAddr = request(0)->getPaddr();
                _inst->memReqFlags = mainReq->getFlags();
                if (mainReq->isCondSwap()) {
                    assert(_res);
                    mainReq->setExtraData(*_res);
                }
                setState(State::Request);
                _inst->fault = NoFault;
            } else {
                // Report the oldest (first) faulting fragment.
                setState(State::Fault);
                _inst->fault = *fault_it;
            }
            _inst->translationCompleted(true);
        }
    }
}
/**
 * Kicks off timing translation of the single request. The request is
 * saved on the instruction so a replayed/re-executed instruction picks
 * up the same LSQRequest instead of building a new one.
 */
template<class Impl>
void
LSQ<Impl>::SingleDataRequest::initiateTranslation()
{
    _inst->translationStarted(true);
    setState(State::Translation);
    flags.set(Flag::TranslationStarted);

    _inst->savedReq = this;
    sendFragmentToTranslation(0);
    // Nothing more to do here: finish() handles both the immediate
    // (same-cycle) and the delayed translation completion paths.
}
/** Accessor for the assembled whole-operation packet (valid only after
 *  buildPackets() has run for a load). */
template<class Impl>
PacketPtr
LSQ<Impl>::SplitDataRequest::mainPacket()
{
    return _mainPacket;
}
/** Accessor for the umbrella request that spans all fragments (valid
 *  only after initiateTranslation() has run). */
template<class Impl>
RequestPtr
LSQ<Impl>::SplitDataRequest::mainRequest()
{
    return mainReq;
}
/**
 * Splits the access into cache-line-sized fragments and sends each one
 * to the TLB. Three kinds of fragment are built: a possibly-unaligned
 * prefix up to the next line boundary, zero or more full aligned lines,
 * and a tail covering any remaining bytes. A paddr-less mainReq is also
 * created to accumulate per-fragment flags (see finish()).
 */
template<class Impl>
void
LSQ<Impl>::SplitDataRequest::initiateTranslation()
{
    _inst->translationStarted(true);
    setState(State::Translation);
    flags.set(Flag::TranslationStarted);

    unsigned int cacheLineSize = _port.cacheLineSize();
    Addr base_addr = _addr;
    // First line boundary after the start of the access.
    Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
    // Line boundary at (or before) the end of the access.
    Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
    uint32_t size_so_far = 0;

    mainReq = std::make_shared<Request>(_inst->getASID(), base_addr,
            _size, _flags, _inst->masterId(),
            _inst->instAddr(), _inst->contextId());

    // Paddr is not used in mainReq. However, we will accumulate the flags
    // from the sub requests into mainReq by calling setFlags() in finish().
    // setFlags() assumes that paddr is set so flip the paddr valid bit here to
    // avoid a potential assert in setFlags() when we call it from finish().
    mainReq->setPaddr(0);

    /* Get the pre-fix, possibly unaligned. */
    _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr,
                next_addr - base_addr, _flags, _inst->masterId(),
                _inst->instAddr(), _inst->contextId()));
    size_so_far = next_addr - base_addr;

    /* We are block aligned now, reading whole blocks. */
    base_addr = next_addr;
    while (base_addr != final_addr) {
        _requests.push_back(std::make_shared<Request>(_inst->getASID(),
                    base_addr, cacheLineSize, _flags, _inst->masterId(),
                    _inst->instAddr(), _inst->contextId()));
        size_so_far += cacheLineSize;
        base_addr += cacheLineSize;
    }

    /* Deal with the tail. */
    if (size_so_far < _size) {
        _requests.push_back(std::make_shared<Request>(_inst->getASID(),
                    base_addr, _size - size_so_far, _flags, _inst->masterId(),
                    _inst->instAddr(), _inst->contextId()));
    }

    /* Setup the requests and send them to translation. */
    for (auto& r: _requests) {
        r->setReqInstSeqNum(_inst->seqNum);
        r->taskId(_taskId);
    }
    // Save the request on the instruction so re-execution reuses it.
    this->_inst->savedReq = this;
    numInTranslationFragments = 0;
    numTranslatedFragments = 0;

    for (uint32_t i = 0; i < _requests.size(); i++) {
        sendFragmentToTranslation(i);
    }
}
/**
 * Sends fragment i to the data TLB for timing translation; this object
 * is the callback (finish() is invoked on completion). The TLB mode is
 * derived from whether the operation is a load or a store.
 */
template<class Impl>
void
LSQ<Impl>::LSQRequest::sendFragmentToTranslation(int i)
{
    numInTranslationFragments++;
    _port.dTLB()->translateTiming(
            this->request(i),
            this->_inst->thread->getTC(), this,
            this->isLoad() ? BaseTLB::Read : BaseTLB::Write);
}
/**
 * Cache response for the single outstanding packet: mark the request
 * complete and hand the packet back to the LSQ unit for writeback.
 */
template<class Impl>
bool
LSQ<Impl>::SingleDataRequest::recvTimingResp(PacketPtr pkt)
{
    assert(_numOutstandingPackets == 1);
    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
    setState(State::Complete);
    flags.set(Flag::Complete);
    state->outstanding--;
    // The single packet must be the one we sent.
    assert(pkt == _packets.front());
    _port.completeDataAccess(pkt);
    return true;
}
/**
 * Cache response for one fragment of a split access. Counts received
 * fragments; when the last one arrives, a response packet spanning the
 * whole operation is synthesized over mainReq and handed to the LSQ
 * unit, then deleted (the data buffers are owned elsewhere).
 */
template<class Impl>
bool
LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt)
{
    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
    // Locate which fragment this response belongs to.
    uint32_t pktIdx = 0;
    while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
        pktIdx++;
    assert(pktIdx < _packets.size());
    assert(pkt->req == _requests[pktIdx]);
    assert(pkt == _packets[pktIdx]);
    numReceivedPackets++;
    state->outstanding--;
    if (numReceivedPackets == _packets.size()) {
        setState(State::Complete);
        flags.set(Flag::Complete);
        /* Assemble packets. */
        // Build a whole-operation response; for loads the fragments
        // already wrote into _inst->memData via their static buffers.
        PacketPtr resp = isLoad()
            ? Packet::createRead(mainReq)
            : Packet::createWrite(mainReq);
        if (isLoad())
            resp->dataStatic(_inst->memData);
        else
            resp->dataStatic(_data);
        resp->senderState = _senderState;
        _port.completeDataAccess(resp);
        delete resp;
    }
    return true;
}
/**
 * Builds the single packet for this access, pointing at the
 * instruction's data buffer. Idempotent: a retry after a rejected send
 * reuses the existing packet instead of creating a new one.
 */
template<class Impl>
void
LSQ<Impl>::SingleDataRequest::buildPackets()
{
    assert(_senderState);
    /* Retries do not create new packets. */
    if (_packets.size() == 0) {
        _packets.push_back(
                isLoad()
                ?  Packet::createRead(request())
                :  Packet::createWrite(request()));
        _packets.back()->dataStatic(_inst->memData);
        _packets.back()->senderState = _senderState;
    }
    assert(_packets.size() == 1);
}
/**
 * Builds one packet per fragment plus (for loads) the main packet the
 * pipeline sees. Load fragments point statically into successive
 * offsets of the instruction's data buffer; store fragments get their
 * own heap copies of the relevant slice (freed with the packet).
 * Idempotent: retries reuse the packets built on the first call.
 */
template<class Impl>
void
LSQ<Impl>::SplitDataRequest::buildPackets()
{
    /* Running offset of each fragment within the operation's data. */
    ptrdiff_t offset = 0;
    if (_packets.size() == 0) {
        /* New stuff */
        if (isLoad()) {
            _mainPacket = Packet::createRead(mainReq);
            _mainPacket->dataStatic(_inst->memData);
        }
        for (auto& r: _requests) {
            PacketPtr pkt = isLoad() ? Packet::createRead(r)
                                     : Packet::createWrite(r);
            if (isLoad()) {
                pkt->dataStatic(_inst->memData + offset);
            } else {
                // Stores: give each fragment its own copy so the
                // packet owns (and frees) its slice of the data.
                uint8_t* req_data = new uint8_t[r->getSize()];
                std::memcpy(req_data,
                        _inst->memData + offset,
                        r->getSize());
                pkt->dataDynamic(req_data);
            }
            offset += r->getSize();
            pkt->senderState = _senderState;
            _packets.push_back(pkt);
        }
    }
    assert(_packets.size() == _requests.size());
}
/**
 * Attempts to issue the single packet to the D-cache; on success there
 * is exactly one outstanding packet. If the cache refuses, the LSQ
 * unit's retry mechanism will call this again later.
 */
template<class Impl>
void
LSQ<Impl>::SingleDataRequest::sendPacketToCache()
{
    assert(_numOutstandingPackets == 0);
    if (lsqUnit()->trySendPacket(isLoad(), _packets.at(0)))
        _numOutstandingPackets = 1;
}
/**
 * Issues as many not-yet-sent fragment packets as the cache accepts,
 * in order. Fragments are sent starting after those already received
 * or outstanding, so a later retry resumes where this attempt stopped.
 */
template<class Impl>
void
LSQ<Impl>::SplitDataRequest::sendPacketToCache()
{
    /* Try to send the packets. */
    while (numReceivedPackets + _numOutstandingPackets < _packets.size() &&
            lsqUnit()->trySendPacket(isLoad(),
                _packets.at(numReceivedPackets + _numOutstandingPackets))) {
        _numOutstandingPackets++;
    }
}
/** IPR (internal processor register) write: a single access maps
 *  directly onto the ISA's IPR write handler. */
template<class Impl>
void
LSQ<Impl>::SingleDataRequest::handleIprWrite(ThreadContext *thread,
                                             PacketPtr pkt)
{
    TheISA::handleIprWrite(thread, pkt);
}
/**
 * IPR write for a split access: replays the write as one temporary
 * WriteReq packet per fragment, each aliasing the corresponding slice
 * of the main packet's data.
 */
template<class Impl>
void
LSQ<Impl>::SplitDataRequest::handleIprWrite(ThreadContext *thread,
                                            PacketPtr mainPkt)
{
    unsigned offset = 0;
    // const& avoids copying (and atomically ref-counting) the
    // shared_ptr request on every iteration.
    for (const auto& r : _requests) {
        PacketPtr pkt = new Packet(r, MemCmd::WriteReq);
        pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
        TheISA::handleIprWrite(thread, pkt);
        offset += r->getSize();
        delete pkt;
    }
}
/** IPR (internal processor register) read: a single access maps
 *  directly onto the ISA's IPR read handler; returns its latency. */
template<class Impl>
Cycles
LSQ<Impl>::SingleDataRequest::handleIprRead(ThreadContext *thread,
                                            PacketPtr pkt)
{
    return TheISA::handleIprRead(thread, pkt);
}
/**
 * IPR read for a split access: replays the read as one temporary
 * ReadReq packet per fragment, writing into the corresponding slice of
 * the main packet's data. Returns the largest per-fragment latency
 * (fragments are treated as if handled in parallel).
 */
template<class Impl>
Cycles
LSQ<Impl>::SplitDataRequest::handleIprRead(ThreadContext *thread,
                                           PacketPtr mainPkt)
{
    Cycles delay(0);
    unsigned offset = 0;
    // const& avoids copying (and atomically ref-counting) the
    // shared_ptr request on every iteration.
    for (const auto& r : _requests) {
        PacketPtr pkt = new Packet(r, MemCmd::ReadReq);
        pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
        Cycles d = TheISA::handleIprRead(thread, pkt);
        if (d > delay)
            delay = d;
        offset += r->getSize();
        delete pkt;
    }
    return delay;
}
/** True when the (translated) physical address of the single request
 *  falls in the given cache block; used for snoop checks. */
template<class Impl>
bool
LSQ<Impl>::SingleDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
{
    return ( (LSQRequest::_requests[0]->getPaddr() & blockMask) == blockAddr);
}
/**
 * True when any fragment's (translated) physical address falls in the
 * given cache block; used for snoop checks against split accesses.
 */
template<class Impl>
bool
LSQ<Impl>::SplitDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
{
    for (auto &fragment : _requests) {
        if ((fragment->getPaddr() & blockMask) == blockAddr)
            return true;
    }
    return false;
}
#endif//__CPU_O3_LSQ_IMPL_HH__

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -409,7 +409,7 @@ ElasticTrace::addDepTraceRecord(const DynInstConstPtr& head_inst,
new_record->reqFlags = head_inst->memReqFlags;
new_record->virtAddr = head_inst->effAddr;
new_record->asid = head_inst->asid;
new_record->physAddr = head_inst->physEffAddrLow;
new_record->physAddr = head_inst->physEffAddr;
// Currently the tracing does not support split requests.
new_record->size = head_inst->effSize;
new_record->pc = head_inst->instAddr();