cpu: HTM Implementation for O3CPU

JIRA: https://gem5.atlassian.net/browse/GEM5-587

Change-Id: I83787f4594963a15d856b81ad283b4f032d1c007
Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30328
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Timothy Hayes
2020-09-02 11:28:33 +01:00
committed by Giacomo Travaglini
parent 79df434187
commit 46d7fdf1b6
15 changed files with 684 additions and 32 deletions

View File

@@ -61,6 +61,7 @@
#include "cpu/op_class.hh"
#include "cpu/static_inst.hh"
#include "cpu/translation.hh"
#include "debug/HtmCpu.hh"
#include "mem/packet.hh"
#include "mem/request.hh"
#include "sim/byteswap.hh"
@@ -140,6 +141,7 @@ class BaseDynInst : public ExecContext, public RefCounted
IsStrictlyOrdered,
ReqMade,
MemOpDone,
HtmFromTransaction,
MaxFlags
};
@@ -240,6 +242,11 @@ class BaseDynInst : public ExecContext, public RefCounted
// Need a copy of main request pointer to verify on writes.
RequestPtr reqToVerify;
private:
// hardware transactional memory
uint64_t htmUid;
uint64_t htmDepth;
protected:
/** Flattened register index of the destination registers of this
* instruction.
@@ -548,8 +555,8 @@ class BaseDynInst : public ExecContext, public RefCounted
uint64_t getHtmTransactionUid() const override
{
panic("Not yet implemented\n");
return 0;
assert(instFlags[HtmFromTransaction]);
return this->htmUid;
}
uint64_t newHtmTransactionUid() const override
@@ -560,14 +567,35 @@ class BaseDynInst : public ExecContext, public RefCounted
bool inHtmTransactionalState() const override
{
panic("Not yet implemented\n");
return false;
return instFlags[HtmFromTransaction];
}
uint64_t getHtmTransactionalDepth() const override
{
panic("Not yet implemented\n");
return 0;
if (inHtmTransactionalState())
return this->htmDepth;
else
return 0;
}
void setHtmTransactionalState(uint64_t htm_uid, uint64_t htm_depth)
{
instFlags.set(HtmFromTransaction);
htmUid = htm_uid;
htmDepth = htm_depth;
}
/**
 * Drop any transactional tagging from this instruction. Does nothing
 * when the instruction is not currently marked as transactional.
 */
void clearHtmTransactionalState()
{
    if (!inHtmTransactionalState())
        return;

    // Trace before the flag is reset: the uid is read through
    // getHtmTransactionUid() while the instruction is still tagged.
    DPRINTF(HtmCpu,
        "clearing instuction's transactional state htmUid=%u\n",
        getHtmTransactionUid());

    instFlags.reset(HtmFromTransaction);
    htmDepth = 0;
    htmUid = -1;
}
/** Temporarily sets this instruction as a serialize before instruction. */
@@ -997,8 +1025,9 @@ template<class Impl>
Fault
BaseDynInst<Impl>::initiateHtmCmd(Request::Flags flags)
{
panic("Not yet implemented\n");
return NoFault;
return cpu->pushRequest(
dynamic_cast<typename DynInstPtr::PtrType>(this),
/* ld */ true, nullptr, 8, 0x0ul, flags, nullptr, nullptr);
}
template<class Impl>

View File

@@ -95,6 +95,9 @@ BaseDynInst<Impl>::initVars()
physEffAddr = 0;
readyRegs = 0;
memReqFlags = 0;
// hardware transactional memory
htmUid = -1;
htmDepth = 0;
status.reset();

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010-2012, 2014 ARM Limited
* Copyright (c) 2010-2012, 2014, 2019 ARM Limited
* All rights reserved.
*
* The license below extends only to copyright in the software and shall
@@ -205,6 +205,12 @@ class DefaultCommit
/** Deschedules a thread from scheduling */
void deactivateThread(ThreadID tid);
/** Is the CPU currently processing a HTM transaction? */
bool executingHtmTransaction(ThreadID) const;
/* Reset HTM tracking, e.g. after an abort */
void resetHtmStartsStops(ThreadID);
/** Ticks the commit stage, which tries to commit instructions. */
void tick();
@@ -473,6 +479,11 @@ class DefaultCommit
/** Updates commit stats based on this instruction. */
void updateComInstStats(const DynInstPtr &inst);
// HTM
int htmStarts[Impl::MaxThreads];
int htmStops[Impl::MaxThreads];
/** Stat for the total number of squashed instructions discarded by commit.
*/
Stats::Scalar commitSquashedInsts;

View File

@@ -60,6 +60,7 @@
#include "debug/CommitRate.hh"
#include "debug/Drain.hh"
#include "debug/ExecFaulting.hh"
#include "debug/HtmCpu.hh"
#include "debug/O3PipeView.hh"
#include "params/DerivO3CPU.hh"
#include "sim/faults.hh"
@@ -121,6 +122,8 @@ DefaultCommit<Impl>::DefaultCommit(O3CPU *_cpu, DerivO3CPUParams *params)
committedStores[tid] = false;
checkEmptyROB[tid] = false;
renameMap[tid] = nullptr;
htmStarts[tid] = 0;
htmStops[tid] = 0;
}
interrupt = NoFault;
}
@@ -404,6 +407,14 @@ DefaultCommit<Impl>::drainSanityCheck() const
{
assert(isDrained());
rob->drainSanityCheck();
// hardware transactional memory
// cannot drain partially through a transaction
for (ThreadID tid = 0; tid < numThreads; tid++) {
if (executingHtmTransaction(tid)) {
panic("cannot drain partially through a HTM transaction");
}
}
}
template <class Impl>
@@ -462,6 +473,27 @@ DefaultCommit<Impl>::deactivateThread(ThreadID tid)
}
}
/**
 * Report whether the given thread is currently inside a HTM transaction
 * from commit's point of view, i.e. whether it has recorded more HTM
 * starts than HTM stops. An invalid thread id is never in a transaction.
 */
template <class Impl>
bool
DefaultCommit<Impl>::executingHtmTransaction(ThreadID tid) const
{
    return tid != InvalidThreadID && htmStarts[tid] > htmStops[tid];
}
/**
 * Zero commit's HTM start/stop counters for the given thread.
 * Requests for an invalid thread id are ignored.
 */
template <class Impl>
void
DefaultCommit<Impl>::resetHtmStartsStops(ThreadID tid)
{
    if (tid == InvalidThreadID)
        return;

    htmStops[tid] = 0;
    htmStarts[tid] = 0;
}
template <class Impl>
void
@@ -532,6 +564,14 @@ DefaultCommit<Impl>::generateTrapEvent(ThreadID tid, Fault inst_fault)
Cycles latency = dynamic_pointer_cast<SyscallRetryFault>(inst_fault) ?
cpu->syscallRetryLatency : trapLatency;
// hardware transactional memory
if (inst_fault != nullptr &&
std::dynamic_pointer_cast<GenericHtmFailureFault>(inst_fault)) {
// TODO
// latency = default abort/restore latency
// could also do some kind of exponential back off if desired
}
cpu->schedule(trap, cpu->clockEdge(latency));
trapInFlight[tid] = true;
thread[tid]->trapPending = true;
@@ -991,13 +1031,28 @@ DefaultCommit<Impl>::commitInsts()
// Commit as many instructions as possible until the commit bandwidth
// limit is reached, or it becomes impossible to commit any more.
while (num_committed < commitWidth) {
// Check for any interrupt that we've already squashed for
// and start processing it.
if (interrupt != NoFault)
handleInterrupt();
// hardware transactional memory
// If executing within a transaction,
// need to handle interrupts specially
ThreadID commit_thread = getCommittingThread();
// Check for any interrupt that we've already squashed for
// and start processing it.
if (interrupt != NoFault) {
// If inside a transaction, postpone interrupts
if (executingHtmTransaction(commit_thread)) {
cpu->clearInterrupts(0);
toIEW->commitInfo[0].clearInterrupt = true;
interrupt = NoFault;
avoidQuiesceLiveLock = true;
} else {
handleInterrupt();
}
}
// ThreadID commit_thread = getCommittingThread();
if (commit_thread == -1 || !rob->isHeadReady(commit_thread))
break;
@@ -1044,6 +1099,23 @@ DefaultCommit<Impl>::commitInsts()
statCommittedInstType[tid][head_inst->opClass()]++;
ppCommit->notify(head_inst);
// hardware transactional memory
// update nesting depth
if (head_inst->isHtmStart())
htmStarts[tid]++;
// sanity check
if (head_inst->inHtmTransactionalState()) {
assert(executingHtmTransaction(tid));
} else {
assert(!executingHtmTransaction(tid));
}
// update nesting depth
if (head_inst->isHtmStop())
htmStops[tid]++;
changedROBNumEntries[tid] = true;
// Set the doneSeqNum to the youngest committed instruction.
@@ -1206,6 +1278,23 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
// Check if the instruction caused a fault. If so, trap.
Fault inst_fault = head_inst->getFault();
// hardware transactional memory
// if a fault occurred within a HTM transaction
// ensure that the transaction aborts
if (inst_fault != NoFault && head_inst->inHtmTransactionalState()) {
// There exists a generic HTM fault common to all ISAs
if (!std::dynamic_pointer_cast<GenericHtmFailureFault>(inst_fault)) {
DPRINTF(HtmCpu, "%s - fault (%s) encountered within transaction"
" - converting to GenericHtmFailureFault\n",
head_inst->staticInst->getName(), inst_fault->name());
inst_fault = std::make_shared<GenericHtmFailureFault>(
head_inst->getHtmTransactionUid(),
HtmFailureFaultCause::EXCEPTION);
}
// If this point is reached and the fault inherits from the HTM fault,
// then there is no need to raise a new fault
}
// Stores mark themselves as completed.
if (!head_inst->isStore() && inst_fault == NoFault) {
head_inst->setCompleted();
@@ -1301,6 +1390,11 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
head_inst->renamedDestRegIdx(i));
}
// hardware transactional memory
// the HTM UID is purely for correctness and debugging purposes
if (head_inst->isHtmStart())
iewStage->setLastRetiredHtmUid(tid, head_inst->getHtmTransactionUid());
// Finally clear the head ROB entry.
rob->retireHead(tid);

View File

@@ -623,6 +623,10 @@ template <class Impl>
void
FullO3CPU<Impl>::deactivateThread(ThreadID tid)
{
// hardware transactional memory
// shouldn't deactivate thread in the middle of a transaction
assert(!commit.executingHtmTransaction(tid));
//Remove From Active List, if Active
list<ThreadID>::iterator thread_it =
std::find(activeThreads.begin(), activeThreads.end(), tid);
@@ -1829,10 +1833,38 @@ FullO3CPU<Impl>::exitThreads()
// Tell the memory subsystem that thread `tid` has aborted the transaction
// identified by `htm_uid`, and reset the core-side HTM nesting counters.
// The abort is delivered as a read packet carrying the HTM_ABORT request
// flag and the abort cause.
template <class Impl>
void
FullO3CPU<Impl>::htmSendAbortSignal(ThreadID tid, uint64_t htmUid,
FullO3CPU<Impl>::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid,
HtmFailureFaultCause cause)
{
panic("not yet supported!");
// Dummy address/size: the request only carries the abort command.
const Addr addr = 0x0ul;
const int size = 8;
const Request::Flags flags =
Request::PHYSICAL|Request::STRICT_ORDER|Request::HTM_ABORT;
// O3-specific actions
// Both the LSQ and the commit stage keep their own start/stop counters;
// clear them so neither still believes a transaction is in progress.
this->iew.ldstQueue.resetHtmStartsStops(tid);
this->commit.resetHtmStartsStops(tid);
// notify l1 d-cache (ruby) that core has aborted transaction
RequestPtr req =
std::make_shared<Request>(addr, size, flags, _dataMasterId);
req->taskId(taskId());
req->setContext(this->thread[tid]->contextId());
req->setHtmAbortCause(cause);
assert(req->isHTMAbort());
PacketPtr abort_pkt = Packet::createRead(req);
// NOTE(review): this buffer is attached with dataStatic(), which
// presumably does not hand ownership to the packet; nothing visible here
// frees it, so it looks like 8 bytes leak per abort. Confirm where it is
// released, or consider a packet-owned buffer instead.
// NOTE(review): plain new[] throws on failure rather than returning
// null, so the assert below is not expected to ever fire.
uint8_t *memData = new uint8_t[8];
assert(memData);
abort_pkt->dataStatic(memData);
abort_pkt->setHtmTransactional(htm_uid);
// TODO include correct error handling here
// A rejected request is treated as fatal for now; there is no retry path.
if (!this->iew.ldstQueue.getDataPort().sendTimingReq(abort_pkt)) {
panic("HTM abort signal was not sent to the memory subsystem.");
}
}
// Forward declaration of FullO3CPU.

View File

@@ -61,7 +61,6 @@
#include "cpu/base.hh"
#include "cpu/simple_thread.hh"
#include "cpu/timebuf.hh"
//#include "cpu/o3/thread_context.hh"
#include "params/DerivO3CPU.hh"
#include "sim/process.hh"

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010-2012, 2014 ARM Limited
* Copyright (c) 2010-2012, 2014, 2019 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -233,6 +233,16 @@ class DefaultIEW
/** Check misprediction */
void checkMisprediction(const DynInstPtr &inst);
/**
 * Hardware transactional memory: forward the uid of the most recently
 * committed (architecturally, not transactionally) transaction to the
 * load/store queue. Kept for debugging purposes, to help ensure that
 * the core and the memory subsystem are observing correct ordering
 * constraints.
 *
 * @param tid    Thread the uid belongs to.
 * @param htmUid Uid to record.
 */
void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid)
{
    this->ldstQueue.setLastRetiredHtmUid(tid, htmUid);
}
private:
/** Sends commit proper information for a squash due to a branch
* mispredict.

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010-2013, 2018 ARM Limited
* Copyright (c) 2010-2013, 2018-2019 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved.
*
@@ -1051,6 +1051,20 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid)
break;
}
// hardware transactional memory
// CPU needs to track transactional state in program order.
const int numHtmStarts = ldstQueue.numHtmStarts(tid);
const int numHtmStops = ldstQueue.numHtmStops(tid);
const int htmDepth = numHtmStarts - numHtmStops;
if (htmDepth > 0) {
inst->setHtmTransactionalState(ldstQueue.getLatestHtmUid(tid),
htmDepth);
} else {
inst->clearHtmTransactionalState();
}
// Otherwise issue the instruction just fine.
if (inst->isAtomic()) {
DPRINTF(IEW, "[tid:%i] Issue: Memory instruction "

View File

@@ -687,6 +687,8 @@ class LSQ
{
flags.set(Flag::Complete);
}
virtual std::string name() const { return "LSQRequest"; }
};
class SingleDataRequest : public LSQRequest
@@ -739,6 +741,35 @@ class LSQ
virtual void buildPackets();
virtual Cycles handleLocalAccess(ThreadContext *thread, PacketPtr pkt);
virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
virtual std::string name() const { return "SingleDataRequest"; }
};
// hardware transactional memory
// This class extends SingleDataRequest for the sole purpose
// of encapsulating hardware transactional memory command requests.
// It overrides the translation hooks because HTM commands use a dummy
// address and do not go through address translation.
class HtmCmdRequest : public SingleDataRequest
{
protected:
/* Given that we are inside templates, children need explicit
* declaration of the names in the parent class. */
using Flag = typename LSQRequest::Flag;
using State = typename LSQRequest::State;
using LSQRequest::_addr;
using LSQRequest::_size;
using LSQRequest::_byteEnable;
using LSQRequest::_requests;
using LSQRequest::_inst;
using LSQRequest::_taskId;
using LSQRequest::flags;
using LSQRequest::setState;
public:
// Builds the HTM command request for `inst` on the given LSQ unit,
// using the supplied request flags.
HtmCmdRequest(LSQUnit* port, const DynInstPtr& inst,
const Request::Flags& flags_);
inline virtual ~HtmCmdRequest() {}
// Marks translation as started and finished without performing one.
virtual void initiateTranslation();
// Translation callback; not expected to be invoked for HTM commands
// (the implementation panics).
virtual void finish(const Fault &fault, const RequestPtr &req,
ThreadContext* tc, BaseTLB::Mode mode);
// Name of this request type.
virtual std::string name() const { return "HtmCmdRequest"; }
};
class SplitDataRequest : public LSQRequest
@@ -815,6 +846,7 @@ class LSQ
virtual RequestPtr mainRequest();
virtual PacketPtr mainPacket();
virtual std::string name() const { return "SplitDataRequest"; }
};
/** Constructs an LSQ with the given parameters. */
@@ -933,6 +965,44 @@ class LSQ
/** Returns the total number of stores for a single thread. */
int numStores(ThreadID tid) { return thread.at(tid).numStores(); }
// hardware transactional memory
/**
 * Number of HTM starts tracked by the LSQ unit of the given thread.
 * @return 0 when tid is InvalidThreadID.
 */
int numHtmStarts(ThreadID tid) const
{
    return (tid == InvalidThreadID) ? 0 : thread[tid].numHtmStarts();
}
/**
 * Number of HTM stops tracked by the LSQ unit of the given thread.
 * @return 0 when tid is InvalidThreadID.
 */
int numHtmStops(ThreadID tid) const
{
    return (tid == InvalidThreadID) ? 0 : thread[tid].numHtmStops();
}
/**
 * Reset the HTM start/stop counters of the given thread's LSQ unit.
 * Requests for an invalid thread id are ignored.
 */
void resetHtmStartsStops(ThreadID tid)
{
    if (tid == InvalidThreadID)
        return;

    thread[tid].resetHtmStartsStops();
}
/**
 * Latest HTM uid as reported by the LSQ unit of the given thread.
 * @return 0 when tid is InvalidThreadID.
 */
uint64_t getLatestHtmUid(ThreadID tid) const
{
    if (tid != InvalidThreadID)
        return thread[tid].getLatestHtmUid();

    return 0;
}
/**
 * Pass the last retired HTM uid on to the LSQ unit of the given thread.
 * Requests for an invalid thread id are ignored.
 */
void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid)
{
    if (tid == InvalidThreadID)
        return;

    thread[tid].setLastRetiredHtmUid(htmUid);
}
/** Returns the number of free load entries. */
unsigned numFreeLoadEntries();

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited
* Copyright (c) 2011-2012, 2014, 2017-2019 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -51,6 +51,7 @@
#include "cpu/o3/lsq.hh"
#include "debug/Drain.hh"
#include "debug/Fetch.hh"
#include "debug/HtmCpu.hh"
#include "debug/LSQ.hh"
#include "debug/Writeback.hh"
#include "params/DerivO3CPU.hh"
@@ -706,11 +707,17 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
// lines. For now, such cross-line update is not supported.
assert(!isAtomic || (isAtomic && !needs_burst));
const bool htm_cmd = isLoad && (flags & Request::HTM_CMD);
if (inst->translationStarted()) {
req = inst->savedReq;
assert(req);
} else {
if (needs_burst) {
if (htm_cmd) {
assert(addr == 0x0lu);
assert(size == 8);
req = new HtmCmdRequest(&thread[tid], inst, flags);
} else if (needs_burst) {
req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
size, flags, data, res);
} else {
@@ -1033,6 +1040,23 @@ LSQ<Impl>::SingleDataRequest::buildPackets()
: Packet::createWrite(request()));
_packets.back()->dataStatic(_inst->memData);
_packets.back()->senderState = _senderState;
// hardware transactional memory
// If request originates in a transaction (not necessarily a HtmCmd),
// then the packet should be marked as such.
if (_inst->inHtmTransactionalState()) {
_packets.back()->setHtmTransactional(
_inst->getHtmTransactionUid());
DPRINTF(HtmCpu,
"HTM %s pc=0x%lx - vaddr=0x%lx - paddr=0x%lx - htmUid=%u\n",
isLoad() ? "LD" : "ST",
_inst->instAddr(),
_packets.back()->req->hasVaddr() ?
_packets.back()->req->getVaddr() : 0lu,
_packets.back()->getAddr(),
_inst->getHtmTransactionUid());
}
}
assert(_packets.size() == 1);
}
@@ -1049,6 +1073,21 @@ LSQ<Impl>::SplitDataRequest::buildPackets()
if (isLoad()) {
_mainPacket = Packet::createRead(mainReq);
_mainPacket->dataStatic(_inst->memData);
// hardware transactional memory
// If request originates in a transaction,
// packet should be marked as such
if (_inst->inHtmTransactionalState()) {
_mainPacket->setHtmTransactional(
_inst->getHtmTransactionUid());
DPRINTF(HtmCpu,
"HTM LD.0 pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
_inst->instAddr(),
_mainPacket->req->hasVaddr() ?
_mainPacket->req->getVaddr() : 0lu,
_mainPacket->getAddr(),
_inst->getHtmTransactionUid());
}
}
for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) {
RequestPtr r = _requests[i];
@@ -1066,6 +1105,23 @@ LSQ<Impl>::SplitDataRequest::buildPackets()
}
pkt->senderState = _senderState;
_packets.push_back(pkt);
// hardware transactional memory
// If request originates in a transaction,
// packet should be marked as such
if (_inst->inHtmTransactionalState()) {
_packets.back()->setHtmTransactional(
_inst->getHtmTransactionUid());
DPRINTF(HtmCpu,
"HTM %s.%d pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
isLoad() ? "LD" : "ST",
i+1,
_inst->instAddr(),
_packets.back()->req->hasVaddr() ?
_packets.back()->req->getVaddr() : 0lu,
_packets.back()->getAddr(),
_inst->getHtmTransactionUid());
}
}
}
assert(_packets.size() > 0);
@@ -1192,4 +1248,59 @@ LSQ<Impl>::DcachePort::recvReqRetry()
lsq->recvReqRetry();
}
/**
 * Build a HTM command request. The command is modelled as an 8-byte load
 * from the dummy address 0x0; the single underlying request is created
 * here and the instruction is updated as if translation had already
 * produced a physical address.
 */
template<class Impl>
LSQ<Impl>::HtmCmdRequest::HtmCmdRequest(LSQUnit* port,
                                        const DynInstPtr& inst,
                                        const Request::Flags& flags_) :
    SingleDataRequest(port, inst, true, 0x0lu, 8, flags_,
                      nullptr, nullptr, nullptr)
{
    assert(_requests.size() == 0);

    this->addRequest(_addr, _size, _byteEnable);

    // addRequest is expected to have produced exactly the request we
    // are about to configure; anything else is a logic error.
    if (!(_requests.size() > 0)) {
        panic("unexpected behaviour");
    }

    const auto &htm_req = _requests.back();

    htm_req->setReqInstSeqNum(_inst->seqNum);
    htm_req->taskId(_taskId);
    // No translation takes place: the dummy address is used directly
    // as the physical address.
    htm_req->setPaddr(_addr);
    htm_req->setInstCount(_inst->getCpuPtr()->totalInsts());

    _inst->strictlyOrdered(htm_req->isStrictlyOrdered());
    _inst->fault = NoFault;
    _inst->physEffAddr = htm_req->getPaddr();
    _inst->memReqFlags = htm_req->getFlags();
    _inst->savedReq = this;

    setState(State::Translation);
}
/**
 * "Translate" a HTM command request.
 *
 * Transaction commands are implemented as loads to avoid significant
 * changes to the cpu and memory interfaces. Their virtual and physical
 * address is a dummy value of 0x00, so no address translation really
 * occurs: the request and its instruction are simply marked as having
 * started and completed translation, and the request moves on to the
 * Request state.
 */
template<class Impl>
void
LSQ<Impl>::HtmCmdRequest::initiateTranslation()
{
    // Translation "started" ...
    flags.set(Flag::TranslationStarted);
    _inst->translationStarted(true);

    // ... and immediately "finished".
    flags.set(Flag::TranslationFinished);
    _inst->translationCompleted(true);

    setState(State::Request);
}
// Translation-finished callback for HTM command requests.
// initiateTranslation() for this class marks translation as complete
// itself, so reaching this callback is treated as a logic error.
template<class Impl>
void
LSQ<Impl>::HtmCmdRequest::finish(const Fault &fault, const RequestPtr &req,
ThreadContext* tc, BaseTLB::Mode mode)
{
panic("unexpected behaviour");
}
#endif//__CPU_O3_LSQ_IMPL_HH__

View File

@@ -53,6 +53,7 @@
#include "config/the_isa.hh"
#include "cpu/inst_seq.hh"
#include "cpu/timebuf.hh"
#include "debug/HtmCpu.hh"
#include "debug/LSQUnit.hh"
#include "mem/packet.hh"
#include "mem/port.hh"
@@ -312,6 +313,21 @@ class LSQUnit
/** Returns the number of stores in the SQ. */
int numStores() { return stores; }
// hardware transactional memory
/** @return Number of HTM starts currently tracked by this LSQ unit. */
int numHtmStarts() const
{
    return htmStarts;
}

/** @return Number of HTM stops currently tracked by this LSQ unit. */
int numHtmStops() const
{
    return htmStops;
}

/** Reset both HTM nesting counters to zero. */
void resetHtmStartsStops()
{
    htmStops = 0;
    htmStarts = 0;
}
/**
 * @return The HTM uid currently held by this thread's HTM checkpoint.
 */
uint64_t getLatestHtmUid() const
{
    return cpu->tcBase(lsqID)->getHtmCheckpointPtr()->getHtmUid();
}
void setLastRetiredHtmUid(uint64_t htm_uid)
{
assert(htm_uid >= lastRetiredHtmUid);
lastRetiredHtmUid = htm_uid;
}
/** Returns if either the LQ or SQ is full. */
bool isFull() { return lqFull() || sqFull(); }
@@ -496,6 +512,13 @@ class LSQUnit
/** The number of store instructions in the SQ waiting to writeback. */
int storesToWB;
// hardware transactional memory
// nesting depth
int htmStarts;
int htmStops;
// sanity checks and debugging
uint64_t lastRetiredHtmUid;
/** The index of the first instruction that may be ready to be
* written back, and has not yet been written back.
*/
@@ -665,6 +688,7 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
if (req->mainRequest()->isLocalAccess()) {
assert(!load_inst->memData);
assert(!load_inst->inHtmTransactionalState());
load_inst->memData = new uint8_t[MaxDataBytes];
ThreadContext *thread = cpu->tcBase(lsqID);
@@ -679,6 +703,37 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
return NoFault;
}
// hardware transactional memory
if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit())
{
// don't want to send nested transactionStarts and
// transactionStops outside of core, e.g. to Ruby
if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) {
Cycles delay(0);
PacketPtr data_pkt =
new Packet(req->mainRequest(), MemCmd::ReadReq);
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData =
new uint8_t[req->mainRequest()->getSize()];
// sanity checks expect zero in request's data
memset(load_inst->memData, 0, req->mainRequest()->getSize());
}
data_pkt->dataStatic(load_inst->memData);
if (load_inst->inHtmTransactionalState()) {
data_pkt->setHtmTransactional(
load_inst->getHtmTransactionUid());
}
data_pkt->makeResponse();
WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
cpu->schedule(wb, cpu->clockEdge(delay));
return NoFault;
}
}
// Check the SQ for any previous stores that might lead to forwarding
auto store_it = load_inst->sqIt;
assert (store_it >= storeWBIt);
@@ -771,6 +826,35 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
MemCmd::ReadReq);
data_pkt->dataStatic(load_inst->memData);
// hardware transactional memory
// Store to load forwarding within a transaction
// This should be okay because the store will be sent to
// the memory subsystem and subsequently get added to the
// write set of the transaction. The write set has a stronger
// property than the read set, so the load doesn't necessarily
// have to be there.
assert(!req->mainRequest()->isHTMCmd());
if (load_inst->inHtmTransactionalState()) {
assert (!storeQueue[store_it._idx].completed());
assert (
storeQueue[store_it._idx].instruction()->
inHtmTransactionalState());
assert (
load_inst->getHtmTransactionUid() ==
storeQueue[store_it._idx].instruction()->
getHtmTransactionUid());
data_pkt->setHtmTransactional(
load_inst->getHtmTransactionUid());
DPRINTF(HtmCpu, "HTM LD (ST2LDF) "
"pc=0x%lx - vaddr=0x%lx - "
"paddr=0x%lx - htmUid=%u\n",
load_inst->instAddr(),
data_pkt->req->hasVaddr() ?
data_pkt->req->getVaddr() : 0lu,
data_pkt->getAddr(),
load_inst->getHtmTransactionUid());
}
if (req->isAnyOutstandingRequest()) {
assert(req->_numOutstandingPackets > 0);
// There are memory requests packets in flight already.
@@ -841,6 +925,15 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
load_inst->memData = new uint8_t[req->mainRequest()->getSize()];
}
// hardware transactional memory
if (req->mainRequest()->isHTMCmd()) {
// this is a simple sanity check
// the Ruby cache controller will set
// memData to 0x0ul if successful.
*load_inst->memData = (uint64_t) 0x1ull;
}
// For now, load throughput is constrained by the number of
// load FUs only, and loads do not consume a cache port (only
// stores do).

View File

@@ -51,6 +51,7 @@
#include "cpu/o3/lsq.hh"
#include "cpu/o3/lsq_unit.hh"
#include "debug/Activity.hh"
#include "debug/HtmCpu.hh"
#include "debug/IEW.hh"
#include "debug/LSQUnit.hh"
#include "debug/O3PipeView.hh"
@@ -112,6 +113,59 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
DynInstPtr inst = state->inst;
// hardware transactional memory
// sanity check
if (pkt->isHtmTransactional() && !inst->isSquashed()) {
assert(inst->getHtmTransactionUid() == pkt->getHtmTransactionUid());
}
// if in a HTM transaction, it's possible
// to abort within the cache hierarchy.
// This is signalled back to the processor
// through responses to memory requests.
if (pkt->htmTransactionFailedInCache()) {
// cannot do this for write requests because
// they cannot tolerate faults
const HtmCacheFailure htm_rc =
pkt->getHtmTransactionFailedInCacheRC();
if(pkt->isWrite()) {
DPRINTF(HtmCpu,
"store notification (ignored) of HTM transaction failure "
"in cache - addr=0x%lx - rc=%s - htmUid=%d\n",
pkt->getAddr(), htmFailureToStr(htm_rc),
pkt->getHtmTransactionUid());
} else {
HtmFailureFaultCause fail_reason =
HtmFailureFaultCause::INVALID;
if (htm_rc == HtmCacheFailure::FAIL_SELF) {
fail_reason = HtmFailureFaultCause::SIZE;
} else if (htm_rc == HtmCacheFailure::FAIL_REMOTE) {
fail_reason = HtmFailureFaultCause::MEMORY;
} else if (htm_rc == HtmCacheFailure::FAIL_OTHER) {
// these are likely loads that were issued out of order
// they are faulted here, but it's unlikely that these will
// ever reach the commit head.
fail_reason = HtmFailureFaultCause::OTHER;
} else {
panic("HTM error - unhandled return code from cache (%s)",
htmFailureToStr(htm_rc));
}
inst->fault =
std::make_shared<GenericHtmFailureFault>(
inst->getHtmTransactionUid(),
fail_reason);
DPRINTF(HtmCpu,
"load notification of HTM transaction failure "
"in cache - pc=%s - addr=0x%lx - "
"rc=%u - htmUid=%d\n",
inst->pcState(), pkt->getAddr(),
htmFailureToStr(htm_rc), pkt->getHtmTransactionUid());
}
}
cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
/* Notify the sender state that the access is complete (for ownership
@@ -125,6 +179,13 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
// after receiving the response from the memory
assert(inst->isLoad() || inst->isStoreConditional() ||
inst->isAtomic());
// hardware transactional memory
if (pkt->htmTransactionFailedInCache()) {
state->request()->mainPacket()->setHtmTransactionFailedInCache(
pkt->getHtmTransactionFailedInCacheRC() );
}
writeback(inst, state->request()->mainPacket());
if (inst->isStore() || inst->isAtomic()) {
auto ss = dynamic_cast<SQSenderState*>(state);
@@ -142,7 +203,10 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
template <class Impl>
LSQUnit<Impl>::LSQUnit(uint32_t lqEntries, uint32_t sqEntries)
: lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1),
loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
loads(0), stores(0), storesToWB(0),
htmStarts(0), htmStops(0),
lastRetiredHtmUid(0),
cacheBlockMask(0), stalled(false),
isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false),
pendingRequest(nullptr)
{
@@ -176,6 +240,9 @@ LSQUnit<Impl>::resetState()
{
loads = stores = storesToWB = 0;
// hardware transactional memory
// nesting depth
htmStarts = htmStops = 0;
storeWBIt = storeQueue.begin();
@@ -306,6 +373,45 @@ LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst)
load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx);
++loads;
// hardware transactional memory
// transactional state and nesting depth must be tracked
// in the in-order part of the core.
if (load_inst->isHtmStart()) {
htmStarts++;
DPRINTF(HtmCpu, ">> htmStarts++ (%d) : htmStops (%d)\n",
htmStarts, htmStops);
const int htm_depth = htmStarts - htmStops;
const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr();
auto htm_uid = htm_cpt->getHtmUid();
// for debugging purposes
if (!load_inst->inHtmTransactionalState()) {
htm_uid = htm_cpt->newHtmUid();
DPRINTF(HtmCpu, "generating new htmUid=%u\n", htm_uid);
if (htm_depth != 1) {
DPRINTF(HtmCpu,
"unusual HTM transactional depth (%d)"
" possibly caused by mispeculation - htmUid=%u\n",
htm_depth, htm_uid);
}
}
load_inst->setHtmTransactionalState(htm_uid, htm_depth);
}
if (load_inst->isHtmStop()) {
htmStops++;
DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops++ (%d)\n",
htmStarts, htmStops);
if (htmStops==1 && htmStarts==0) {
DPRINTF(HtmCpu,
"htmStops==1 && htmStarts==0. "
"This generally shouldn't happen "
"(unless due to misspeculation)\n");
}
}
}
template <class Impl>
@@ -831,6 +937,7 @@ LSQUnit<Impl>::writebackStores()
if (req->request()->isLocalAccess()) {
assert(!inst->isStoreConditional());
assert(!inst->inHtmTransactionalState());
ThreadContext *thread = cpu->tcBase(lsqID);
PacketPtr main_pkt = new Packet(req->mainRequest(),
MemCmd::WriteReq);
@@ -876,6 +983,21 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
stallingLoadIdx = 0;
}
// hardware transactional memory
// Squashing instructions can alter the transaction nesting depth
// and must be corrected before fetching resumes.
if (loadQueue.back().instruction()->isHtmStart())
{
htmStarts = (--htmStarts < 0) ? 0 : htmStarts;
DPRINTF(HtmCpu, ">> htmStarts-- (%d) : htmStops (%d)\n",
htmStarts, htmStops);
}
if (loadQueue.back().instruction()->isHtmStop())
{
htmStops = (--htmStops < 0) ? 0 : htmStops;
DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n",
htmStarts, htmStops);
}
// Clear the smart pointer to make sure it is decremented.
loadQueue.back().instruction()->setSquashed();
loadQueue.back().clear();
@@ -886,6 +1008,40 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
++lsqSquashedLoads;
}
// hardware transactional memory
// scan load queue (from oldest to youngest) for most recent valid htmUid
auto scan_it = loadQueue.begin();
uint64_t in_flight_uid = 0;
while (scan_it != loadQueue.end()) {
if (scan_it->instruction()->isHtmStart() &&
!scan_it->instruction()->isSquashed()) {
in_flight_uid = scan_it->instruction()->getHtmTransactionUid();
DPRINTF(HtmCpu, "loadQueue[%d]: found valid HtmStart htmUid=%u\n",
scan_it._idx, in_flight_uid);
}
scan_it++;
}
// If there's a HtmStart in the pipeline then use its htmUid,
// otherwise use the most recently committed uid
const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr();
if (htm_cpt) {
const uint64_t old_local_htm_uid = htm_cpt->getHtmUid();
uint64_t new_local_htm_uid;
if (in_flight_uid > 0)
new_local_htm_uid = in_flight_uid;
else
new_local_htm_uid = lastRetiredHtmUid;
if (old_local_htm_uid != new_local_htm_uid) {
DPRINTF(HtmCpu, "flush: lastRetiredHtmUid=%u\n",
lastRetiredHtmUid);
DPRINTF(HtmCpu, "flush: resetting localHtmUid=%u\n",
new_local_htm_uid);
htm_cpt->setHtmUid(new_local_htm_uid);
}
}
if (memDepViolator && squashed_num < memDepViolator->seqNum) {
memDepViolator = NULL;
}
@@ -965,7 +1121,7 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
// Squashed instructions do not need to complete their access.
if (inst->isSquashed()) {
assert(!inst->isStore());
assert (!inst->isStore() || inst->isStoreConditional());
++lsqIgnoredResponses;
return;
}
@@ -983,8 +1139,27 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
// If we have an outstanding fault, the fault should only be of
// type ReExec or - in case of a SplitRequest - a partial
// translation fault
assert(dynamic_cast<ReExec*>(inst->fault.get()) != nullptr ||
inst->savedReq->isPartialFault());
// Unless it's a hardware transactional memory fault
auto htm_fault = std::dynamic_pointer_cast<
GenericHtmFailureFault>(inst->fault);
if (!htm_fault) {
assert(dynamic_cast<ReExec*>(inst->fault.get()) != nullptr ||
inst->savedReq->isPartialFault());
} else if (!pkt->htmTransactionFailedInCache()) {
// Situation in which the instruction has a hardware transactional
// memory fault but not the packet itself. This can occur with
// ldp_uop microops since access is spread over multiple packets.
DPRINTF(HtmCpu,
"%s writeback with HTM failure fault, "
"however, completing packet is not aware of "
"transaction failure. cause=%s htmUid=%u\n",
inst->staticInst->getName(),
htmFailureToStr(htm_fault->getHtmFailureFaultCause()),
htm_fault->getHtmUid());
}
DPRINTF(LSQUnit, "Not completing instruction [sn:%lli] access "
"due to pending fault.\n", inst->seqNum);

View File

@@ -172,7 +172,9 @@ MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const DynInstPtr &barr_inst)
{
InstSeqNum barr_sn = barr_inst->seqNum;
// Memory barriers block loads and stores, write barriers only stores.
if (barr_inst->isMemBarrier()) {
// Required also for hardware transactional memory commands which
// can have strict ordering semantics
if (barr_inst->isMemBarrier() || barr_inst->isHtmCmd()) {
loadBarrierSNs.insert(barr_sn);
storeBarrierSNs.insert(barr_sn);
DPRINTF(MemDepUnit, "Inserted a memory barrier %s SN:%lli\n",
@@ -182,6 +184,7 @@ MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const DynInstPtr &barr_inst)
DPRINTF(MemDepUnit, "Inserted a write barrier %s SN:%lli\n",
barr_inst->pcState(), barr_sn);
}
if (loadBarrierSNs.size() || storeBarrierSNs.size()) {
DPRINTF(MemDepUnit, "Outstanding load barriers = %d; "
"store barriers = %d\n",
@@ -440,7 +443,8 @@ MemDepUnit<MemDepPred, Impl>::completeInst(const DynInstPtr &inst)
wakeDependents(inst);
completed(inst);
InstSeqNum barr_sn = inst->seqNum;
if (inst->isMemBarrier()) {
if (inst->isMemBarrier() || inst->isHtmCmd()) {
assert(hasLoadBarrier());
assert(hasStoreBarrier());
loadBarrierSNs.erase(barr_sn);
@@ -459,9 +463,10 @@ template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::wakeDependents(const DynInstPtr &inst)
{
// Only stores, atomics and barriers have dependents.
// Only stores, atomics, barriers and
// hardware transactional memory commands have dependents.
if (!inst->isStore() && !inst->isAtomic() && !inst->isMemBarrier() &&
!inst->isWriteBarrier()) {
!inst->isWriteBarrier() && !inst->isHtmCmd()) {
return;
}

View File

@@ -331,21 +331,24 @@ void
O3ThreadContext<Impl>::htmAbortTransaction(uint64_t htmUid,
HtmFailureFaultCause cause)
{
panic("function not implemented\n");
cpu->htmSendAbortSignal(thread->threadId(), htmUid, cause);
conditionalSquash();
}
template <class Impl>
BaseHTMCheckpointPtr&
O3ThreadContext<Impl>::getHtmCheckpointPtr()
{
panic("function not implemented\n");
return thread->htmCheckpoint;
}
template <class Impl>
void
O3ThreadContext<Impl>::setHtmCheckpointPtr(BaseHTMCheckpointPtr new_cpt)
{
panic("function not implemented\n");
assert(!thread->htmCheckpoint->valid());
thread->htmCheckpoint = std::move(new_cpt);
}
#endif //__CPU_O3_THREAD_CONTEXT_IMPL_HH__

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2012 ARM Limited
* Copyright (c) 2012, 2019 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -92,6 +92,9 @@ struct O3ThreadState : public ThreadState {
*/
bool trapPending;
/** Pointer to the hardware transactional memory checkpoint. */
std::unique_ptr<BaseHTMCheckpoint> htmCheckpoint;
O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process)
: ThreadState(_cpu, _thread_num, _process), cpu(_cpu),
comInstEventQueue("instruction-based event queue"),