gem5/src/dev/net/dist_iface.cc

/*
 * Copyright (c) 2015 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder.  You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabor Dozsa
 */

/* @file
 * The interface class for dist-gem5 simulations.
 */

#include "dev/net/dist_iface.hh"

#include <queue>
#include <thread>

#include "base/random.hh"
#include "base/trace.hh"
#include "cpu/thread_context.hh"
#include "debug/DistEthernet.hh"
#include "debug/DistEthernetPkt.hh"
#include "dev/net/etherpkt.hh"
#include "sim/sim_exit.hh"
#include "sim/sim_object.hh"
#include "sim/system.hh"

using namespace std;
DistIface::Sync *DistIface::sync = nullptr;
System *DistIface::sys = nullptr;
DistIface::SyncEvent *DistIface::syncEvent = nullptr;
unsigned DistIface::distIfaceNum = 0;
unsigned DistIface::recvThreadsNum = 0;
DistIface *DistIface::master = nullptr;
bool DistIface::isSwitch = false;

void
DistIface::Sync::init(Tick start_tick, Tick repeat_tick)
{
    if (start_tick < nextAt) {
        nextAt = start_tick;
        inform("Next dist synchronisation tick is changed to %lu.\n", nextAt);
    }

    if (repeat_tick == 0)
        panic("Dist synchronisation interval must be greater than zero");

    if (repeat_tick < nextRepeat) {
        nextRepeat = repeat_tick;
        inform("Dist synchronisation interval is changed to %lu.\n",
               nextRepeat);
    }
}

DistIface::SyncSwitch::SyncSwitch(int num_nodes)
{
    numNodes = num_nodes;
    waitNum = num_nodes;
    numExitReq = 0;
    numCkptReq = 0;
    numStopSyncReq = 0;
    doExit = false;
    doCkpt = false;
    doStopSync = false;
    nextAt = std::numeric_limits<Tick>::max();
    nextRepeat = std::numeric_limits<Tick>::max();
}

DistIface::SyncNode::SyncNode()
{
    waitNum = 0;
    needExit = ReqType::none;
    needCkpt = ReqType::none;
    needStopSync = ReqType::none;
    doExit = false;
    doCkpt = false;
    doStopSync = false;
    nextAt = std::numeric_limits<Tick>::max();
    nextRepeat = std::numeric_limits<Tick>::max();
}

void
DistIface::SyncNode::run(bool same_tick)
{
    std::unique_lock<std::mutex> sync_lock(lock);
    Header header;

    assert(waitNum == 0);
    waitNum = DistIface::recvThreadsNum;
    // initiate the global synchronisation
    header.msgType = MsgType::cmdSyncReq;
    header.sendTick = curTick();
    header.syncRepeat = nextRepeat;
    header.needCkpt = needCkpt;
    header.needStopSync = needStopSync;
    if (needCkpt != ReqType::none)
        needCkpt = ReqType::pending;
    header.needExit = needExit;
    if (needExit != ReqType::none)
        needExit = ReqType::pending;
    if (needStopSync != ReqType::none)
        needStopSync = ReqType::pending;
    DistIface::master->sendCmd(header);
    // now wait until all receiver threads complete the synchronisation
    auto lf = [this]{ return waitNum == 0; };
    cv.wait(sync_lock, lf);
    // global synchronisation is done
    assert(!same_tick || (nextAt == curTick()));
}


void
DistIface::SyncSwitch::run(bool same_tick)
{
    std::unique_lock<std::mutex> sync_lock(lock);
    Header header;
    // Wait for the sync requests from the nodes
    if (waitNum > 0) {
        auto lf = [this]{ return waitNum == 0; };
        cv.wait(sync_lock, lf);
    }
    assert(waitNum == 0);
    assert(!same_tick || (nextAt == curTick()));
    waitNum = numNodes;
    // Complete the global synchronisation
    header.msgType = MsgType::cmdSyncAck;
    header.sendTick = nextAt;
    header.syncRepeat = nextRepeat;
    if (doCkpt || numCkptReq == numNodes) {
        doCkpt = true;
        header.needCkpt = ReqType::immediate;
        numCkptReq = 0;
    } else {
        header.needCkpt = ReqType::none;
    }
    if (doExit || numExitReq == numNodes) {
        doExit = true;
        header.needExit = ReqType::immediate;
    } else {
        header.needExit = ReqType::none;
    }
    if (doStopSync || numStopSyncReq == numNodes) {
        doStopSync = true;
        numStopSyncReq = 0;
        header.needStopSync = ReqType::immediate;
    } else {
        header.needStopSync = ReqType::none;
    }
    DistIface::master->sendCmd(header);
}

void
DistIface::SyncSwitch::progress(Tick send_tick,
                                 Tick sync_repeat,
                                 ReqType need_ckpt,
                                 ReqType need_exit,
                                 ReqType need_stop_sync)
{
    std::unique_lock<std::mutex> sync_lock(lock);
    assert(waitNum > 0);

    if (send_tick > nextAt)
        nextAt = send_tick;
    if (nextRepeat > sync_repeat)
        nextRepeat = sync_repeat;

    if (need_ckpt == ReqType::collective)
        numCkptReq++;
    else if (need_ckpt == ReqType::immediate)
        doCkpt = true;
    if (need_exit == ReqType::collective)
        numExitReq++;
    else if (need_exit == ReqType::immediate)
        doExit = true;
    if (need_stop_sync == ReqType::collective)
        numStopSyncReq++;
    else if (need_stop_sync == ReqType::immediate)
        doStopSync = true;

    waitNum--;
    // Notify the simulation thread if the on-going sync is complete
    if (waitNum == 0) {
        sync_lock.unlock();
        cv.notify_one();
    }
}

void
DistIface::SyncNode::progress(Tick max_send_tick,
                               Tick next_repeat,
                               ReqType do_ckpt,
                               ReqType do_exit,
                               ReqType do_stop_sync)
{
    std::unique_lock<std::mutex> sync_lock(lock);
    assert(waitNum > 0);

    nextAt = max_send_tick;
    nextRepeat = next_repeat;
    doCkpt = (do_ckpt != ReqType::none);
    doExit = (do_exit != ReqType::none);
    doStopSync = (do_stop_sync != ReqType::none);

    waitNum--;
    // Notify the simulation thread if the on-going sync is complete
    if (waitNum == 0) {
        sync_lock.unlock();
        cv.notify_one();
    }
}

void
DistIface::SyncNode::requestCkpt(ReqType req)
{
   std::lock_guard<std::mutex> sync_lock(lock);
   assert(req != ReqType::none);
   if (needCkpt != ReqType::none)
       warn("Ckpt requested multiple times (req:%d)\n", static_cast<int>(req));
   if (needCkpt == ReqType::none || req == ReqType::immediate)
       needCkpt = req;
}

void
DistIface::SyncNode::requestExit(ReqType req)
{
   std::lock_guard<std::mutex> sync_lock(lock);
   assert(req != ReqType::none);
   if (needExit != ReqType::none)
       warn("Exit requested multiple times (req:%d)\n", static_cast<int>(req));
   if (needExit == ReqType::none || req == ReqType::immediate)
       needExit = req;
}

void
DistIface::Sync::drainComplete()
{
    if (doCkpt) {
        // The first DistIface object called this right before writing the
        // checkpoint. We need to drain the underlying physical network here.
        // Note that other gem5 peers may enter this barrier at different
        // ticks due to draining.
        run(false);
        // Only the "first" DistIface object has to perform the sync
        doCkpt = false;
    }
}

void
DistIface::SyncNode::serialize(CheckpointOut &cp) const
{
    int need_exit = static_cast<int>(needExit);
    SERIALIZE_SCALAR(need_exit);
}

void
DistIface::SyncNode::unserialize(CheckpointIn &cp)
{
    int need_exit;
    UNSERIALIZE_SCALAR(need_exit);
    needExit = static_cast<ReqType>(need_exit);
}

void
DistIface::SyncSwitch::serialize(CheckpointOut &cp) const
{
    SERIALIZE_SCALAR(numExitReq);
}

void
DistIface::SyncSwitch::unserialize(CheckpointIn &cp)
{
    UNSERIALIZE_SCALAR(numExitReq);
}

void
DistIface::SyncEvent::start()
{
    // Note that this may be called either from startup() or drainResume()

    // At this point, all DistIface objects has already called Sync::init() so
    // we have a local minimum of the start tick and repeat for the periodic
    // sync.
    repeat = DistIface::sync->nextRepeat;
    // Do a global barrier to agree on a common repeat value (the smallest
    // one from all participating nodes.
    DistIface::sync->run(false);

    assert(!DistIface::sync->doCkpt);
    assert(!DistIface::sync->doExit);
    assert(!DistIface::sync->doStopSync);
    assert(DistIface::sync->nextAt >= curTick());
    assert(DistIface::sync->nextRepeat <= repeat);

    if (curTick() == 0)
        assert(!scheduled());

    // Use the maximum of the current tick for all participating nodes or a
    // user provided starting tick.
    if (scheduled())
        reschedule(DistIface::sync->nextAt);
    else
        schedule(DistIface::sync->nextAt);

    inform("Dist sync scheduled at %lu and repeats %lu\n",  when(),
           DistIface::sync->nextRepeat);
}

void
DistIface::SyncEvent::process()
{
    // We may not start a global periodic sync while draining before taking a
    // checkpoint.  This is due to the possibility that peer gem5 processes
    // may not hit the same periodic sync before they complete draining and
    // that would make this periodic sync clash with sync called from
    // DistIface::serialize() by other gem5 processes.
    // We would need a 'distributed drain' solution to eliminate this
    // restriction.
    // Note that if draining was not triggered by checkpointing then we are
    // fine since no extra global sync will happen (i.e. all peer gem5 will
    // hit this periodic sync eventually).
    panic_if(_draining && DistIface::sync->doCkpt,
             "Distributed sync is hit while draining");
    /*
     * Note that this is a global event so this process method will be called
     * by only exactly one thread.
     */
    /*
     * We hold the eventq lock at this point but the receiver thread may
     * need the lock to schedule new recv events while waiting for the
     * dist sync to complete.
     * Note that the other simulation threads also release their eventq
     * locks while waiting for us due to the global event semantics.
     */
    {
        EventQueue::ScopedRelease sr(curEventQueue());
        // we do a global sync here that is supposed to happen at the same
        // tick in all gem5 peers
        DistIface::sync->run(true);
        // global sync completed
    }
    if (DistIface::sync->doCkpt)
        exitSimLoop("checkpoint");
    if (DistIface::sync->doExit)
        exitSimLoop("exit request from gem5 peers");
    if (DistIface::sync->doStopSync) {
        DistIface::sync->doStopSync = false;
        inform("synchronization disabled at %lu\n", curTick());

        // The switch node needs to wait for the next sync immediately.
        if (DistIface::isSwitch) {
            start();
        } else {
            // Wake up thread contexts on non-switch nodes.
            for (int i = 0; i < DistIface::master->sys->numContexts(); i++) {
                ThreadContext *tc =
                    DistIface::master->sys->getThreadContext(i);
                if (tc->status() == ThreadContext::Suspended)
                    tc->activate();
                else
                    warn_once("Tried to wake up thread in dist-gem5, but it "
                              "was already awake!\n");
            }
        }
        return;
    }
    // schedule the next periodic sync
    repeat = DistIface::sync->nextRepeat;
    schedule(curTick() + repeat);
}

void
DistIface::RecvScheduler::init(Event *recv_done, Tick link_delay)
{
    // This is called from the receiver thread when it starts running. The new
    // receiver thread shares the event queue with the simulation thread
    // (associated with the simulated Ethernet link).
    curEventQueue(eventManager->eventQueue());

    recvDone = recv_done;
    linkDelay = link_delay;
}

Tick
DistIface::RecvScheduler::calcReceiveTick(Tick send_tick,
                                          Tick send_delay,
                                          Tick prev_recv_tick)
{
    Tick recv_tick = send_tick + send_delay + linkDelay;
    // sanity check (we need atleast a send delay long window)
    assert(recv_tick >= prev_recv_tick + send_delay);
    panic_if(prev_recv_tick + send_delay > recv_tick,
             "Receive window is smaller than send delay");
    panic_if(recv_tick <= curTick(),
             "Simulators out of sync - missed packet receive by %llu ticks"
             "(rev_recv_tick: %lu send_tick: %lu send_delay: %lu "
             "linkDelay: %lu )",
             curTick() - recv_tick, prev_recv_tick, send_tick, send_delay,
             linkDelay);

    return recv_tick;
}

void
DistIface::RecvScheduler::resumeRecvTicks()
{
    // Schedule pending packets asap in case link speed/delay changed when
    // restoring from the checkpoint.
    // This may be done during unserialize except that curTick() is unknown
    // so we call this during drainResume().
    // If we are not restoring from a checkppint then link latency could not
    // change so we just return.
    if (!ckptRestore)
        return;

    std::vector<Desc> v;
    while (!descQueue.empty()) {
        Desc d = descQueue.front();
        descQueue.pop();
        d.sendTick = curTick();
        d.sendDelay = d.packet->simLength; // assume 1 tick/byte max link speed
        v.push_back(d);
    }

    for (auto &d : v)
        descQueue.push(d);

    if (recvDone->scheduled()) {
        assert(!descQueue.empty());
        eventManager->reschedule(recvDone, curTick());
    } else {
        assert(descQueue.empty() && v.empty());
    }
    ckptRestore = false;
}

void
DistIface::RecvScheduler::pushPacket(EthPacketPtr new_packet,
                                     Tick send_tick,
                                     Tick send_delay)
{
    // Note : this is called from the receiver thread
    curEventQueue()->lock();
    Tick recv_tick = calcReceiveTick(send_tick, send_delay, prevRecvTick);

    DPRINTF(DistEthernetPkt, "DistIface::recvScheduler::pushPacket "
            "send_tick:%llu send_delay:%llu link_delay:%llu recv_tick:%llu\n",
            send_tick, send_delay, linkDelay, recv_tick);
    // Every packet must be sent and arrive in the same quantum
    assert(send_tick > master->syncEvent->when() -
           master->syncEvent->repeat);
    // No packet may be scheduled for receive in the arrival quantum
    assert(send_tick + send_delay + linkDelay > master->syncEvent->when());

    // Now we are about to schedule a recvDone event for the new data packet.
    // We use the same recvDone object for all incoming data packets. Packet
    // descriptors are saved in the ordered queue. The currently scheduled
    // packet is always on the top of the queue.
    // NOTE:  we use the event queue lock to protect the receive desc queue,
    // too, which is accessed both by the receiver thread and the simulation
    // thread.
    descQueue.emplace(new_packet, send_tick, send_delay);
    if (descQueue.size() == 1) {
        assert(!recvDone->scheduled());
        eventManager->schedule(recvDone, recv_tick);
    } else {
        assert(recvDone->scheduled());
        panic_if(descQueue.front().sendTick + descQueue.front().sendDelay > recv_tick,
                 "Out of order packet received (recv_tick: %lu top(): %lu\n",
                 recv_tick, descQueue.front().sendTick + descQueue.front().sendDelay);
    }
    curEventQueue()->unlock();
}

EthPacketPtr
DistIface::RecvScheduler::popPacket()
{
    // Note : this is called from the simulation thread when a receive done
    // event is being processed for the link. We assume that the thread holds
    // the event queue queue lock when this is called!
    EthPacketPtr next_packet = descQueue.front().packet;
    descQueue.pop();

    if (descQueue.size() > 0) {
        Tick recv_tick = calcReceiveTick(descQueue.front().sendTick,
                                         descQueue.front().sendDelay,
                                         curTick());
        eventManager->schedule(recvDone, recv_tick);
    }
    prevRecvTick = curTick();
    return next_packet;
}

void
DistIface::RecvScheduler::Desc::serialize(CheckpointOut &cp) const
{
        SERIALIZE_SCALAR(sendTick);
        SERIALIZE_SCALAR(sendDelay);
        packet->serialize("rxPacket", cp);
}

void
DistIface::RecvScheduler::Desc::unserialize(CheckpointIn &cp)
{
        UNSERIALIZE_SCALAR(sendTick);
        UNSERIALIZE_SCALAR(sendDelay);
        packet = std::make_shared<EthPacketData>();
        packet->unserialize("rxPacket", cp);
}

void
DistIface::RecvScheduler::serialize(CheckpointOut &cp) const
{
    SERIALIZE_SCALAR(prevRecvTick);
    // serialize the receive desc queue
    std::queue<Desc> tmp_queue(descQueue);
    unsigned n_desc_queue = descQueue.size();
    assert(tmp_queue.size() == descQueue.size());
    SERIALIZE_SCALAR(n_desc_queue);
    for (int i = 0; i < n_desc_queue; i++) {
        tmp_queue.front().serializeSection(cp, csprintf("rxDesc_%d", i));
        tmp_queue.pop();
    }
    assert(tmp_queue.empty());
}

void
DistIface::RecvScheduler::unserialize(CheckpointIn &cp)
{
    assert(descQueue.size() == 0);
    assert(!recvDone->scheduled());
    assert(!ckptRestore);

    UNSERIALIZE_SCALAR(prevRecvTick);
    // unserialize the receive desc queue
    unsigned n_desc_queue;
    UNSERIALIZE_SCALAR(n_desc_queue);
    for (int i = 0; i < n_desc_queue; i++) {
        Desc recv_desc;
        recv_desc.unserializeSection(cp, csprintf("rxDesc_%d", i));
        descQueue.push(recv_desc);
    }
    ckptRestore = true;
}

DistIface::DistIface(unsigned dist_rank,
                     unsigned dist_size,
                     Tick sync_start,
                     Tick sync_repeat,
                     EventManager *em,
                     bool use_pseudo_op,
                     bool is_switch, int num_nodes) :
    syncStart(sync_start), syncRepeat(sync_repeat),
    recvThread(nullptr), recvScheduler(em), syncStartOnPseudoOp(use_pseudo_op),
    rank(dist_rank), size(dist_size)
{
    DPRINTF(DistEthernet, "DistIface() ctor rank:%d\n",dist_rank);
    isMaster = false;
    if (master == nullptr) {
        assert(sync == nullptr);
        assert(syncEvent == nullptr);
        isSwitch = is_switch;
        if (is_switch)
            sync = new SyncSwitch(num_nodes);
        else
            sync = new SyncNode();
        syncEvent = new SyncEvent();
        master = this;
        isMaster = true;
    }
    distIfaceId = distIfaceNum;
    distIfaceNum++;
}

DistIface::~DistIface()
{
    assert(recvThread);
    delete recvThread;
    if (this == master) {
        assert(syncEvent);
        delete syncEvent;
        assert(sync);
        delete sync;
        master = nullptr;
    }
}

void
DistIface::packetOut(EthPacketPtr pkt, Tick send_delay)
{
    Header header;

    // Prepare a dist header packet for the Ethernet packet we want to
    // send out.
    header.msgType = MsgType::dataDescriptor;
    header.sendTick  = curTick();
    header.sendDelay = send_delay;

    header.dataPacketLength = pkt->length;
    header.simLength = pkt->simLength;

    // Send out the packet and the meta info.
    sendPacket(header, pkt);

    DPRINTF(DistEthernetPkt,
            "DistIface::sendDataPacket() done size:%d send_delay:%llu\n",
            pkt->length, send_delay);
}

void
DistIface::recvThreadFunc(Event *recv_done, Tick link_delay)
{
    EthPacketPtr new_packet;
    DistHeaderPkt::Header header;

    // Initialize receive scheduler parameters
    recvScheduler.init(recv_done, link_delay);

    // Main loop to wait for and process any incoming message.
    for (;;) {
        // recvHeader() blocks until the next dist header packet comes in.
        if (!recvHeader(header)) {
            // We lost connection to the peer gem5 processes most likely
            // because one of them called m5 exit. So we stop here.
            // Grab the eventq lock to stop the simulation thread
            curEventQueue()->lock();
            exitSimLoop("Message server closed connection, simulator "
                        "is exiting");
            curEventQueue()->unlock();
            break;
        }

        // We got a valid dist header packet, let's process it
        if (header.msgType == MsgType::dataDescriptor) {
            recvPacket(header, new_packet);
            recvScheduler.pushPacket(new_packet,
                                     header.sendTick,
                                     header.sendDelay);
        } else {
            // everything else must be synchronisation related command
            sync->progress(header.sendTick,
                           header.syncRepeat,
                           header.needCkpt,
                           header.needExit,
                           header.needStopSync);
        }
    }
}

void
DistIface::spawnRecvThread(const Event *recv_done, Tick link_delay)
{
    assert(recvThread == nullptr);

    recvThread = new std::thread(&DistIface::recvThreadFunc,
                                 this,
                                 const_cast<Event *>(recv_done),
                                 link_delay);
    recvThreadsNum++;
}

DrainState
DistIface::drain()
{
    DPRINTF(DistEthernet,"DistIFace::drain() called\n");
    // This can be called multiple times in the same drain cycle.
    if (this == master)
        syncEvent->draining(true);
    return DrainState::Drained;
}

void
DistIface::drainResume() {
    DPRINTF(DistEthernet,"DistIFace::drainResume() called\n");
    if (this == master)
        syncEvent->draining(false);
    recvScheduler.resumeRecvTicks();
}

void
DistIface::serialize(CheckpointOut &cp) const
{
    // Drain the dist interface before the checkpoint is taken. We cannot call
    // this as part of the normal drain cycle because this dist sync has to be
    // called exactly once after the system is fully drained.
    sync->drainComplete();

    unsigned rank_orig = rank, dist_iface_id_orig = distIfaceId;

    SERIALIZE_SCALAR(rank_orig);
    SERIALIZE_SCALAR(dist_iface_id_orig);

    recvScheduler.serializeSection(cp, "recvScheduler");
    if (this == master) {
        sync->serializeSection(cp, "Sync");
    }
}

void
DistIface::unserialize(CheckpointIn &cp)
{
    unsigned rank_orig, dist_iface_id_orig;
    UNSERIALIZE_SCALAR(rank_orig);
    UNSERIALIZE_SCALAR(dist_iface_id_orig);

    panic_if(rank != rank_orig, "Rank mismatch at resume (rank=%d, orig=%d)",
             rank, rank_orig);
    panic_if(distIfaceId != dist_iface_id_orig, "Dist iface ID mismatch "
             "at resume (distIfaceId=%d, orig=%d)", distIfaceId,
             dist_iface_id_orig);

    recvScheduler.unserializeSection(cp, "recvScheduler");
    if (this == master) {
        sync->unserializeSection(cp, "Sync");
    }
}

void
DistIface::init(const Event *done_event, Tick link_delay)
{
    // Init hook for the underlaying message transport to setup/finalize
    // communication channels
    initTransport();

    // Spawn a new receiver thread that will process messages
    // coming in from peer gem5 processes.
    // The receive thread will also schedule a (receive) doneEvent
    // for each incoming data packet.
    spawnRecvThread(done_event, link_delay);


    // Adjust the periodic sync start and interval. Different DistIface
    // might have different requirements. The singleton sync object
    // will select the minimum values for both params.
    assert(sync != nullptr);
    sync->init(syncStart, syncRepeat);

    // Initialize the seed for random generator to avoid the same sequence
    // in all gem5 peer processes
    assert(master != nullptr);
    if (this == master)
        random_mt.init(5489 * (rank+1) + 257);
}

void
DistIface::startup()
{
    DPRINTF(DistEthernet, "DistIface::startup() started\n");
    // Schedule synchronization unless we are not a switch in pseudo_op mode.
    if (this == master && (!syncStartOnPseudoOp || isSwitch))
        syncEvent->start();
    DPRINTF(DistEthernet, "DistIface::startup() done\n");
}

bool
DistIface::readyToCkpt(Tick delay, Tick period)
{
    bool ret = true;
    DPRINTF(DistEthernet, "DistIface::readyToCkpt() called, delay:%lu "
            "period:%lu\n", delay, period);
    if (master) {
        if (delay == 0) {
            inform("m5 checkpoint called with zero delay => triggering collaborative "
                   "checkpoint\n");
            sync->requestCkpt(ReqType::collective);
        } else {
            inform("m5 checkpoint called with non-zero delay => triggering immediate "
                   "checkpoint (at the next sync)\n");
            sync->requestCkpt(ReqType::immediate);
        }
        if (period != 0)
            inform("Non-zero period for m5_ckpt is ignored in "
                   "distributed gem5 runs\n");
        ret = false;
    }
    return ret;
}

void
DistIface::SyncNode::requestStopSync(ReqType req)
{
   std::lock_guard<std::mutex> sync_lock(lock);
   needStopSync = req;
}

void
DistIface::toggleSync(ThreadContext *tc)
{
    // Unforunate that we have to populate the system pointer member this way.
    master->sys = tc->getSystemPtr();

    // The invariant for both syncing and "unsyncing" is that all threads will
    // stop executing intructions until the desired sync state has been reached
    // for all nodes.  This is the easiest way to prevent deadlock (in the case
    // of "unsyncing") and causality errors (in the case of syncing).
    if (master->syncEvent->scheduled()) {
        inform("Request toggling syncronization off\n");
        master->sync->requestStopSync(ReqType::collective);

        // At this point, we have no clue when everyone will reach the sync
        // stop point.  Suspend execution of all local thread contexts.
        // Dist-gem5 will reactivate all thread contexts when everyone has
        // reached the sync stop point.
        for (int i = 0; i < master->sys->numContexts(); i++) {
            ThreadContext *tc = master->sys->getThreadContext(i);
            if (tc->status() == ThreadContext::Active)
                tc->quiesce();
        }
    } else {
        inform("Request toggling syncronization on\n");
        master->syncEvent->start();

        // We need to suspend all CPUs until the sync point is reached by all
        // nodes to prevent causality errors.  We can also schedule CPU
        // activation here, since we know exactly when the next sync will
        // occur.
        for (int i = 0; i < master->sys->numContexts(); i++) {
            ThreadContext *tc = master->sys->getThreadContext(i);
            if (tc->status() == ThreadContext::Active)
                tc->quiesceTick(master->syncEvent->when() + 1);
        }
    }
}

bool
DistIface::readyToExit(Tick delay)
{
    bool ret = true;
    DPRINTF(DistEthernet, "DistIface::readyToExit() called, delay:%lu\n",
            delay);
    if (master) {
        // To successfully coordinate an exit, all nodes must be synchronising
        if (!master->syncEvent->scheduled())
            master->syncEvent->start();

        if (delay == 0) {
            inform("m5 exit called with zero delay => triggering collaborative "
                   "exit\n");
            sync->requestExit(ReqType::collective);
        } else {
            inform("m5 exit called with non-zero delay => triggering immediate "
                   "exit (at the next sync)\n");
            sync->requestExit(ReqType::immediate);
        }
        ret = false;
    }
    return ret;
}

uint64_t
DistIface::rankParam()
{
    uint64_t val;
    if (master) {
        val = master->rank;
    } else {
        warn("Dist-rank parameter is queried in single gem5 simulation.");
        val = 0;
    }
    return val;
}

uint64_t
DistIface::sizeParam()
{
    uint64_t val;
    if (master) {
        val = master->size;
    } else {
        warn("Dist-size parameter is queried in single gem5 simulation.");
        val = 1;
    }
    return val;
}