From 8c685469f1678987bc1cb9d1fda326a6d7b321e3 Mon Sep 17 00:00:00 2001 From: Andreas Sandberg Date: Wed, 15 Sep 2021 13:18:21 +0100 Subject: [PATCH] sim: Fix fork for multithreaded simulations It is currently not possible to call m5.fork when the simulator is running with multiple parallel event queues. The POSIX standard has very weak guarantees when forking a process with multiple threads. In order to use fork correctly, we need to ensure that all helper threads servicing event queues have terminated before the fork system call is invoked. There are two ways this could be implemented: 1) Always terminate helper threads when taking a global simulator exit event, or 2) terminate helper threads just before fork is called from Python. This change implements the second strategy since the KVM-based CPUs currently assume that TIDs don't change unless there is a fork event. Change-Id: I22feaecd49f7f81689b43185d63a8f14428bed63 Signed-off-by: Andreas Sandberg Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/50408 Reviewed-by: Austin Harris Tested-by: kokoro Maintainer: Bobby R. Bruce --- src/python/m5/simulate.py | 5 +- src/python/pybind11/event.cc | 3 +- src/sim/simulate.cc | 229 ++++++++++++++++++++++++----------- src/sim/simulate.hh | 21 ++++ 4 files changed, 185 insertions(+), 73 deletions(-) diff --git a/src/python/m5/simulate.py b/src/python/m5/simulate.py index 66e6a08d07..b5b8c78782 100644 --- a/src/python/m5/simulate.py +++ b/src/python/m5/simulate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012,2019 ARM Limited +# Copyright (c) 2012, 2019, 2021 Arm Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -348,6 +348,9 @@ def fork(simout="%(parent)s.f%(fork_seq)i"): drain() + # Terminate helper threads that service parallel event queues. 
+ _m5.event.terminateEventQueueThreads() + try: pid = os.fork() except OSError as e: diff --git a/src/python/pybind11/event.cc b/src/python/pybind11/event.cc index aefe50a221..7a02221611 100644 --- a/src/python/pybind11/event.cc +++ b/src/python/pybind11/event.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited + * Copyright (c) 2017, 2021 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -107,6 +107,7 @@ pybind_init_event(py::module_ &m_native) m.def("simulate", &simulate, py::arg("ticks") = MaxTick); + m.def("terminateEventQueueThreads", &terminateEventQueueThreads); m.def("exitSimLoop", &exitSimLoop); m.def("getEventQueue", []() { return curEventQueue(); }, py::return_value_policy::reference); diff --git a/src/sim/simulate.cc b/src/sim/simulate.cc index 4a008696b0..ec46cbfcda 100644 --- a/src/sim/simulate.cc +++ b/src/sim/simulate.cc @@ -1,4 +1,16 @@ /* + * Copyright (c) 2021 Arm Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * * Copyright (c) 2006 The Regents of The University of Michigan * Copyright (c) 2013 Advanced Micro Devices, Inc. * Copyright (c) 2013 Mark D. Hill and David A. Wood @@ -30,6 +42,7 @@ #include "sim/simulate.hh" +#include #include #include @@ -45,35 +58,125 @@ namespace gem5 { -//! Mutex for handling async events. -std::mutex asyncEventMutex; - -//! Global barrier for synchronizing threads entering/exiting the -//! 
simulation loop. -Barrier *threadBarrier; - //! forward declaration Event *doSimLoop(EventQueue *); -/** - * The main function for all subordinate threads (i.e., all threads - * other than the main thread). These threads start by waiting on - * threadBarrier. Once all threads have arrived at threadBarrier, - * they enter the simulation loop concurrently. When they exit the - * loop, they return to waiting on threadBarrier. This process is - * repeated until the simulation terminates. - */ -static void -thread_loop(EventQueue *queue) -{ - while (true) { - threadBarrier->wait(); - doSimLoop(queue); - } -} - GlobalSimLoopExitEvent *simulate_limit_event = nullptr; +class SimulatorThreads +{ + public: + SimulatorThreads() = delete; + SimulatorThreads(const SimulatorThreads &) = delete; + SimulatorThreads &operator=(SimulatorThreads &) = delete; + + SimulatorThreads(uint32_t num_queues) + : terminate(false), + numQueues(num_queues), + barrier(num_queues) + { + threads.reserve(num_queues); + } + + ~SimulatorThreads() + { + // This should only happen after exit has been + // called. Subordinate event queues should normally (assuming + // exit is called from Python) be waiting on the barrier when + // this happens. + // + // N.B.: Not terminating here would make it impossible to + // safely destroy the barrier. + terminateThreads(); + } + + void runUntilLocalExit() + { + assert(!terminate); + + // Start subordinate threads if needed. + if (threads.empty()) { + // the main thread (the one running Python) handles queue 0, + // so we only need to allocate new threads for queues 1..N-1. + // We'll call these the "subordinate" threads. + for (uint32_t i = 1; i < numQueues; i++) { + threads.emplace_back( + [this](EventQueue *eq) { + thread_main(eq); + }, mainEventQueue[i]); + } + } + + // This method is called from the main thread. All subordinate + // threads should be waiting on the barrier when the function + // is called. 
The arrival of the main thread here will satisfy + // the barrier and start another iteration in the thread loop. + barrier.wait(); + } + + void + terminateThreads() + { + assert(!terminate); + + /* This function should only be called when the simulator is + * handling a global exit event (typically from Python). This + * means that the helper threads will be waiting on the + * barrier. Tell the helper threads to exit and release them from + * their barrier. */ + terminate = true; + barrier.wait(); + + /* Wait for all of the threads to terminate */ + for (auto &t : threads) { + t.join(); + } + + terminate = false; + threads.clear(); + } + + protected: + /** + * The main function for all subordinate threads (i.e., all threads + * other than the main thread). These threads start by waiting on + * threadBarrier. Once all threads have arrived at threadBarrier, + * they enter the simulation loop concurrently. When they exit the + * loop, they return to waiting on threadBarrier. This process is + * repeated until the simulation terminates. + */ + void + thread_main(EventQueue *queue) + { + /* Wait for all initialisation to complete */ + barrier.wait(); + + while (!terminate) { + doSimLoop(queue); + barrier.wait(); + } + } + + std::atomic terminate; + uint32_t numQueues; + std::vector threads; + Barrier barrier; +}; + +static std::unique_ptr simulatorThreads; + +struct DescheduleDeleter +{ + void operator()(BaseGlobalEvent *event) + { + if (!event) + return; + + event->deschedule(); + delete event; + } +}; + /** Simulate for num_cycles additional cycles. If num_cycles is -1 * (the default), do not limit simulation; some other event must * terminate the loop. Exported to Python. @@ -82,75 +185,57 @@ GlobalSimLoopExitEvent *simulate_limit_event = nullptr; GlobalSimLoopExitEvent * simulate(Tick num_cycles) { - // The first time simulate() is called from the Python code, we need to - // create a thread for each of event queues referenced by the - // instantiated sim objects. 
- static bool threads_initialized = false; - static std::vector threads; - - if (!threads_initialized) { - threadBarrier = new Barrier(numMainEventQueues); - - // the main thread (the one we're currently running on) - // handles queue 0, so we only need to allocate new threads - // for queues 1..N-1. We'll call these the "subordinate" threads. - for (uint32_t i = 1; i < numMainEventQueues; i++) { - threads.push_back(new std::thread(thread_loop, mainEventQueue[i])); - } - - threads_initialized = true; - simulate_limit_event = - new GlobalSimLoopExitEvent(mainEventQueue[0]->getCurTick(), - "simulate() limit reached", 0); - } + std::unique_ptr quantum_event; + const Tick exit_tick = num_cycles < MaxTick - curTick() ? + curTick() + num_cycles : MaxTick; inform("Entering event queue @ %d. Starting simulation...\n", curTick()); - if (num_cycles < MaxTick - curTick()) - num_cycles = curTick() + num_cycles; - else // counter would roll over or be set to MaxTick anyhow - num_cycles = MaxTick; + if (!simulatorThreads) + simulatorThreads.reset(new SimulatorThreads(numMainEventQueues)); - simulate_limit_event->reschedule(num_cycles); + if (!simulate_limit_event) { + simulate_limit_event = new GlobalSimLoopExitEvent( + mainEventQueue[0]->getCurTick(), + "simulate() limit reached", 0); + } + simulate_limit_event->reschedule(exit_tick); - GlobalSyncEvent *quantum_event = NULL; if (numMainEventQueues > 1) { - if (simQuantum == 0) { - fatal("Quantum for multi-eventq simulation not specified"); - } + fatal_if(simQuantum == 0, + "Quantum for multi-eventq simulation not specified"); - quantum_event = new GlobalSyncEvent(curTick() + simQuantum, simQuantum, - EventBase::Progress_Event_Pri, 0); + quantum_event.reset( + new GlobalSyncEvent(curTick() + simQuantum, simQuantum, + EventBase::Progress_Event_Pri, 0)); inParallelMode = true; } - // all subordinate (created) threads should be waiting on the - // barrier; the arrival of the main thread here will satisfy the - // barrier, and all 
threads will enter doSimLoop in parallel - threadBarrier->wait(); + simulatorThreads->runUntilLocalExit(); Event *local_event = doSimLoop(mainEventQueue[0]); - assert(local_event != NULL); + assert(local_event); inParallelMode = false; // locate the global exit event and return it to Python BaseGlobalEvent *global_event = local_event->globalEvent(); - assert(global_event != NULL); + assert(global_event); GlobalSimLoopExitEvent *global_exit_event = dynamic_cast(global_event); - assert(global_exit_event != NULL); - - //! Delete the simulation quantum event. - if (quantum_event != NULL) { - quantum_event->deschedule(); - delete quantum_event; - } + assert(global_exit_event); return global_exit_event; } +void +terminateEventQueueThreads() +{ + simulatorThreads->terminateThreads(); +} + + /** * Test and clear the global async_event flag, such that each time the * flag is cleared, only one thread returns true (and thus is assigned @@ -159,15 +244,17 @@ simulate(Tick num_cycles) static bool testAndClearAsyncEvent() { + static std::mutex mutex; + bool was_set = false; - asyncEventMutex.lock(); + mutex.lock(); if (async_event) { was_set = true; async_event = false; } - asyncEventMutex.unlock(); + mutex.unlock(); return was_set; } diff --git a/src/sim/simulate.hh b/src/sim/simulate.hh index 0817bbde1a..5ef499541f 100644 --- a/src/sim/simulate.hh +++ b/src/sim/simulate.hh @@ -1,4 +1,16 @@ /* + * Copyright (c) 2021 Arm Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. 
You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * @@ -34,6 +46,15 @@ namespace gem5 class GlobalSimLoopExitEvent; GlobalSimLoopExitEvent *simulate(Tick num_cycles = MaxTick); + +/** + * Terminate helper threads when running in parallel mode. + * + * @pre Simulator must have returned from simulate() to service a + * GlobalExitEvent prior to calling this function. + */ +void terminateEventQueueThreads(); + extern GlobalSimLoopExitEvent *simulate_limit_event; } // namespace gem5