Files
gem5/util/dist/gem5-dist.sh
Giacomo Travaglini 566cdd81a8 util: Warn line breaking the gem5-dist script
Once the switch node in a dist-gem5 simulation gets started it listens
on the first available port starting from the initially supplied one.

To bind full system nodes to the switch, the switch logfile
is parsed for the exact port number.

This is fragile and it broke when the following line:

info: tcp_iface listening on port

changed to

build/ARM/dev/net/tcp_iface.cc:97: info: tcp_iface listening on port

This patch is fixing the problem with a more robust regex matching

Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Change-Id: I2721b3c04653ac1e09878e80d8b1ea34ec1a0f73
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/62512
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2022-09-01 08:07:02 +00:00

389 lines
13 KiB
Bash
Executable File

#! /bin/bash
#
# Copyright (c) 2015, 2022 Arm Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Copyright (c) 2015 University of Illinois Urbana Champaign
# All rights reserved
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# This is a wrapper script to run dist-gem5 simulations.
# See the usage_func() below for hints on how to use it. Also,
# there are some examples in the util/dist directory (e.g.
# see util/dist/test-2nodes-AArch64.sh)
#
#
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
# environment variable (which is what LSF does by default).
# E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots
# allocated to launch the gem5 processes, 2 of them are on host hname1
# and 4 of them are on host hname2.
# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
# processes on the localhost.
#
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
# boot params. The total number of gem5 processes is also passed in.
# These values can be used in the boot script to configure the MAC/IP
# addresses - among other things (see util/dist/bootscript.rcS).
#
# Each gem5 process will create an m5out.$GEM5_RANK directory for
# the usual output files. Furthermore, there will be a separate log file
# for each ssh session (we use ssh to start gem5 processes) and one for
# the server. These are called log.$GEM5_RANK and log.switch.
#
# Print the command line help to stdout.
# Arguments: none ($0 is shown as the program name).
# The short option names listed in the synopsis are the ones the command
# line parser of this script accepts: -s selects the switch config and
# -f selects the fullsystem config.
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"
    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}
# Process (optional) command line options
#
# The pass-through argument lists start out as a single blank so that they
# can be appended to and expanded unquoted on the gem5 command lines.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Abort with a usage message if an option that needs a value is the last
# word on the command line.
# Arguments: $1 - the option being parsed
#            $2 - number of words still left on the command line
# Without this check 'shift 2' fails without shifting anything and the
# parsing loop never terminates.
require_value_func ()
{
    (($2 >= 2)) || { echo "Missing value for option: $1"; usage_func; exit 1; }
}

# Parse the command line.
# Globals written: GEM5_DEBUG NNODES RUN_DIR CKPT_DIR SW_PORT SW_CONFIG
#                  FS_CONFIG GEM5_EXE CUR_ARGS and the *_ARGS lists above.
# Arguments: the script's command line words.
# Exits with 1 on an unknown word or a missing option value and with -1
# when a pass-through option shows up before any --xx-args selector.
parse_args_func ()
{
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                require_value_func "$1" $#
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                require_value_func "$1" $#
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                require_value_func "$1" $#
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                require_value_func "$1" $#
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                require_value_func "$1" $#
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                # following unknown options are collected into SW_ARGS
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                require_value_func "$1" $#
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                require_value_func "$1" $#
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                # Pass-through option: append it (and its value, if the next
                # word is not an option itself) to the currently selected list.
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
                case "x$2" in
                    x-*|x)
                        # printf -v stores the text literally; unlike eval it
                        # never re-interprets quotes, '$' or backticks.
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                        shift 1
                        ;;
                    *)
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}
parse_args_func "$@"
# Default values to use (in case they are not defined as command line options)
# The default config files are looked up below $M5_PATH.
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/dist/sw.py
DEFAULT_SW_PORT=2200
FS_CONFIG=${FS_CONFIG:-$DEFAULT_FS_CONFIG}
SW_CONFIG=${SW_CONFIG:-$DEFAULT_SW_CONFIG}
SW_PORT=${SW_PORT:-$DEFAULT_SW_PORT}
NNODES=${NNODES:-2}
RUN_DIR=${RUN_DIR:-$(pwd)}
CKPT_DIR=${CKPT_DIR:-$(pwd)}
# Check if all the executables we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# make sure that RUN_DIR exists; the per-process log files are written there,
# so stop right away (instead of failing later with a confusing message) if
# it cannot be created. The path is quoted so that it may contain blanks.
mkdir -p -- "$RUN_DIR" || { echo "Cannot create run directory ${RUN_DIR}"; exit 1; }
# PIDs of the launched ssh sessions, indexed by gem5 rank
declare -a SSH_PIDS
# allocated host names and the number of slots on each of them
declare -a HOSTS
declare -a NCORES
# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    LSB_MCPU_HOSTS NNODES
# Globals written: HOSTS (host names), NCORES (slots per host, same order),
#                  NH (total number of slots), LSB_MCPU_HOSTS (set to the
#                  localhost default when it was empty)
# Exits with -1 if a slot count is not a plain number or if the total number
# of slots differs from the requested number of gem5 instances.
parse_hosts_func ()
{
    local tok
    local cur_host=""
    NH=0
    HOSTS=()
    NCORES=()
    [ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
    # the list alternates between host names and slot counts; cur_host is
    # non-empty while we are waiting for the count that belongs to it
    for tok in $LSB_MCPU_HOSTS
    do
        if [ "x$cur_host" == "x" ]
        then
            cur_host=$tok
            HOSTS+=("$tok")
        else
            # a non-numeric word would silently evaluate to 0 in (( ))
            [[ $tok =~ ^[0-9]+$ ]] || { echo "(E) Invalid slot count '$tok' for host $cur_host"; exit -1; }
            NCORES+=("$tok")
            ((NH+=tok))
            cur_host=""
        fi
    done
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}
parse_hosts_func
# function to clean up and abort if something goes wrong
# Globals read: SW_PID, WATCHDOG_PID (both optional), GEM5_EXE, HOSTS
# Never returns: the calling shell exits with -1.
abort_func ()
{
    local bname
    local h
    echo
    echo "KILLED $(date)"
    # Try to Kill the server first. That should trigger an exit for all connected
    # gem5 processes.
    [ "x$SW_PID" != "x" ] && kill "$SW_PID" 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with the
    # server triggered exit
    # The path is quoted: an unquoted path containing blanks would be split
    # and basename would report the wrong process name.
    bname=$(basename -- "$GEM5_EXE")
    killall -q -s SIGKILL "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q -s SIGKILL "$bname"
    done
    sleep 5
    # kill the watchdog
    [ "x$WATCHDOG_PID" != "x" ] && kill "$WATCHDOG_PID" 2>/dev/null
    exit -1
}
# Watchdog: poll the launched processes and trigger the full clean up
# as soon as one of them is gone.
# Globals read:    SSH_PIDS (PIDs of the ssh sessions), SW_PID (switch)
# Globals written: NDEAD (number of processes found dead in the last poll)
# The loop has no exit of its own; it ends when abort_func terminates the
# shell or when the watchdog itself gets killed.
watchdog_func ()
{
    local pid
    while :
    do
        sleep 30
        NDEAD=0
        # kill -0 only probes whether the process still exists
        for pid in ${SSH_PIDS[@]}
        do
            if ! kill -0 $pid 2>/dev/null
            then
                NDEAD=$((NDEAD + 1))
            fi
        done
        if ! kill -0 $SW_PID
        then
            NDEAD=$((NDEAD + 1))
        fi
        if [ "$NDEAD" -gt 0 ]
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}
# This function launches the gem5 processes. The only purpose is to enable
# launching gem5 processes under gdb control for debugging
# Arguments: $1 - log name suffix (output goes to $RUN_DIR/log.$1)
#            $2 - host to start the process on
#            $3 - blank separated VAR=value list put in front of the command
#            $4... - the command line to run
# Globals read: GEM5_DEBUG, RUN_DIR
# The process is started in the background of the calling shell, so the
# caller can pick up its PID from $! right after this function returns.
start_func ()
{
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    shift 3
    if [ "x$GEM5_DEBUG" != "x" ]
    then
        # debug mode: run the command locally under gdb in its own terminal
        # (neither HOST nor ENV_ARGS is used and no log file is written)
        echo "DEBUG starting terminal..."
        local MY_ARGS="$*"
        xterm -e "gdb --args $MY_ARGS" &
    else
        # ENV_ARGS is left unquoted on purpose: it holds several VAR=value
        # words. The log path is quoted so that RUN_DIR may contain blanks
        # (an unquoted redirect target fails with "ambiguous redirect").
        ssh "$HOST" $ENV_ARGS "$@" &> "$RUN_DIR/log.$N" &
    fi
}
# Block until a launched gem5 process reports that it is up.
# Arguments: $1 - log file to watch
#            $2 - pattern (grep syntax) that has to show up in the log file
#            $3 - name of the process, used in the progress messages
#            $4 - PID to monitor while waiting
# Globals written: FILE, STRING
# Polls every 2 seconds and prints a dot per poll. Exits the shell with -1
# if the monitored PID disappears before the pattern is found.
connected ()
{
    FILE=$1
    STRING=$2
    echo -n "waiting for $3 to start "
    while true
    do
        # give up as soon as the monitored process is gone
        if ! kill -0 $4
        then
            echo "Failed to start $3"
            exit -1
        fi
        if [[ -f "$FILE" ]] && grep -q "$STRING" "$FILE"
        then
            echo -e "\nnode #$3 started" && break
        fi
        sleep 2
        echo -n "."
    done
}
# Launch the switch process and find out which port it listens on.
#
# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM
# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"
# cleanup log files before starting gem5 processes
rm $RUN_DIR/log.switch > /dev/null 2>&1
# make sure that the switch checkpoint directory below CKPT_DIR exists
mkdir -p $CKPT_DIR/m5out.switch > /dev/null 2>&1
# launch switch gem5 on the first allocated host
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
# M5_ARGS, SW_ARGS and CF_ARGS are blank separated argument lists and are
# expanded unquoted on purpose so that each argument becomes its own word.
start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.switch \
$M5_ARGS \
$SW_CONFIG \
$SW_ARGS \
$CF_ARGS \
--checkpoint-dir=$CKPT_DIR/m5out.switch \
--is-switch \
--dist-size=$NNODES \
--dist-server-port=$SW_PORT
# start_func puts the process into the background, so $! is its PID; it has
# to be read before any other background job gets started
SW_PID=$!
# block here till switch process starts
connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID
# actual port that switch is listening on may be different
# from what we specified if the port was busy
# The pattern is not anchored, so it matches no matter what prefix
# (e.g. source file and line number) precedes the message in the log.
PORT_REGEX="tcp_iface listening on port ([0-9]+)"
SW_FILE=$(cat $RUN_DIR/log.switch)
if [[ $SW_FILE =~ $PORT_REGEX ]]; then
# first capture group: the port number reported by the switch
SW_PORT="${BASH_REMATCH[1]}"
else
echo "Unable to find port info from $RUN_DIR/log.switch"
abort_func
fi
# Now launch all the gem5 processes with ssh.
# One process is started per allocated slot: host i gets NCORES[i] of them.
# n is the rank of the next process; the PID of each launched session is
# recorded in SSH_PIDS[rank].
echo "START $(date)"
n=0
i=0
while ((i < ${#HOSTS[@]}))
do
    h=${HOSTS[$i]}
    j=0
    while ((j < ${NCORES[i]}))
    do
        # cleanup log files before starting gem5 processes
        rm $RUN_DIR/log.$n > /dev/null 2>&1
        # make sure that the checkpoint directory of this rank exists
        mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # the *_ARGS lists are expanded unquoted on purpose (one word per
        # argument)
        start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.$n \
            $M5_ARGS \
            $FS_CONFIG \
            $FS_ARGS \
            $CF_ARGS \
            --checkpoint-dir=$CKPT_DIR/m5out.$n \
            --dist \
            --dist-rank=$n \
            --dist-size=$NNODES \
            --dist-server-name=${HOSTS[0]} \
            --dist-server-port=$SW_PORT
        # PID of the background job that start_func has just created
        SSH_PIDS[$n]=$!
        n=$((n + 1))
        j=$((j + 1))
    done
    i=$((i + 1))
done
# Wait here if it is a debug session
[ "x$GEM5_DEBUG" == "x" ] || { echo "DEBUG session"; wait $SW_PID; exit -1; }
# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!
# wait for exit statuses; NFAIL counts the processes that did not exit
# with status 0
NFAIL=0
for p in "${SSH_PIDS[@]}"
do
    wait "$p" || ((NFAIL+=1))
done
wait $SW_PID || ((NFAIL+=1))
# all done, let's terminate the watchdog
kill $WATCHDOG_PID 2>/dev/null
if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    # report the failure to the caller as well; without an explicit exit
    # the script would end with the (successful) status of the echo above
    exit 1
fi