#!/bin/bash
#######################################
# Blocks until the GekkoFS hosts file lists one line per expected daemon.
# The expected count is NODE_NUM, or 1 when NODE_NUM is empty or unset.
# The file is polled every 2 seconds; after more than 600 unsuccessful
# polls the whole script is terminated with exit code 1.
# Globals:
#   HOSTSFILE
#   NODE_NUM
# Arguments:
#   None
# Outputs:
#   Writes error to stdout
#######################################
wait_for_gkfs_daemons() {
    local expected_daemons=${NODE_NUM:-1}
    local attempts=0
    local registered
    # give the daemons a head start before the first look at the file
    sleep 2
    while true; do
        # a missing or unreadable hosts file yields an empty string here
        registered=$(wc -l "${HOSTSFILE}" 2> /dev/null | awk '{print $1}')
        # "+ 0" turns the empty string into 0 for the numeric comparison
        if [ $((registered + 0)) -eq "${expected_daemons}" ]; then
            break
        fi
        sleep 2
        attempts=$((attempts + 1))
        if [ "${attempts}" -gt 600 ]; then
            echo "Server failed to start. Exiting ..."
            exit 1
        fi
    done
}
#######################################
# Creates a pid file for a given pid. If the pid file already exists, every
# pid listed in it is checked with ps: pids of running processes are kept,
# all other entries are dropped. The given pid is then appended as a new line.
# Globals:
#   DAEMON_PID_FILE
#   VERBOSE
# Arguments:
#   pid to write to pid file
# Outputs:
#   Writes status to stdout if VERBOSE is true
#######################################
create_pid_file() {
    local pid_file=${DAEMON_PID_FILE}
    local pid=${1}
    if [[ ${VERBOSE} == true ]]; then
        echo "Creating pid file at ${pid_file} with pid ${pid} ..."
    fi
    # if PID file exists another daemon could run
    if [[ -e ${pid_file} ]]; then
        local pid_file_tmp=${DAEMON_PID_FILE}.swp
        local line
        # create empty tmp file
        : > "${pid_file_tmp}"
        # "|| [[ -n ... ]]" also handles a last line without trailing newline
        while IFS= read -r line || [[ -n ${line} ]]; do
            # a blank line cannot name a process
            [[ -n ${line} ]] || continue
            if ps -p "${line}" > /dev/null 2>&1; then
                # process with pid still running
                echo "${line}" >> "${pid_file_tmp}"
            fi
        done < "${pid_file}"
        # create pid file with only valid pids
        mv "${pid_file_tmp}" "${pid_file}"
    fi
    echo "${pid}" >> "${pid_file}"
}
#######################################
# Starts GekkoFS daemons.
# Builds the daemon command line (optionally prefixed by srun and/or numactl),
# removes a stale hosts file, launches the command in the background and
# waits until all daemons have registered. In foreground mode the function
# then waits for the user to press 'q' and stops the daemons; otherwise the
# pid of the background command is recorded in the pid file.
# Globals:
#   SLURM_JOB_ID
#   NODE_NUM
#   MOUNTDIR
#   ROOTDIR
#   HOSTSFILE
#   ARGS
#   CPUS_PER_TASK
#   VERBOSE
#   USE_SRUN
#   SRUN_ARGS
#   USE_NUMACTL
#   DAEMON_BIN
#   DAEMON_CPUNODEBIND
#   DAEMON_MEMBIND
#   GKFS_DAEMON_LOG_PATH
#   GKFS_DAEMON_LOG_LEVEL
#   RUN_FOREGROUND
# Outputs:
#   Writes status to stdout
#######################################
start_daemon() {
    local node_list
    local srun_cmd
    local daemon_cmd
    local daemon_execute
    local daemon_pid
    local k
    # setup
    if [[ ${USE_SRUN} == true ]]; then
        node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2)
        if [[ -z ${NODE_NUM} ]]; then
            NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l)
        fi
        # Setting up base srun cmd
        srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} "
        echo "### mountdir: ${MOUNTDIR}"
        echo "### rootdir: ${ROOTDIR}"
        echo "### node_num: ${NODE_NUM}"
        echo "### additional daemon args: ${ARGS}"
        echo "### cpus_per_task: ${CPUS_PER_TASK}"
    fi
    if [[ ${VERBOSE} == true ]]; then
        echo "# Cleaning host file ..."
    fi
    # a leftover hosts file would make the daemon count check pass too early
    rm -f "${HOSTSFILE}" 2> /dev/null
    # Setting up base daemon cmd
    daemon_cmd="${DAEMON_BIN} -r ${ROOTDIR} -m ${MOUNTDIR} -H ${HOSTSFILE} ${ARGS}"
    # Setting up numactl
    if [[ ${USE_NUMACTL} == true ]]; then
        daemon_cmd="numactl --cpunodebind=${DAEMON_CPUNODEBIND} --membind=${DAEMON_MEMBIND} ${daemon_cmd}"
    fi
    # final daemon execute command
    daemon_execute="${srun_cmd}${daemon_cmd}"
    if [[ ${VERBOSE} == true ]]; then
        echo "### Full execute DAEMON command:"
        echo "##### $daemon_execute"
    fi
    # setup environment variables
    export GKFS_DAEMON_LOG_PATH=$GKFS_DAEMON_LOG_PATH
    export GKFS_DAEMON_LOG_LEVEL=$GKFS_DAEMON_LOG_LEVEL
    echo "Starting daemons ..."
    # intentionally unquoted: the command string is split into words here
    # shellcheck disable=SC2086
    ${daemon_execute} &
    daemon_pid=$!
    wait_for_gkfs_daemons
    echo "Running ..."
    if [[ ${RUN_FOREGROUND} == true ]]; then
        echo "Press 'q' to exit"
        while : ; do
            # NOTE(review): the key is read from file descriptor 1, which
            # presumably relies on stdout being the terminal - confirm
            read -r -n 1 k <&1
            if [[ $k = q ]] ; then
                echo
                echo "Shutting down ..."
                if [[ -n ${daemon_pid} ]]; then
                    kill -s SIGINT "${daemon_pid}" &
                    wait "${daemon_pid}"
                fi
                break
            else
                echo "Press 'q' to exit"
            fi
        done
    else
        create_pid_file "${daemon_pid}"
    fi
}
#######################################
# Stops GekkoFS daemons for the configured pid file.
# Every pid in the file that still belongs to a running process is sent
# SIGINT, then polled for up to one second. The pid file is removed
# afterwards. Without a pid file only a notice is printed.
# Globals:
#   DAEMON_PID_FILE
#   VERBOSE
# Outputs:
#   Writes status to stdout
#######################################
stop_daemons() {
    local pid_file=${DAEMON_PID_FILE}
    local line
    if [[ -e ${pid_file} ]]; then
        while IFS= read -r line; do
            # only signal and wait for pids that are still alive
            if ps -p "${line}" > /dev/null 2>&1; then
                if [[ ${VERBOSE} == true ]]; then
                    echo "Stopping daemon with pid ${line}"
                fi
                kill -s SIGINT "${line}" &
                # poll pid until it stopped
                if [[ ${VERBOSE} == true ]]; then
                    echo "Waiting for daemons to exit ..."
                fi
                # tail --pid returns once the process is gone; timeout caps
                # the wait at one second
                timeout 1 tail --pid="${line}" -f /dev/null
            fi
        done < "${pid_file}"
        rm "${pid_file}"
    else
        echo "No pid file found -> no daemon running. Exiting ..."
    fi
}
#######################################
# Prints the one-paragraph usage synopsis, framed by an empty line
# before and after it.
# Outputs:
#   Writes help to stdout
#######################################
usage_short() {
    # one argument per output line; the empty arguments produce the blank
    # lines that frame the synopsis
    printf '%s\n' \
        '' \
        'usage: gkfs [-h/--help] [-r/--rootdir <path>] [-m/--mountdir <path>] [-a/--args <daemon_args>] [-f/--foreground <false>]' \
        '[--srun <false>] [-n/--numnodes <jobsize>] [--cpuspertask <64>] [--numactl <false>] [-v/--verbose <false>]' \
        '{start,stop}' \
        ''
}
#######################################
# Print detailed usage information: the short usage synopsis followed by a
# description of the script and of every positional and optional argument.
# Outputs:
#   Writes help to stdout
#######################################
help_msg() {
    usage_short
    echo "
This script simplifies the starting and stopping GekkoFS daemons. If daemons are started on multiple nodes,
a Slurm environment is required. The script looks for the 'gkfs.conf' file in the same directory where
additional permanent configurations can be set.
positional arguments:
command Command to execute: 'start' and 'stop'
optional arguments:
-h, --help Shows this help message and exits
-r, --rootdir <path> The rootdir path for GekkoFS daemons.
-m, --mountdir <path> The mountdir path for GekkoFS daemons.
-a, --args <daemon_arguments>
Add various additional daemon arguments, e.g., \"-l ib0 -P ofi+psm2\".
-f, --foreground Starts the script in the foreground. Daemons are stopped by pressing 'q'.
--srun Use srun to start daemons on multiple nodes.
-n, --numnodes <n> GekkoFS daemons are started on n nodes.
Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable.
--cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'.
--numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations.
-c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory.
-v, --verbose Enables verbose output.
"
}
# Path of the configuration file; stays empty unless -c/--config is given
CONFIGPATH=""
argv=("$@")
# get config path first from argument list, because the config file has to
# be sourced before the remaining options are parsed
# iterate over the array indices so the option's value (next element) can
# be addressed
for i in "${!argv[@]}"; do
    if [[ "${argv[i]}" == "-c" || "${argv[i]}" == "--config" ]]; then
        # only resolve the path if a value actually follows the option
        if ((i + 1 < ${#argv[@]})); then
            CONFIGPATH=$(readlink -mn "${argv[i + 1]}")
        fi
        break
    fi
done
# global variables
export FI_PSM2_DISCONNECT=1
export PSM2_MULTI_EP=1
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
# get default path if config wasn't given
if [[ -z ${CONFIGPATH} ]]; then
    CONFIGPATH="${SCRIPTDIR}/gkfs.conf"
fi
if [[ ! -f ${CONFIGPATH} ]]; then
    >&2 echo ">> No config file found at '${CONFIGPATH}'."
    exit 1
fi
# get variables from CONFIGPATH
# shellcheck disable=SC1090
source "$CONFIGPATH"
# more global variables which may be overwritten by user input
VERBOSE=false
NODE_NUM=1
MOUNTDIR=${DAEMON_MOUNTDIR}
ROOTDIR=${DAEMON_ROOTDIR}
HOSTSFILE=${LIBGKFS_HOSTS_FILE}
CPUS_PER_TASK=$(grep -c ^processor /proc/cpuinfo)
ARGS=${DAEMON_ARGS}
# keeps whatever value USE_SRUN has after sourcing the config
USE_SRUN=${USE_SRUN}
RUN_FOREGROUND=false
USE_NUMACTL=${DAEMON_NUMACTL}
# parse input
POSITIONAL=()
while [[ $# -gt 0 ]]; do
    key="$1"
    case ${key} in
    -r | --rootdir)
        ROOTDIR=$2
        shift # past argument
        shift # past value
        ;;
    -m | --mountdir)
        MOUNTDIR=$2
        shift # past argument
        shift # past value
        ;;
    -n | --numnodes)
        NODE_NUM=$2
        shift # past argument
        shift # past value
        ;;
    -a | --args)
        # append to the daemon arguments taken from the config file
        ARGS="${ARGS} $2"
        shift # past argument
        shift # past value
        ;;
    --srun)
        USE_SRUN=true
        shift # past argument
        ;;
    -f | --foreground)
        RUN_FOREGROUND=true
        shift # past argument
        ;;
    --numactl)
        USE_NUMACTL=true
        shift # past argument
        ;;
    --cpuspertask)
        CPUS_PER_TASK=$2
        shift # past argument
        shift # past value
        ;;
    -c | --config)
        # skip. was handled above
        shift # past argument
        shift # past value
        ;;
    -h | --help)
        help_msg
        exit
        ;;
    -v | --verbose)
        VERBOSE=true
        shift # past argument
        ;;
    *) # unknown option
        POSITIONAL+=("$1") # save it in an array for later
        shift              # past argument
        ;;
    esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
# positional arguments
if [[ -z ${1+x} ]]; then
    echo "ERROR: Positional arguments missing."
    usage_short
    exit 1
fi
command="${1}"
# exact match: a command that merely contains "start" or "stop" (for example
# "restart") is rejected instead of being silently ignored
if [[ ${command} != "start" && ${command} != "stop" ]]; then
    echo "ERROR: command ${command} not supported"
    usage_short
    exit 1
fi
if [[ ${command} == "start" ]]; then
    start_daemon
elif [[ ${command} == "stop" ]]; then
    stop_daemons
fi
if [[ ${VERBOSE} == true ]]; then
    echo "Nothing left to do. Exiting :)"
fi