#!/bin/bash
#######################################
# Block until the GekkoFS hosts file lists one line per expected daemon.
# The line count of HOSTSFILE is compared against NODE_NUM (1 if NODE_NUM is
# empty); a missing or unreadable hosts file counts as zero lines.
# The file is checked every 2 seconds after an initial 2 second delay. Once
# more than 600 re-checks have failed, the whole script exits with status 1.
# Globals:
#   HOSTSFILE
#   NODE_NUM
# Arguments:
#   None
# Outputs:
#   Writes an error message to stdout on timeout
#######################################
wait_for_gkfs_daemons() {
    local expected_hosts=${NODE_NUM:-1}
    local attempts=0
    local registered

    # initial grace period before the first look at the hosts file
    sleep 2
    while true; do
        # wc errors (file not there yet) are silenced; empty output counts as 0
        registered=$(wc -l "${HOSTSFILE}" 2> /dev/null | awk '{print $1}')
        if [ "$((registered + 0))" -eq "${expected_hosts}" ]; then
            return 0
        fi
        sleep 2
        attempts=$((attempts + 1))
        if [ "${attempts}" -gt 600 ]; then
            echo "Server failed to start. Exiting ..."
            exit 1
        fi
    done
}
#######################################
# Records a pid in the daemon pid file.
# If the pid file already exists, every entry is checked with `ps -p` first:
# entries whose process is still running are kept, all others are dropped.
# The given pid is then appended as a new line.
# Globals:
#   DAEMON_PID_FILE
#   VERBOSE
# Arguments:
#   pid to write to pid file
# Outputs:
#   Writes status to stdout if VERBOSE is true
#######################################
create_pid_file() {
    local pid_file=${DAEMON_PID_FILE}
    local pid=${1}
    local pid_file_tmp
    local line
    if [[ ${VERBOSE} == true ]]; then
        echo "Creating pid file at ${pid_file} with pid ${pid} ..."
    fi
    # if PID file exists another daemon could run
    if [[ -e ${pid_file} ]]; then
        pid_file_tmp=${pid_file}.swp
        # start from an empty scratch file
        : > "${pid_file_tmp}"
        # `|| [[ -n ${line} ]]` keeps a final entry that has no trailing newline
        while IFS= read -r line || [[ -n ${line} ]]; do
            # blank lines carry no pid; skipping them avoids a ps usage error
            [[ -n ${line} ]] || continue
            if ps -p "${line}" > /dev/null 2>&1; then
                # process with pid still running
                echo "${line}" >> "${pid_file_tmp}"
            fi
        done < "${pid_file}"
        # replace pid file with the filtered list of valid pids
        mv "${pid_file_tmp}" "${pid_file}"
    fi
    echo "${pid}" >> "${pid_file}"
}
#######################################
# Starts GekkoFS daemons.
# Builds the daemon command line (optionally prefixed by numactl and by srun),
# removes a stale hosts file, launches the command in the background and then
# calls wait_for_gkfs_daemons. In foreground mode the function afterwards
# blocks until 'q' is pressed and sends SIGINT to the launched process;
# otherwise the pid of the launched process is passed to create_pid_file.
# Globals:
#   USE_SRUN
#   SRUN_ARGS
#   SLURM_JOB_ID
#   NODE_NUM (set from the Slurm node list if empty and srun is used)
#   DAEMON_BIN
#   HOSTSFILE
#   MOUNTDIR
#   ROOTDIR
#   ARGS
#   CPUS_PER_TASK
#   VERBOSE
#   USE_NUMACTL
#   DAEMON_CPUNODEBIND
#   DAEMON_MEMBIND
#   GKFS_DAEMON_LOG_PATH (exported)
#   GKFS_DAEMON_LOG_LEVEL (exported)
#   RUN_FOREGROUND
# Outputs:
#   Writes status to stdout
#######################################
start_daemon() {
    local node_list
    local srun_cmd
    local daemon_execute
    # setup
    if [[ ${USE_SRUN} == true ]]; then
        node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2)
        if [[ -z ${NODE_NUM} ]]; then
            NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l)
        fi
        # Setting up base srun cmd (trailing space separates it from the daemon cmd)
        srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} "
    fi
    # NOTE(review): without srun only a single local daemon command is launched,
    # yet wait_for_gkfs_daemons still waits for NODE_NUM hosts - confirm that
    # NODE_NUM > 1 without --srun is not a supported combination.
    if [[ ${VERBOSE} == true ]]; then
        echo "### mountdir: ${MOUNTDIR}"
        echo "### rootdir: ${ROOTDIR}"
        echo "### node_num: ${NODE_NUM}"
        echo "### additional daemon args: ${ARGS}"
        echo "### cpus_per_task: ${CPUS_PER_TASK}"
    fi
    if [[ ${VERBOSE} == true ]]; then
        echo "# Cleaning host file ..."
    fi
    # a missing hosts file is fine here, hence the silenced error
    rm "${HOSTSFILE}" 2> /dev/null
    # Setting up base daemon cmd
    local daemon_cmd="${DAEMON_BIN} -r ${ROOTDIR} -m ${MOUNTDIR} -H ${HOSTSFILE} ${ARGS}"
    # Setting up numactl
    if [[ ${USE_NUMACTL} == true ]]; then
        daemon_cmd="numactl --cpunodebind=${DAEMON_CPUNODEBIND} --membind=${DAEMON_MEMBIND} ${daemon_cmd}"
    fi
    # final daemon execute command
    daemon_execute="${srun_cmd}${daemon_cmd}"

    if [[ ${VERBOSE} == true ]]; then
        echo "### Full execute DAEMON command:"
        echo "##### $daemon_execute"
    fi
    # setup environment variables
    export GKFS_DAEMON_LOG_PATH=$GKFS_DAEMON_LOG_PATH
    export GKFS_DAEMON_LOG_LEVEL=$GKFS_DAEMON_LOG_LEVEL

    echo "Starting daemons ..."
    # The command is assembled as one string, so it is expanded unquoted on
    # purpose: word splitting turns it into the program and its arguments.
    # shellcheck disable=SC2086
    ${daemon_execute} &
    local daemon_pid=$!
    wait_for_gkfs_daemons
    echo "Running ..."

    if [[ ${RUN_FOREGROUND} == true ]]; then
        local k
        echo "Press 'q' to exit"
        while : ; do
            # The key is read from file descriptor 1 instead of stdin.
            # NOTE(review): presumably because stdin may be taken by the
            # background job; this only works while stdout is a terminal,
            # otherwise read fails and the prompt repeats - confirm.
            read -r -n 1 k <&1
            if [[ $k = q ]] ; then
                echo
                echo "Shutting down ..."
                if [[ -n ${daemon_pid} ]]; then
                    kill -s SIGINT "${daemon_pid}" &
                    wait "${daemon_pid}"
                fi
                break
            else
                echo "Press 'q' to exit"
            fi
        done
    else
        create_pid_file "${daemon_pid}"
    fi
}
#######################################
# Stops GekkoFS daemons for the configured pid file.
# Every pid listed in the pid file that `ps -p` reports as running is sent
# SIGINT; the function then waits up to one second for that process to exit
# before moving on. The pid file is removed afterwards.
# Globals:
#   DAEMON_PID_FILE
#   VERBOSE
# Outputs:
#   Writes status to stdout
#######################################
stop_daemons() {
    local pid_file=${DAEMON_PID_FILE}
    local line
    if [[ -e ${pid_file} ]]; then
        # `|| [[ -n ${line} ]]` also processes a final entry without trailing
        # newline, so that daemon is not left running when the file is removed
        while IFS= read -r line || [[ -n ${line} ]]; do
            # blank lines carry no pid; skipping them avoids a ps usage error
            [[ -n ${line} ]] || continue
            if ps -p "${line}" > /dev/null 2>&1; then
                if [[ ${VERBOSE} == true ]]; then
                    echo "Stopping daemon with pid ${line}"
                fi
                kill -s SIGINT "${line}" &
                # wait for the pid to disappear, but give up after one second
                if [[ ${VERBOSE} == true ]]; then
                    echo "Waiting for daemons to exit ..."
                fi
                timeout 1 tail --pid="${line}" -f /dev/null
            fi
        done < "${pid_file}"
        rm "${pid_file}"
    else
        echo "No pid file found -> no daemon running. Exiting ..."
    fi
}
#######################################
# Print short usage information: the one-screen synopsis of all options and
# the required command.
# Outputs:
#   Writes help to stdout
#######################################
usage_short() {
    # one argument per output line; printf repeats the format for each
    printf '%s\n' \
        "usage: gkfs [-h/--help] [-r/--rootdir <path>] [-m/--mountdir <path>] [-a/--args <daemon_args>] [-f/--foreground <false>]" \
        "        [--srun <false>] [-n/--numnodes <jobsize>] [--cpuspertask <64>] [--numactl <false>] [-v/--verbose <false>]" \
        "        [-c/--config <path>] {start,stop}"
}
#######################################
# Print detailed usage information: a short description of the script
# followed by one entry per positional and optional argument.
# Outputs:
#   Writes help to stdout
#######################################
help_msg() {
    echo "
    This script simplifies the starting and stopping GekkoFS daemons. If daemons are started on multiple nodes,
    a Slurm environment is required. The script looks for the 'gkfs.conf' file in the same directory where
    additional permanent configurations can be set.

    positional arguments:
            command                 Command to execute: 'start' and 'stop'

    optional arguments:
            -h, --help              Shows this help message and exits
            -r, --rootdir <path>    The rootdir path for GekkoFS daemons.
            -m, --mountdir <path>   The mountdir path for GekkoFS daemons.
            -a, --args <daemon_arguments>
                                    Add various additional daemon arguments, e.g., \"-l ib0 -P ofi+psm2\".
            -f, --foreground        Starts the script in the foreground. Daemons are stopped by pressing 'q'.
            --srun                  Use srun to start daemons on multiple nodes.
            -n, --numnodes <n>      GekkoFS daemons are started on n nodes.
                                    Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable.
            --cpuspertask <#cores>  Set the number of cores the daemons can use. Must use '--srun'.
            --numactl               Use numactl for the daemon. Modify gkfs.conf for further numactl configurations.
            -c, --config            Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory.
            -v, --verbose           Increase verbosity
            "
}
#######################################
# Extract the configuration file path from an argument list.
# Looks for the first '-c' or '--config' and prints the canonicalized form
# (readlink -m) of the argument that follows it. Prints nothing if the option
# is absent or is the last argument.
# Arguments:
#   the argument list to scan
# Outputs:
#   Writes the canonical config path to stdout (no trailing newline)
#######################################
find_config_path() {
    local -a args=("$@")
    local idx
    # iterate over the indices (not the values) so the successor can be addressed
    for idx in "${!args[@]}"; do
        if [[ ${args[idx]} == "-c" || ${args[idx]} == "--config" ]]; then
            if ((idx + 1 < ${#args[@]})); then
                readlink -mn -- "${args[idx + 1]}"
            fi
            return 0
        fi
    done
}
CONFIGPATH=""
argv=("$@")
# get config path first from argument list
CONFIGPATH=$(find_config_path "${argv[@]}")
# environment variables exported for all child processes
export FI_PSM2_DISCONNECT=1 PSM2_MULTI_EP=1
# physical directory containing this script
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"

# fall back to the gkfs.conf next to the script if no config path was given
: "${CONFIGPATH:=${SCRIPTDIR}/gkfs.conf}"
if ! [[ -f ${CONFIGPATH} ]]; then
    echo ">> No config file found at '${CONFIGPATH}'." >&2
    exit 1
fi
# pull in the settings defined in the config file
. "$CONFIGPATH"

# defaults that do not depend on the config file
VERBOSE=false
NODE_NUM=1
RUN_FOREGROUND=false
CPUS_PER_TASK=$(grep -c '^processor' /proc/cpuinfo)
# defaults taken from the config file; command line options may override them
MOUNTDIR=${DAEMON_MOUNTDIR}
ROOTDIR=${DAEMON_ROOTDIR}
HOSTSFILE=${LIBGKFS_HOSTS_FILE}
ARGS=${DAEMON_ARGS}
USE_SRUN=${USE_SRUN}
USE_NUMACTL=${DAEMON_NUMACTL}
#######################################
# Parse command line options into the global settings.
# Arguments that are not recognized as options are collected in POSITIONAL
# in their original order. An option that needs a value but is given as the
# last argument aborts the script instead of silently storing an empty value.
# Globals (written):
#   ROOTDIR, MOUNTDIR, NODE_NUM, ARGS, USE_SRUN, RUN_FOREGROUND,
#   USE_NUMACTL, CPUS_PER_TASK, VERBOSE, POSITIONAL
# Arguments:
#   the script's argument list
# Outputs:
#   Writes an error to stderr and exits with 1 if an option value is missing
#######################################
parse_args() {
    local key
    POSITIONAL=()
    while [[ $# -gt 0 ]]; do
        key="$1"

        # options that consume a value need a second argument to be present
        case ${key} in
        -r | --rootdir | -m | --mountdir | -n | --numnodes | -a | --args | --cpuspertask | -c | --config)
            if [[ $# -lt 2 ]]; then
                echo "ERROR: option '${key}' requires a value." >&2
                exit 1
            fi
            ;;
        esac

        case ${key} in
        -r | --rootdir)
            ROOTDIR=$2
            shift # past argument
            shift # past value
            ;;
        -m | --mountdir)
            MOUNTDIR=$2
            shift # past argument
            shift # past value
            ;;
        -n | --numnodes)
            NODE_NUM=$2
            shift # past argument
            shift # past value
            ;;
        -a | --args)
            # appended to the daemon args that are already configured
            ARGS="${ARGS} $2"
            shift # past argument
            shift # past value
            ;;
        --srun)
            USE_SRUN=true
            shift # past argument
            ;;
        -f | --foreground)
            RUN_FOREGROUND=true
            shift # past argument
            ;;
        --numactl)
            USE_NUMACTL=true
            shift # past argument
            ;;
        --cpuspertask)
            CPUS_PER_TASK=$2
            shift # past argument
            shift # past value
            ;;
        -c | --config)
            # skip. the config path is read before option parsing
            shift # past argument
            shift # past value
            ;;
        -h | --help)
            help_msg
            exit
            ;;
        -v | --verbose)
            VERBOSE=true
            shift # past argument
            ;;
        *) # anything else is kept as a positional argument
            POSITIONAL+=("$1") # save it in an array for later
            shift              # past argument
            ;;
        esac
    done
}
# parse input
parse_args "$@"
set -- "${POSITIONAL[@]}" # restore positional parameters

# positional arguments: exactly the command is expected as first argument
if [[ -z ${1+x} ]]; then
    echo "ERROR: Positional arguments missing."
    usage_short
    exit 1
fi
command="${1}"
# Validate and run in one step. The command must match exactly, so that
# inputs which merely contain 'start' or 'stop' (e.g. 'restart') are rejected
# instead of passing validation and then silently doing nothing.
case ${command} in
start)
    start_daemon
    ;;
stop)
    stop_daemons
    ;;
*)
    echo "ERROR: command ${command} not supported"
    usage_short
    exit 1
    ;;
esac
if [[ ${VERBOSE} == true ]]; then
    echo "Nothing left to do. Exiting :)"
fi