From b4564080c57706247b91bb8e36214a808b96a8ca Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Mon, 25 Apr 2022 19:50:31 +0200 Subject: [PATCH 1/6] Changing GKFS_LOG_LEVEL to GKFS_DAEMON_LOG_LEVEL --- README.md | 10 ++++------ docs/sphinx/users/running.md | 2 +- include/config.hpp | 3 ++- include/daemon/env.hpp | 2 +- src/daemon/daemon.cpp | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5aceb1016..60bfd2292 100644 --- a/README.md +++ b/README.md @@ -182,13 +182,11 @@ setting the `LIBGKFS_LOG_SYSCALL_FILTER` environment variable. For instance, setting it to `LIBGKFS_LOG_SYSCALL_FILTER=epoll_wait,epoll_create` will filter out any log entries from the `epoll_wait()` and `epoll_create()` system calls. -Additionally, setting the `LIBGKFS_LOG_OUTPUT_TRUNC` environment variable with -a value different from `0` will instruct the logging subsystem to truncate -the file used for logging, rather than append to it. +Additionally, setting the `LIBGKFS_LOG_OUTPUT_TRUNC` environment variable with a value different from `0` will instruct +the logging subsystem to truncate the file used for logging, rather than append to it. -For the daemon, the `GKFS_DAEMON_LOG_PATH=` environment variable -can be provided to set the path to the log file, and the log module can be -selected with the `GKFS_LOG_LEVEL={off,critical,err,warn,info,debug,trace}` +For the daemon, the `GKFS_DAEMON_LOG_PATH=` environment variable can be provided to set the path to the +log file, and the log module can be selected with the `GKFS_DAEMON_LOG_LEVEL={off,critical,err,warn,info,debug,trace}` environment variable. 
# Miscellaneous diff --git a/docs/sphinx/users/running.md b/docs/sphinx/users/running.md index 7b8e16790..3346c4daf 100644 --- a/docs/sphinx/users/running.md +++ b/docs/sphinx/users/running.md @@ -178,5 +178,5 @@ the logging subsystem to truncate the file used for logging, rather than append #### Daemon logging For the daemon, the `GKFS_DAEMON_LOG_PATH=` environment variable can be provided to set the path to the -log file, and the log module can be selected with the `GKFS_LOG_LEVEL={off,critical,err,warn,info,debug,trace}` +log file, and the log module can be selected with the `GKFS_DAEMON_LOG_LEVEL={off,critical,err,warn,info,debug,trace}` environment variable whereas `trace` produces the most trace records while `info` is the default value. \ No newline at end of file diff --git a/include/config.hpp b/include/config.hpp index 768fa8cd8..44f0bb60e 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -33,7 +33,8 @@ // environment prefixes (are concatenated in env module at compile time) #define CLIENT_ENV_PREFIX "LIBGKFS_" -#define DAEMON_ENV_PREFIX "GKFS_" +#define DAEMON_ENV_PREFIX "GKFS_DAEMON_" +#define COMMON_ENV_PREFIX "GKFS_" namespace gkfs::config { diff --git a/include/daemon/env.hpp b/include/daemon/env.hpp index 9ba70a03d..754f2e27c 100644 --- a/include/daemon/env.hpp +++ b/include/daemon/env.hpp @@ -35,7 +35,7 @@ #include -#define ADD_PREFIX(str) DAEMON_ENV_PREFIX str +#define ADD_PREFIX(str) COMMON_ENV_PREFIX str /* Environment variables for the GekkoFS daemon */ namespace gkfs::env { diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 97476e2aa..f39a0a21b 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -459,7 +459,7 @@ initialize_loggers() { std::string path = gkfs::config::log::daemon_log_path; // Try to get log path from env variable std::string env_path_key = DAEMON_ENV_PREFIX; - env_path_key += "DAEMON_LOG_PATH"; + env_path_key += "LOG_PATH"; char* env_path = getenv(env_path_key.c_str()); if(env_path != 
nullptr) { path = env_path; -- GitLab From 642cdcb38184aa2bce1c9f87b9e12c4367831dc9 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Tue, 26 Apr 2022 20:05:03 +0200 Subject: [PATCH 2/6] New script: gkfs start and stop daemons locally and for srun (beta) --- scripts/run/gkfs | 243 ++++++++++++++++++++++++++++++++++++++++++ scripts/run/gkfs.conf | 24 +++++ 2 files changed, 267 insertions(+) create mode 100755 scripts/run/gkfs create mode 100644 scripts/run/gkfs.conf diff --git a/scripts/run/gkfs b/scripts/run/gkfs new file mode 100755 index 000000000..585f86f40 --- /dev/null +++ b/scripts/run/gkfs @@ -0,0 +1,243 @@ +#!/bin/bash + +# global variables +export FI_PSM2_DISCONNECT=1 +export PSM2_MULTI_EP=1 +SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" +CONFIGPATH="${SCRIPTDIR}/gkfs.conf" +source "$CONFIGPATH" + +VERBOSE=false +NODE_NUM=1 +MOUNTDIR=${DAEMON_MOUNTDIR} +ROOTDIR=${DAEMON_ROOTDIR} +HOSTSFILE=${LIBGKFS_HOSTS_FILE} +CPUS_PER_TASK=$(grep -c ^processor /proc/cpuinfo) +ARGS=${DAEMON_ARGS} +USE_SRUN=false +RUN_FOREGROUND=false + +wait_for_gkfs_daemons() { + sleep 2 + local server_wait_cnt=0 + local nodes=1 + if [[ -n ${NODE_NUM} ]]; then + nodes=${NODE_NUM} + fi + until [ $(($(wc -l "${HOSTSFILE}" 2> /dev/null | awk '{print $1}') + 0)) -eq "${nodes}" ] + do + #echo "Waiting for all servers to report connection. Try $server_wait_cnt" + sleep 2 + server_wait_cnt=$((server_wait_cnt+1)) + if [ ${server_wait_cnt} -gt 600 ]; then + echo "Server failed to start. Exiting ..." + exit 1 + fi + done +} + +create_pid_file() { + local pid_file=${DAEMON_PID_FILE} + local pid=${1} + if [[ $VERBOSE == true ]]; then + echo "Creating pid file at ${pid_file} with pid ${pid} ..." 
+ fi + # if PID file exists another daemon could run + if [[ -e ${pid_file} ]]; then + local pid_file_tmp=${DAEMON_PID_FILE}.swp + # create empty tmp file + truncate -s 0 "${pid_file_tmp}" + while IFS= read -r line + do + if ps -p "${line}" > /dev/null; then + # process with pid still running + echo "${line}" >> "${pid_file_tmp}" + fi + done < "${pid_file}" + # create pid file with only valid pids + mv "${pid_file_tmp}" "${pid_file}" + fi + echo "${pid}" >> "${pid_file}" +} + +start_daemon() { + local node_list + local srun_cmd + local daemon_execute + # setup + if [[ ${USE_SRUN} == true ]]; then + node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) + if [[ -z ${NODE_NUM} ]]; then + NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) + fi + # Setting up base srun cmd + srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --ntasks-per-node=1 --overcommit --contiguous --cpus-per-task=${CPUS_PER_TASK} --oversubscribe --mem=0 " + fi + + if [[ $VERBOSE == true ]]; then + echo "### mountdir: ${MOUNTDIR}" + echo "### rootdir: ${ROOTDIR}" + echo "### node_num: ${NODE_NUM}" + echo "### args: ${ARGS}" + echo "### cpus_per_task: ${CPUS_PER_TASK}" + fi + if [[ $VERBOSE == true ]]; then + echo "# Cleaning host file ..." + fi + rm "${HOSTSFILE}" 2> /dev/null + # Setting up base daemon cmd + local daemon_cmd="${DAEMON_BIN} -r ${ROOTDIR} -m ${MOUNTDIR} -H ${HOSTSFILE} ${ARGS}" + # Setting up numactl + if [[ ${DAEMON_NUMACTL} == true ]]; then + daemon_cmd="numactl --cpunodebind=${DAEMON_CPUNODEBIND} --membind=${DAEMON_MEMBIND} ${daemon_cmd}" + fi + # final daemon execute command + daemon_execute="${srun_cmd}${daemon_cmd}" + + if [[ ${VERBOSE} == true ]]; then + echo "### Full execute DAEMON command:" + echo "##### $daemon_execute" + fi + # setup environment variables + export GKFS_DAEMON_LOG_PATH=$GKFS_DAEMON_LOG_PATH + export GKFS_DAEMON_LOG_LEVEL=$GKFS_DAEMON_LOG_LEVEL + + echo "Starting daemons ..." 
+ ${daemon_execute} & + local daemon_pid=$! + wait_for_gkfs_daemons + echo "Running ..." + + if [[ ${RUN_FOREGROUND} == true ]]; then + echo "Press 'q' to exit" + while : ; do + read -n 1 k <&1 + if [[ $k = q ]] ; then + echo + echo "Shutting down ..." + if [[ -n ${daemon_pid} ]]; then + kill -s SIGINT ${daemon_pid} & + wait ${daemon_pid} + fi + break + else + echo "Press 'q' to exit" + fi + done + else + create_pid_file ${daemon_pid} + fi +} + +stop_daemons() { + local pid_file=${DAEMON_PID_FILE} + if [[ -e ${pid_file} ]]; then + while IFS= read -r line + do + if ps -p "${line}" > /dev/null; then + if [[ $VERBOSE == true ]]; then + echo "Stopping daemon with pid ${line}" + fi + kill -s SIGINT "${line}" & + # poll pid until it stopped + if [[ $VERBOSE == true ]]; then + echo "Waiting for daemons to exit ..." + fi + timeout 1 tail --pid=${line} -f /dev/null + fi + done < "${pid_file}" + rm "${pid_file}" + else + echo "No pid file found -> no daemon running. Exiting ..." + fi +} + +usage_short() { + echo " +usage: gkfs.sh [-h] [-r/--rootdir ] [-m/--mountdir ] [-n/--numnodes ] [-f/--foreground ] + [-a/--args ] [--srun ] [-c/--cpuspertask <64>] [-v/--verbose ] + {start,stop} + " +} + +help_msg() { + + usage_short +} +# parse input +POSITIONAL=() +while [[ $# -gt 0 ]]; do + key="$1" + + case ${key} in + -r | --rootdir) + ROOTDIR=$2 + shift # past argument + shift # past value + ;; + -m | --mountdir) + MOUNTDIR=$2 + shift # past argument + shift # past value + ;; + -n | --numnodes) + NODE_NUM=$2 + shift # past argument + shift # past value + ;; + -a | --args) + ARGS=$2 + shift # past argument + shift # past value + ;; + --srun) + USE_SRUN=true + shift # past argument + ;; + -f | --foreground) + RUN_FOREGROUND=true + shift # past argument + ;; + -c | --cpuspertask) + CPUS_PER_TASK=$2 + shift # past argument + shift # past value + ;; + -h | --help) + help_msg + exit + ;; + -v | --verbose) + VERBOSE=true + shift # past argument + ;; + *) # unknown option + 
POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; + esac +done +set -- "${POSITIONAL[@]}" # restore positional parameters + +# positional arguments +if [[ -z ${1+x} ]]; then + echo "ERROR: Positional arguments missing." + usage_short + exit 1 +fi +command="${1}" + +if [[ ${command} != *"start"* ]] && [[ ${command} != *"stop"* ]]; then + echo "ERROR: command ${command} not supported" + usage_short + exit 1 +fi + +if [[ ${command} == "start" ]]; then + start_daemon +elif [[ ${command} == "stop" ]]; then + stop_daemons +fi +if [[ $VERBOSE == true ]]; then + echo "Nothing left to do. Exiting :)" +fi \ No newline at end of file diff --git a/scripts/run/gkfs.conf b/scripts/run/gkfs.conf new file mode 100644 index 000000000..3606f1a1d --- /dev/null +++ b/scripts/run/gkfs.conf @@ -0,0 +1,24 @@ +#!/bin/bash + +# binaries (default for project_dir/build +PRELOAD_LIB=../../build/src/client/libgkfs_intercept.so +DAEMON_BIN=../../build/src/daemon/gkfs_daemon +PROXY_BIN=../../build/src/proxy/gkfs_proxy + +# client configuration +LIBGKFS_HOSTS_FILE=../../build/gkfs_hostfile + +# daemon configuration +DAEMON_ROOTDIR=/dev/shm/gkfs_rootdir +DAEMON_MOUNTDIR=/dev/shm/gkfs_mountdir +DAEMON_NUMACTL=false +DAEMON_CPUNODEBIND="1" +DAEMON_MEMBIND="1" +DAEMON_PID_FILE=/dev/shm/gkfs_daemon.pid +DAEMON_ARGS="" + +# logging +GKFS_DAEMON_LOG_LEVEL=info +GKFS_DAEMON_LOG_PATH=/dev/shm/vef_gkfs_daemon.log +LIBGKFS_LOG=errors,warnings +LIBGKFS_LOG_OUTPUT=/dev/shm/vef_gkfs_client.log -- GitLab From d2d2671b2496abd5b00ac0e5b46059603e147330 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Tue, 26 Apr 2022 19:55:26 +0200 Subject: [PATCH 3/6] Adding documentation for gkfs script --- README.md | 48 +++++++++-- docs/sphinx/users/running.md | 36 ++++++++ scripts/run/gkfs | 155 ++++++++++++++++++++++++++--------- scripts/run/gkfs.conf | 6 +- 4 files changed, 199 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 60bfd2292..dfab50b0a 100644 --- a/README.md +++ 
b/README.md @@ -144,16 +144,52 @@ to be empty. For MPI application, the `LD_PRELOAD` variable can be passed with the `-x` argument for `mpirun/mpiexec`. +## Run GekkoFS daemons on multiple nodes (beta version!) + +The `scripts/run/gkfs` script can be used to simplify starting the GekkoFS daemon on one or multiple nodes. To start +GekkoFS on multiple nodes, a Slurm environment that can execute `srun` is required. Users can further +modify `scripts/run/gkfs.conf` to mold default configurations to their environment. + +The following options are available for `scripts/run/gkfs`: + +```bash +usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args ] [-f/--foreground ] + [--srun ] [-n/--numnodes ] [--cpuspertask <64>] [--numactl ] [-v/--verbose ] + {start,stop} + + + This script simplifies the starting and stopping GekkoFS daemons. If daemons are started on multiple nodes, + a Slurm environment is required. The script looks for the 'gkfs.conf' file in the same directory where + additional permanent configurations can be set. + + positional arguments: + command Command to execute: 'start' and 'stop' + + optional arguments: + -h, --help Shows this help message and exits + -r, --rootdir Providing the rootdir path for GekkoFS daemons. + -m, --mountdir Providing the mountdir path for GekkoFS daemons. + -a, --args + Add various additional daemon arguments, e.g., "-l ib0 -P ofi+psm2". + -f, --foreground Starts the script in the foreground. Daemons are stopped by pressing 'q'. + --srun Use srun to start daemons on multiple nodes. + -n, --numnodes GekkoFS daemons are started on n nodes. + Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. + --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. + --numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations. 
+ -v, --verbose Increase verbosity +``` + ### Logging -The following environment variables can be used to enable logging in the client -library: `LIBGKFS_LOG=` and `LIBGKFS_LOG_OUTPUT=` to -configure the output module and set the path to the log file of the client -library. If not path is specified in `LIBGKFS_LOG_OUTPUT`, the client library -will send log messages to `/tmp/gkfs_client.log`. + +The following environment variables can be used to enable logging in the client library: `LIBGKFS_LOG=` +and `LIBGKFS_LOG_OUTPUT=` to configure the output module and set the path to the log file of the client +library. If no path is specified in `LIBGKFS_LOG_OUTPUT`, the client library will send log messages +to `/tmp/gkfs_client.log`. The following modules are available: - - `none`: don't print any messages +- `none`: don't print any messages - `syscalls`: Trace system calls: print the name of each system call, its arguments, and its return value. All system calls are printed after being executed save for those that may not return, such as `execve()`, diff --git a/docs/sphinx/users/running.md b/docs/sphinx/users/running.md index 3346c4daf..f2257dfa1 100644 --- a/docs/sphinx/users/running.md +++ b/docs/sphinx/users/running.md @@ -136,6 +136,42 @@ to be empty. For MPI applications, the `LD_PRELOAD` and `LIBGKFS_HOSTS_FILE` variables can be passed with the `-x` argument for `mpirun/mpiexec`. +## Run GekkoFS daemons on multiple nodes (beta version!) + +The `scripts/run/gkfs` script can be used to simplify starting the GekkoFS daemon on one or multiple nodes. To start +GekkoFS on multiple nodes, a Slurm environment that can execute `srun` is required. Users can further +modify `scripts/run/gkfs.conf` to mold default configurations to their environment.
+ +The following options are available for `scripts/run/gkfs`: + +```bash +usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args ] [-f/--foreground ] + [--srun ] [-n/--numnodes ] [--cpuspertask <64>] [--numactl ] [-v/--verbose ] + {start,stop} + + + This script simplifies the starting and stopping GekkoFS daemons. If daemons are started on multiple nodes, + a Slurm environment is required. The script looks for the 'gkfs.conf' file in the same directory where + additional permanent configurations can be set. + + positional arguments: + command Command to execute: 'start' and 'stop' + + optional arguments: + -h, --help Shows this help message and exits + -r, --rootdir Providing the rootdir path for GekkoFS daemons. + -m, --mountdir Providing the mountdir path for GekkoFS daemons. + -a, --args + Add various additional daemon arguments, e.g., "-l ib0 -P ofi+psm2". + -f, --foreground Starts the script in the foreground. Daemons are stopped by pressing 'q'. + --srun Use srun to start daemons on multiple nodes. + -n, --numnodes GekkoFS daemons are started on n nodes. + Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. + --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. + --numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations. 
+ -v, --verbose Increase verbosity +``` + ### Logging #### Client logging diff --git a/scripts/run/gkfs b/scripts/run/gkfs index 585f86f40..42f4f889e 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -1,22 +1,15 @@ #!/bin/bash - -# global variables -export FI_PSM2_DISCONNECT=1 -export PSM2_MULTI_EP=1 -SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" -CONFIGPATH="${SCRIPTDIR}/gkfs.conf" -source "$CONFIGPATH" - -VERBOSE=false -NODE_NUM=1 -MOUNTDIR=${DAEMON_MOUNTDIR} -ROOTDIR=${DAEMON_ROOTDIR} -HOSTSFILE=${LIBGKFS_HOSTS_FILE} -CPUS_PER_TASK=$(grep -c ^processor /proc/cpuinfo) -ARGS=${DAEMON_ARGS} -USE_SRUN=false -RUN_FOREGROUND=false - +####################################### +# Poll GekkoFS hostsfile until all daemons are started. +# Exits with 1 if daemons cannot be started. +# Globals: +# HOSTSFILE +# NODE_NUM +# Arguments: +# None +# Outputs: +# Writes error to stdout +####################################### wait_for_gkfs_daemons() { sleep 2 local server_wait_cnt=0 @@ -35,11 +28,21 @@ wait_for_gkfs_daemons() { fi done } - +####################################### +# Creates a pid file for a given pid. If pid file exists, we check if its pids are still valid. +# If valid, an additional line is added. Otherwise, the pid in the file is deleted. +# Globals: +# DAEMON_PID_FILE +# VERBOSE +# Arguments: +# pid to write to pid file +# Outputs: +# Writes status to stdout if VERBOSE is true +####################################### create_pid_file() { local pid_file=${DAEMON_PID_FILE} local pid=${1} - if [[ $VERBOSE == true ]]; then + if [[ ${VERBOSE} == true ]]; then echo "Creating pid file at ${pid_file} with pid ${pid} ..." fi # if PID file exists another daemon could run @@ -59,7 +62,25 @@ create_pid_file() { fi echo "${pid}" >> "${pid_file}" } - +####################################### +# Starts GekkoFS daemons. 
+# Globals: +# SLURM_JOB_ID +# NODE_NUM +# MOUNTDIR +# ROOTDIR +# ARGS +# CPUS_PER_TASK +# VERBOSE +# USE_NUMACTL +# DAEMON_CPUNODEBIND +# DAEMON_MEMBIND +# GKFS_DAEMON_LOG_PATH +# GKFS_DAEMON_LOG_LEVEL +# RUN_FOREGROUND +# Outputs: +# Writes status to stdout +####################################### start_daemon() { local node_list local srun_cmd @@ -74,21 +95,21 @@ start_daemon() { srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --ntasks-per-node=1 --overcommit --contiguous --cpus-per-task=${CPUS_PER_TASK} --oversubscribe --mem=0 " fi - if [[ $VERBOSE == true ]]; then + if [[ ${VERBOSE} == true ]]; then echo "### mountdir: ${MOUNTDIR}" echo "### rootdir: ${ROOTDIR}" echo "### node_num: ${NODE_NUM}" echo "### args: ${ARGS}" echo "### cpus_per_task: ${CPUS_PER_TASK}" fi - if [[ $VERBOSE == true ]]; then + if [[ ${VERBOSE} == true ]]; then echo "# Cleaning host file ..." fi rm "${HOSTSFILE}" 2> /dev/null # Setting up base daemon cmd local daemon_cmd="${DAEMON_BIN} -r ${ROOTDIR} -m ${MOUNTDIR} -H ${HOSTSFILE} ${ARGS}" # Setting up numactl - if [[ ${DAEMON_NUMACTL} == true ]]; then + if [[ ${USE_NUMACTL} == true ]]; then daemon_cmd="numactl --cpunodebind=${DAEMON_CPUNODEBIND} --membind=${DAEMON_MEMBIND} ${daemon_cmd}" fi # final daemon execute command @@ -128,19 +149,26 @@ start_daemon() { create_pid_file ${daemon_pid} fi } - +####################################### +# Stops GekkoFS daemons for the configured pid file +# Globals: +# DAEMON_PID_FILE +# VERBOSE +# Outputs: +# Writes status to stdout +####################################### stop_daemons() { local pid_file=${DAEMON_PID_FILE} if [[ -e ${pid_file} ]]; then while IFS= read -r line do if ps -p "${line}" > /dev/null; then - if [[ $VERBOSE == true ]]; then + if [[ ${VERBOSE} == true ]]; then echo "Stopping daemon with pid ${line}" fi kill -s SIGINT "${line}" & # poll pid until it stopped - if [[ $VERBOSE == true ]]; then + if [[ ${VERBOSE} == true ]]; then echo "Waiting for daemons to exit 
..." fi timeout 1 tail --pid=${line} -f /dev/null @@ -151,19 +179,68 @@ stop_daemons() { echo "No pid file found -> no daemon running. Exiting ..." fi } - +####################################### +# Print short usage information +# Outputs: +# Writes help to stdout +####################################### usage_short() { echo " -usage: gkfs.sh [-h] [-r/--rootdir ] [-m/--mountdir ] [-n/--numnodes ] [-f/--foreground ] - [-a/--args ] [--srun ] [-c/--cpuspertask <64>] [-v/--verbose ] +usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args ] [-f/--foreground ] + [--srun ] [-n/--numnodes ] [--cpuspertask <64>] [--numactl ] [-v/--verbose ] {start,stop} " } - +####################################### +# Print detailed usage information +# Outputs: +# Writes help to stdout +####################################### help_msg() { - usage_short + echo " + This script simplifies the starting and stopping GekkoFS daemons. If daemons are started on multiple nodes, + a Slurm environment is required. The script looks for the 'gkfs.conf' file in the same directory where + additional permanent configurations can be set. + + positional arguments: + command Command to execute: 'start' and 'stop' + + optional arguments: + -h, --help Shows this help message and exits + -r, --rootdir Providing the rootdir path for GekkoFS daemons. + -m, --mountdir Providing the mountdir path for GekkoFS daemons. + -a, --args + Add various additional daemon arguments, e.g., \"-l ib0 -P ofi+psm2\". + -f, --foreground Starts the script in the foreground. Daemons are stopped by pressing 'q'. + --srun Use srun to start daemons on multiple nodes. + -n, --numnodes GekkoFS daemons are started on n nodes. + Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. + --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. + --numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations. 
+ -v, --verbose Increase verbosity + " } + +# global variables +export FI_PSM2_DISCONNECT=1 +export PSM2_MULTI_EP=1 +SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" +CONFIGPATH="${SCRIPTDIR}/gkfs.conf" +source "$CONFIGPATH" + +# more global variables which may be overwritten by user input +VERBOSE=false +NODE_NUM=1 +MOUNTDIR=${DAEMON_MOUNTDIR} +ROOTDIR=${DAEMON_ROOTDIR} +HOSTSFILE=${LIBGKFS_HOSTS_FILE} +CPUS_PER_TASK=$(grep -c ^processor /proc/cpuinfo) +ARGS=${DAEMON_ARGS} +USE_SRUN=${USE_SRUN} +RUN_FOREGROUND=false +USE_NUMACTL=${DAEMON_NUMACTL} + # parse input POSITIONAL=() while [[ $# -gt 0 ]]; do @@ -186,7 +263,7 @@ while [[ $# -gt 0 ]]; do shift # past value ;; -a | --args) - ARGS=$2 + ARGS="${ARGS} $2" shift # past argument shift # past value ;; @@ -198,7 +275,11 @@ while [[ $# -gt 0 ]]; do RUN_FOREGROUND=true shift # past argument ;; - -c | --cpuspertask) + --numactl) + USE_NUMACTL=true + shift # past argument + ;; + --cpuspertask) CPUS_PER_TASK=$2 shift # past argument shift # past value @@ -226,18 +307,18 @@ if [[ -z ${1+x} ]]; then exit 1 fi command="${1}" - +# checking input if [[ ${command} != *"start"* ]] && [[ ${command} != *"stop"* ]]; then echo "ERROR: command ${command} not supported" usage_short exit 1 fi - +# Run script if [[ ${command} == "start" ]]; then start_daemon elif [[ ${command} == "stop" ]]; then stop_daemons fi -if [[ $VERBOSE == true ]]; then +if [[ ${VERBOSE} == true ]]; then echo "Nothing left to do. 
Exiting :)" fi \ No newline at end of file diff --git a/scripts/run/gkfs.conf b/scripts/run/gkfs.conf index 3606f1a1d..67bb7c85b 100644 --- a/scripts/run/gkfs.conf +++ b/scripts/run/gkfs.conf @@ -3,10 +3,9 @@ # binaries (default for project_dir/build PRELOAD_LIB=../../build/src/client/libgkfs_intercept.so DAEMON_BIN=../../build/src/daemon/gkfs_daemon -PROXY_BIN=../../build/src/proxy/gkfs_proxy # client configuration -LIBGKFS_HOSTS_FILE=../../build/gkfs_hostfile +LIBGKFS_HOSTS_FILE=./gkfs_hostfile # daemon configuration DAEMON_ROOTDIR=/dev/shm/gkfs_rootdir @@ -14,8 +13,9 @@ DAEMON_MOUNTDIR=/dev/shm/gkfs_mountdir DAEMON_NUMACTL=false DAEMON_CPUNODEBIND="1" DAEMON_MEMBIND="1" -DAEMON_PID_FILE=/dev/shm/gkfs_daemon.pid +DAEMON_PID_FILE=./gkfs_daemon.pid DAEMON_ARGS="" +USE_SRUN=false # logging GKFS_DAEMON_LOG_LEVEL=info -- GitLab From 600b09ea698575cb909690fe98438374399fc9f3 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Tue, 26 Apr 2022 20:10:48 +0200 Subject: [PATCH 4/6] Adding changelog entry --- CHANGELOG.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a8160da9..0d6c73389 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,16 +7,16 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] - ### New +- Added a new script for starting and stopping daemons on multiple + nodes ([!135](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/135)). - Added statistics gathering on daemons ([!132](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/132)). - Stats output can be enabled with: - `--enable-collection` collects normal statistics. - `--enable-chunkstats` collects extended chunk statistics. 
- Statistics output to file is controlled by `--output-stats ` -- Added Prometheus support for outputting - statistics ([!132](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/132)): +- Added Prometheus support for outputting statistics ([!132](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/132)): - Prometheus dependency optional and enabled at compile time with the CMake argument `GKFS_ENABLE_PROMETHEUS`. - `--enable-prometheus` enables statistics pushing to Prometheus if statistics are enabled. - `--prometheus-gateway` sets an IP and port for the Prometheus connection. @@ -29,6 +29,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - `-c` argument has been moved to `--clean-rootdir-finish` and is now used to clean rootdir/metadir on daemon shutdown ([!110](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/110)). +- Environment variable to change Daemon log levels was changed from `GKFS_LOG_LEVEL` + to `GKFS_DAEMON_LOG_LEVEL` ([!135](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/135)). ### Removed - Removed old initialization code in the GekkoFS -- GitLab From af1417c2de72bc84d984944acfa3f3711a53ff04 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Thu, 28 Apr 2022 20:19:49 +0200 Subject: [PATCH 5/6] Adding -c argument to pass specific config file to gkfs script --- README.md | 1 + docs/sphinx/users/running.md | 1 + scripts/run/gkfs | 35 ++++++++++++++++++++++++++++------- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index dfab50b0a..696aebdc9 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,7 @@ usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. --numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations. + -c, --config Path to configuration file. 
By default, looks for a 'gkfs.conf' in this directory. -v, --verbose Increase verbosity ``` diff --git a/docs/sphinx/users/running.md b/docs/sphinx/users/running.md index f2257dfa1..dff987d91 100644 --- a/docs/sphinx/users/running.md +++ b/docs/sphinx/users/running.md @@ -169,6 +169,7 @@ usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. --numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations. + -c, --config Path to configuration file. By default, looks for a 'gkfs.conf' in this directory. -v, --verbose Increase verbosity ``` diff --git a/scripts/run/gkfs b/scripts/run/gkfs index 42f4f889e..b85afd25f 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -99,7 +99,7 @@ start_daemon() { echo "### mountdir: ${MOUNTDIR}" echo "### rootdir: ${ROOTDIR}" echo "### node_num: ${NODE_NUM}" - echo "### args: ${ARGS}" + echo "### additional daemon args: ${ARGS}" echo "### cpus_per_task: ${CPUS_PER_TASK}" fi if [[ ${VERBOSE} == true ]]; then @@ -208,8 +208,8 @@ help_msg() { optional arguments: -h, --help Shows this help message and exits - -r, --rootdir Providing the rootdir path for GekkoFS daemons. - -m, --mountdir Providing the mountdir path for GekkoFS daemons. + -r, --rootdir The rootdir path for GekkoFS daemons. + -m, --mountdir The mountdir path for GekkoFS daemons. -a, --args Add various additional daemon arguments, e.g., \"-l ib0 -P ofi+psm2\". -f, --foreground Starts the script in the foreground. Daemons are stopped by pressing 'q'. @@ -218,17 +218,34 @@ help_msg() { Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. --numactl Use numactl for the daemon. Modify gkfs.conf for further numactl configurations. + -c, --config Path to configuration file.
By defaults looks for a 'gkfs.conf' in this directory. -v, --verbose Increase verbosity " } - +CONFIGPATH="" +argv=("$@") +# get config path first from argument list +for i in "${argv[@]}"; do + if [[ "${argv[i]}" == "-c" || "${argv[i]}" == "--config" ]]; then + CONFIGPATH=$(readlink -mn "${argv[i+1]}") + break + fi +done # global variables export FI_PSM2_DISCONNECT=1 export PSM2_MULTI_EP=1 SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" -CONFIGPATH="${SCRIPTDIR}/gkfs.conf" -source "$CONFIGPATH" +# get default path if config wasn't given +if [[ -z ${CONFIGPATH} ]]; then + CONFIGPATH="${SCRIPTDIR}/gkfs.conf" +fi +if [[ ! -f ${CONFIGPATH} ]]; then + >&2 echo ">> No config file found at '${CONFIGPATH}'." + exit 1 +fi +# get variables from CONFIGPATH +source "$CONFIGPATH" # more global variables which may be overwritten by user input VERBOSE=false NODE_NUM=1 @@ -240,7 +257,6 @@ ARGS=${DAEMON_ARGS} USE_SRUN=${USE_SRUN} RUN_FOREGROUND=false USE_NUMACTL=${DAEMON_NUMACTL} - # parse input POSITIONAL=() while [[ $# -gt 0 ]]; do @@ -284,6 +300,11 @@ while [[ $# -gt 0 ]]; do shift # past argument shift # past value ;; + -c | --config) + # skip. 
was handled above + shift # past argument + shift # past value + ;; -h | --help) help_msg exit -- GitLab From 33179ba3d62c03c8e27cf37e78f599fbfe2e86af Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Thu, 28 Apr 2022 20:24:22 +0200 Subject: [PATCH 6/6] gkfs script: Allow specific srun arguments via config file --- scripts/run/gkfs | 2 +- scripts/run/gkfs.conf | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/run/gkfs b/scripts/run/gkfs index b85afd25f..f097d81fc 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -92,7 +92,7 @@ start_daemon() { NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) fi # Setting up base srun cmd - srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --ntasks-per-node=1 --overcommit --contiguous --cpus-per-task=${CPUS_PER_TASK} --oversubscribe --mem=0 " + srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " fi if [[ ${VERBOSE} == true ]]; then diff --git a/scripts/run/gkfs.conf b/scripts/run/gkfs.conf index 67bb7c85b..bd74219a9 100644 --- a/scripts/run/gkfs.conf +++ b/scripts/run/gkfs.conf @@ -10,12 +10,17 @@ LIBGKFS_HOSTS_FILE=./gkfs_hostfile # daemon configuration DAEMON_ROOTDIR=/dev/shm/gkfs_rootdir DAEMON_MOUNTDIR=/dev/shm/gkfs_mountdir -DAEMON_NUMACTL=false -DAEMON_CPUNODEBIND="1" -DAEMON_MEMBIND="1" +# path to daemon pid file; created where the script is run DAEMON_PID_FILE=./gkfs_daemon.pid +# additional daemon arguments (see `gkfs_daemon -h`) DAEMON_ARGS="" +# Use Slurm's srun to start the daemons on multiple nodes and set specific srun args USE_SRUN=false +SRUN_ARGS="--ntasks-per-node=1 --overcommit --contiguous --oversubscribe --mem=0" +# use numactl to pin daemon to socket +DAEMON_NUMACTL=false +DAEMON_CPUNODEBIND="1" +DAEMON_MEMBIND="1" # logging GKFS_DAEMON_LOG_LEVEL=info -- GitLab