Commit 29f2cbf7 authored by Alberto Miranda's avatar Alberto Miranda ♨️
Browse files

plugins/slurm: Add scripts for job control

- `plugins/slurm/scord_prolog.sh`: Prolog script. Starts `scord-ctl`
   server and sets up environment.
- `plugins/slurm/scord_epilog.sh`: Epilog script. Stops `scord-ctl`
   server and cleans up environment.
- `plugins/slurm/scord_common.sh`: Common code to prolog and epilog.
parent bd34b449
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -57,3 +57,18 @@ install(
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
)

# Instantiate the SLURM job-control scripts from their `.sh.in` templates.
# @ONLY restricts substitution to @VAR@ references, so the shell's own ${...}
# expansions inside the templates are left untouched.
foreach(script_name IN ITEMS scord_common scord_prolog scord_epilog)
  configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/${script_name}.sh.in"
    "${CMAKE_CURRENT_BINARY_DIR}/${script_name}.sh"
    @ONLY
  )
endforeach()

# Install the generated scripts under <datadir>/<project>/slurm, readable and
# executable by everyone and writable by the owner only (rwxr-xr-x).
install(
  FILES
    ${CMAKE_CURRENT_BINARY_DIR}/scord_common.sh
    ${CMAKE_CURRENT_BINARY_DIR}/scord_prolog.sh
    ${CMAKE_CURRENT_BINARY_DIR}/scord_epilog.sh
  DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/slurm
  PERMISSIONS
    OWNER_READ OWNER_WRITE OWNER_EXECUTE
    GROUP_READ GROUP_EXECUTE
    WORLD_READ WORLD_EXECUTE
)
+155 −0
Original line number Diff line number Diff line
################################################################################
# Copyright 2022-2023, Inria, France.                                          #
# Copyright 2023, Barcelona Supercomputing Center (BSC), Spain.                #
# All rights reserved.                                                         #
#                                                                              #
# This software was partially supported by the EuroHPC-funded project ADMIRE   #
#   (Project ID: 956748, https://www.admire-eurohpc.eu).                       #
#                                                                              #
# This file is part of scord.                                                  #
#                                                                              #
# scord is free software: you can redistribute it and/or modify                #
# it under the terms of the GNU General Public License as published by         #
# the Free Software Foundation, either version 3 of the License, or            #
# (at your option) any later version.                                          #
#                                                                              #
# scord is distributed in the hope that it will be useful,                     #
# but WITHOUT ANY WARRANTY; without even the implied warranty of               #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                #
# GNU General Public License for more details.                                 #
#                                                                              #
# You should have received a copy of the GNU General Public License            #
# along with scord.  If not, see <https://www.gnu.org/licenses/>.              #
#                                                                              #
# SPDX-License-Identifier: GPL-3.0-or-later                                    #
################################################################################

# Run a command as a user
#   Usage: run_as <username> <command> [<args>...]
#   Example:
#     run_as root mkdir /root/test
#     run_as root "mkdir /root/test && chown root:root /root/test"
#
#   The command and its arguments are joined with single spaces and handed to
#   a shell via `-c`, so arguments that contain whitespace or shell
#   metacharacters must be quoted/escaped by the caller.
#
#   Returns:
#     - 1 if no command was given, if <username> does not exist, or if `su`
#       failed to run the command
#     - the exit status of the command when the effective user already is
#       <username> (no `su` is needed in that case)
#     - 0 if the command succeeded through `su`
#   Diagnostics about failures are written to stderr.
function run_as {
  local username=$1
  shift

  if [[ -z "$*" ]]; then
    echo >&2 "No command specified"
    return 1
  fi

  if ! id -u "$username" &>/dev/null; then
    echo >&2 "User '$username' does not exist"
    return 1
  fi

  # Compare against the effective user rather than $USER: the variable may be
  # unset or stale (e.g. after a plain `su`), which would either force a
  # pointless `su` to ourselves or run the command with the wrong identity.
  if [[ "$(id -un)" == "$username" ]]; then
    echo "Already running as user '$username'"
    # propagate the command's own exit status so that callers can detect
    # failures; fall back to /bin/sh if $SHELL is not set
    "${SHELL:-/bin/sh}" -c "$*"
    return $?
  fi

  if su "$username" -c "$*"; then
    return 0
  fi

  echo >&2 "Failed to run '$*' as user '$username'"
  return 1
}

# Get the compact hostlist expression for a comma-separated list of hostnames
#   Usage: get_hostlist <out_var> <hostnames>
#   Example:
#     declare hl
#     get_hostlist hl tux1,tux3,tux4,snoo1,snoo2
#     echo "$hl"  # tux[1,3-4],snoo[1-2]
#
#   <out_var> is the NAME of the caller's variable that receives the output of
#   `scontrol show hostlist`.
#   Returns 1 (with a message on stderr) if an argument is missing; otherwise
#   the exit status of `scontrol`.
function get_hostlist {

  if [[ -z "$1" ]]; then
    echo >&2 "No output variable specified"
    return 1
  fi

  if [[ -z "$2" ]]; then
    echo >&2 "No hostnames specified"
    return 1
  fi

  # The nameref gets a function-specific name: a short one such as `rv` would
  # become a circular reference if the caller's variable had the same name.
  local -n __get_hostlist_out=$1
  # the assignment is the last command, so scontrol's exit status is what the
  # function returns
  __get_hostlist_out=$(scontrol show hostlist "$2")
}

# Get the list of hostnames associated with a hostlist
#   Usage: get_nodelist <out_array> <nodelist>
#   Example:
#     declare -a hn
#     get_nodelist hn tux[1,3-4],snoo[1-2]
#     echo "${hn[0]}"  # tux1
#     echo "${hn[@]}"  # tux1 tux3 tux4 snoo1 snoo2
#
#   <out_array> is the NAME of the caller's array; it receives one element per
#   line printed by `scontrol show hostnames` and is left empty when nothing
#   was printed.
#   Returns 1 (with a message on stderr) if an argument is missing; otherwise
#   the exit status of `scontrol`.
function get_nodelist {

  if [[ -z "$1" ]]; then
    echo >&2 "No output array specified"
    return 1
  fi

  if [[ -z "$2" ]]; then
    echo >&2 "No hostlist specified"
    return 1
  fi

  # function-specific names so that the nameref cannot collide with (and
  # circularly reference) a caller variable of the same name
  local -n __get_nodelist_out=$1
  local __get_nodelist_names
  local __get_nodelist_rc=0

  # Capture the output with a command substitution instead of a process
  # substitution: the latter would discard scontrol's exit status.
  __get_nodelist_names=$(scontrol show hostnames "$2") || __get_nodelist_rc=$?

  if [[ -n "$__get_nodelist_names" ]]; then
    # shellcheck disable=SC2034
    readarray -t __get_nodelist_out <<<"$__get_nodelist_names"
  else
    # a here-string of "" would yield one empty element, so clear explicitly
    # shellcheck disable=SC2034
    __get_nodelist_out=()
  fi

  return "$__get_nodelist_rc"
}

# Get the list of address records associated with a hostname
#   Usage: get_addrs <out_array> <hostname> [v4|v6]
#   Example:
#     declare -a addrs
#     get_addrs addrs tux1
#     echo "${addrs[0]}"  # 192.18.0.7 ...
#
#   <out_array> is the NAME of the caller's array. It receives one element per
#   line printed by `getent`, queried against the `ahosts` database (or
#   `ahostsv4`/`ahostsv6` when an address family is given); each element is the
#   complete output line, which starts with the address. The array is left
#   empty when `getent` prints nothing.
#   Returns 1 (with a message on stderr) if an argument is missing or the
#   address family is invalid; otherwise the exit status of `getent`, so that a
#   failed lookup can be detected by the caller.
function get_addrs {

  if [[ -z "$1" ]]; then
    echo >&2 "No output array specified"
    return 1
  fi

  if [[ -z "$2" ]]; then
    echo >&2 "No hostname specified"
    return 1
  fi

  # function-specific names so that neither the nameref nor the locals can
  # shadow a caller array that happens to share their name
  local __get_addrs_db="ahosts"

  case "$3" in
  "") ;; # no address family requested: query both
  v4 | V4)
    __get_addrs_db+="v4"
    ;;
  v6 | V6)
    __get_addrs_db+="v6"
    ;;
  *)
    echo >&2 "Invalid address family requested: $3"
    return 1
    ;;
  esac

  local -n __get_addrs_out=$1
  local __get_addrs_records
  local __get_addrs_rc=0

  # Capture the output with a command substitution instead of a process
  # substitution: the latter would discard getent's exit status and make every
  # lookup look successful.
  __get_addrs_records=$(getent "$__get_addrs_db" "$2") || __get_addrs_rc=$?

  if [[ -n "$__get_addrs_records" ]]; then
    # shellcheck disable=SC2034
    readarray -t __get_addrs_out <<<"$__get_addrs_records"
  else
    # a here-string of "" would yield one empty element, so clear explicitly
    # shellcheck disable=SC2034
    __get_addrs_out=()
  fi

  return "$__get_addrs_rc"
}

# Settings describing how to launch scord-ctl, exported so that they are
# visible to the scripts that source this file and to their child processes.
# The @...@ placeholders are expected to be substituted when the script is
# generated from its template.
SCORDCTL_PROGRAM="@SCORD_CTL_BIN@"
SCORDCTL_PROTO="@SCORD_TRANSPORT_PROTOCOL@"
SCORDCTL_PORT="@SCORD_CTL_BIND_PORT@"
export SCORDCTL_PROGRAM SCORDCTL_PROTO SCORDCTL_PORT
+83 −0
Original line number Diff line number Diff line
#!/bin/bash
################################################################################
# Copyright 2022-2023, Inria, France.                                          #
# Copyright 2023, Barcelona Supercomputing Center (BSC), Spain.                #
# All rights reserved.                                                         #
#                                                                              #
# This software was partially supported by the EuroHPC-funded project ADMIRE   #
#   (Project ID: 956748, https://www.admire-eurohpc.eu).                       #
#                                                                              #
# This file is part of scord.                                                  #
#                                                                              #
# scord is free software: you can redistribute it and/or modify                #
# it under the terms of the GNU General Public License as published by         #
# the Free Software Foundation, either version 3 of the License, or            #
# (at your option) any later version.                                          #
#                                                                              #
# scord is distributed in the hope that it will be useful,                     #
# but WITHOUT ANY WARRANTY; without even the implied warranty of               #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                #
# GNU General Public License for more details.                                 #
#                                                                              #
# You should have received a copy of the GNU General Public License            #
# along with scord.  If not, see <https://www.gnu.org/licenses/>.              #
#                                                                              #
# SPDX-License-Identifier: GPL-3.0-or-later                                    #
################################################################################

# load the helpers shared with the prolog (run_as, get_nodelist, ...); the
# path is quoted so that an installation prefix containing spaces still works
source "@CMAKE_INSTALL_FULL_DATADIR@/@PROJECT_NAME@/slurm/scord_common.sh"

# determine the temporary directory to use for the epilog logs
EPILOG_TMPDIR="${TMPDIR:-/tmp}"

# redirect stdout and stderr to a log file in $EPILOG_TMPDIR
exec &>"$EPILOG_TMPDIR/scord_epilog.$SLURM_JOB_ID.log"
# print out all commands
set -x
# print out the value of all variables
env

# if no ADMIRE CLI options were specified, we don't need to do anything
# (compgen succeeds only if at least one variable name has this prefix)
if ! compgen -v SPANK__SLURM_SPANK_OPTION_admire_cli_; then
  echo "SCORD SLURM plugin not requested. Exiting."
  exit 0
fi

# find out some information about the job and where we are running
HOSTNAME=$(hostname -s)
declare -a hostnames
get_nodelist hostnames "$SLURM_NODELIST"

# make sure that the per-job working directory exists (it holds the logs and
# the PID file of the adhoc controller)
WORKDIR="$EPILOG_TMPDIR/$SLURM_JOB_USER/$SLURM_JOBID"
if [ ! -d "$WORKDIR" ]; then
  run_as "$SLURM_JOB_USER" mkdir -p "$WORKDIR"
fi

# now that we have a specific working directory, move the previous log file
# into $WORKDIR so that we have all messages in one place (since the file is
# still open by the shell, the move operation will not affect where the
# messages are written). If the directory could not be created, the log simply
# stays where it is.
if [[ -d "$WORKDIR" ]]; then
  mv "$EPILOG_TMPDIR/scord_epilog.$SLURM_JOB_ID.log" "$WORKDIR/scord_epilog.log"
fi

if ((${#hostnames[@]} == 0)); then
  echo "No hostnames found for job $SLURM_JOB_ID. Weird."
  exit 0
fi

# only run on the first node of the allocation (scord-ctl will always be
# started on the first node of the allocation)
if [[ "$HOSTNAME" != "${hostnames[0]}" ]]; then
  exit 0
fi

echo "Shutting down adhoc controller for job $SLURM_JOB_ID (user: $SLURM_JOB_USER)"
PIDFILE="$WORKDIR/scord-ctl.pid"
if [[ -f "$PIDFILE" ]]; then
  PID=$(<"$PIDFILE")

  # The PID file lives in a directory owned by the job's user, so its contents
  # must not be trusted: accept only a positive integer (an empty value makes
  # `kill` fail and 0 would signal our own process group), and send the signal
  # with the job user's identity so that only that user's processes can be hit.
  if [[ "$PID" =~ ^[1-9][0-9]*$ ]]; then
    if ! run_as "$SLURM_JOB_USER" kill -TERM "$PID"; then
      echo "Could not signal adhoc controller (PID: $PID). It may have already exited."
    fi
  else
    echo "Ignoring invalid PID '$PID' found in $PIDFILE"
  fi
fi

# a controller that is already gone is not an error: do not let the status of
# the last command become the exit status of the epilog
exit 0
+123 −0
Original line number Diff line number Diff line
#!/bin/bash
################################################################################
# Copyright 2022-2023, Inria, France.                                          #
# Copyright 2023, Barcelona Supercomputing Center (BSC), Spain.                #
# All rights reserved.                                                         #
#                                                                              #
# This software was partially supported by the EuroHPC-funded project ADMIRE   #
#   (Project ID: 956748, https://www.admire-eurohpc.eu).                       #
#                                                                              #
# This file is part of scord.                                                  #
#                                                                              #
# scord is free software: you can redistribute it and/or modify                #
# it under the terms of the GNU General Public License as published by         #
# the Free Software Foundation, either version 3 of the License, or            #
# (at your option) any later version.                                          #
#                                                                              #
# scord is distributed in the hope that it will be useful,                     #
# but WITHOUT ANY WARRANTY; without even the implied warranty of               #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                #
# GNU General Public License for more details.                                 #
#                                                                              #
# You should have received a copy of the GNU General Public License            #
# along with scord.  If not, see <https://www.gnu.org/licenses/>.              #
#                                                                              #
# SPDX-License-Identifier: GPL-3.0-or-later                                    #
################################################################################


# This is a prolog script for SLURM that starts the SCORD adhoc controller
# for the job. It is meant to be used with the SCORD SLURM plugin.
# The script is executed as the user that submitted the job. The script
# creates a temporary directory for the job and starts the adhoc controller
# in the background. The PID of the adhoc controller is stored in a file
# in the temporary directory.

# load the helpers shared with the epilog (run_as, get_nodelist, get_addrs,
# SCORDCTL_* settings); the path is quoted so that an installation prefix
# containing spaces still works
source "@CMAKE_INSTALL_FULL_DATADIR@/@PROJECT_NAME@/slurm/scord_common.sh"

# determine the temporary directory to use for the prolog logs
PROLOG_TMPDIR="${TMPDIR:-/tmp}"

# redirect stdout and stderr to a log file in $PROLOG_TMPDIR
exec &>"$PROLOG_TMPDIR/scord_prolog.$SLURM_JOB_ID.log"
# print out all commands
set -x
# print out the value of all variables
env

# if no ADMIRE CLI options were specified, we don't need to do anything
# (compgen succeeds only if at least one variable name has this prefix)
if ! compgen -v SPANK__SLURM_SPANK_OPTION_admire_cli_; then
  echo "SCORD SLURM plugin not requested. Exiting."
  exit 0
fi

# find out some information about the job and where we are running
HOSTNAME=$(hostname -s)
declare -a hostnames
get_nodelist hostnames "$SLURM_NODELIST"

# create the per-job working directory (it will hold the logs and the PID file
# of the adhoc controller)
WORKDIR="$PROLOG_TMPDIR/$SLURM_JOB_USER/$SLURM_JOBID"
if [ ! -d "$WORKDIR" ]; then
  run_as "$SLURM_JOB_USER" mkdir -p "$WORKDIR"
fi

if ((${#hostnames[@]} == 0)); then
  echo "No hostnames found for job $SLURM_JOB_ID. Weird."
  exit 0
fi

# only run on the first node of the allocation (scord-ctl will always be
# started on the first node of the allocation)
if [[ "$HOSTNAME" != "${hostnames[0]}" ]]; then
  exit 0
fi

# the controller writes its log and PID file into $WORKDIR, so there is no
# point in going on without it
if [[ ! -d "$WORKDIR" ]]; then
  echo "Working directory $WORKDIR could not be created."
  exit 1
fi

# find out the IP address of the first node of the allocation
declare -a addrs
if ! get_addrs addrs "$HOSTNAME" v4; then
  echo "Error searching IP addresses for $HOSTNAME."
  exit 1
fi

if ((${#addrs[@]} == 0)); then
  echo "No addresses found."
  exit 1
fi

# each record starts with the address: keep the first whitespace-separated
# field of the first record
read -r ADDRESS _ <<<"${addrs[0]}"

if [[ -z "$ADDRESS" ]]; then
  echo "No addresses found."
  exit 1
fi

# now that we have a specific working directory, move the previous log file
# into $WORKDIR so that we have all messages in one place (since the file is
# still open by the shell, the move operation will not affect where the
# messages are written)
mv "$PROLOG_TMPDIR/scord_prolog.$SLURM_JOB_ID.log" "$WORKDIR/scord_prolog.log"

PIDFILE="$WORKDIR/scord-ctl.pid"

# discard any PID file left behind by an earlier run in the same directory, so
# that the wait below can only observe a file written by the controller that
# is about to be started
run_as "$SLURM_JOB_USER" rm -f "$PIDFILE"

# start the adhoc controller in the background and store its PID in a file
# (the trailing '&' becomes part of the command line executed by run_as)
echo "Starting adhoc controller for job $SLURM_JOB_ID (user: $SLURM_JOB_USER)"
# NOTE: since the controller is put in the background, this status only tells
# whether the launching shell could be run, not whether the controller itself
# came up; that is verified below through the PID file
if ! run_as "$SLURM_JOB_USER" \
  "$SCORDCTL_PROGRAM" \
  --listen "$SCORDCTL_PROTO://$ADDRESS:$SCORDCTL_PORT" \
  --output "$WORKDIR/scord-ctl.log" \
  --pidfile "$PIDFILE" \
  '&'; then
  echo "Failed to start adhoc controller"
  exit 1
fi

# wait (for up to ~5 seconds) until the PID file has been written instead of
# assuming that a fixed delay is always enough
PID=""
for ((attempt = 0; attempt < 50; attempt++)); do
  if [[ -s "$PIDFILE" ]]; then
    PID=$(<"$PIDFILE")
    break
  fi
  sleep 0.1
done

if [[ ! "$PID" =~ ^[1-9][0-9]*$ ]]; then
  echo "Adhoc controller did not write a valid PID to $PIDFILE"
  exit 1
fi

# signal 0 delivers nothing: it only checks that the process exists
if ! kill -0 "$PID"; then
  echo "Adhoc controller (PID: $PID) is not running"
  exit 1
fi

echo "Adhoc controller started successfully (PID: $PID)"

exit 0