Loading plugins/adhoc_services.d/gekkofs.sh +16 −8 Original line number Diff line number Diff line #!/usr/bin/bash echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID # If GKFS_DAEMON is not defined then define it here if [ -z "$GKFS_DAEMON" ]; then GKFS_DAEMON=/home/rnou/iodeps/bin/gkfs_daemon fi # If LIBGKFS_HOSTS_FILE is not defined then define it here if [ -z "$LIBGKFS_HOSTS_FILE" ]; then LIBGKFS_HOSTS_FILE=/tmp/gekkofs/gkfs_hosts.txt fi if [ "$1" == "start" ]; then echo "Starting GEKKOFS" nodes=$3 num_nodes=$(echo $nodes | awk -F, '{print NF}') # If num_nodes is greater than 40, we are on the testing environment if [ $num_nodes -gt 40 ]; then # If num_nodes is 50, we are on the testing environment if [ $num_nodes -eq 50 ]; then exit 0 fi workdir=$5 datadir=$7 mountdir=$9 unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "mkdir -p $mountdir; mkdir -p $datadir" srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=4 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" & srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "mkdir -p $mountdir; mkdir -p $datadir" srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "$GKFS_DAEMON --rootdir $datadir --mountdir $mountdir -H $LIBGKFS_HOSTS_FILE" & sleep 4 elif [ "$1" == "stop" ]; then echo "Stopping GEKKOFS" nodes=$3 num_nodes=$(echo $nodes | awk -F, '{print NF}') # If num_nodes is greater than 40, we are on the testing environment if [ $num_nodes -gt 40 ]; then # If num_nodes is 50, we are on the testing environment if [ $num_nodes -eq 50 ]; then exit 0 fi unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "pkill -9 gkfs_daemon" srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "pkill -9 gkfs_daemon" elif [ "$1" == "expand" ]; then echo "Expand command" elif [ "$1" == "shrink" ]; then Loading plugins/slurm/scord_prolog.sh.in +15 −4 Original line number Diff line number Diff line Loading @@ -176,11 +176,21 @@ CARGO_ID=$(echo "cargo_$SLURM_JOB_ID.$SLURM_JOB_UID" | sha256sum | awk '{ print CARGO_CONFIG_FILE=$CARGO_CONFIG_DIRECTORY/$CARGO_ID.cfg CARGO_MASTER_ADDRESS="$SCORDCTL_PROTO://$ADDRESS:$CARGO_PORT" CARGO_INSTANCE_NAME=$(systemd-escape --template cargo@.service "$CARGO_ID") # This will fail always as we do not have the job registered in this moment if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID" else #if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then # echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID" #else CARGO_NUM_NODES=${#hostnames[@]} #fi # If LIBGKFS_HOSTS_FILE is nor defined then do it if [ -z "$LIBGKFS_HOSTS_FILE" ]; then LIBGKFS_HOSTS_FILE=/tmp/gekkofs/gkfs_hosts.txt fi # if number of CARGO_NUM_NODES is below 2, use 2, they will be colocated if [ $CARGO_NUM_NODES -lt 2 ]; then CARGO_NUM_NODES=2 fi cat <<EOT >>"$CARGO_CONFIG_FILE" Loading @@ -188,6 +198,7 @@ CARGO_ID=$CARGO_ID CARGO_HOSTS=$hostnames_csv CARGO_NUM_NODES=$CARGO_NUM_NODES CARGO_ADDRESS=$CARGO_MASTER_ADDRESS LIBGKFS_HOSTS_FILE=$LIBGKFS_HOSTS_FILE EOT CUID=$(id -u $SLURM_JOB_USER) chown "$SLURM_JOB_USER":"$SLURM_JOB_GROUP" "$CARGO_CONFIG_FILE" Loading plugins/slurm/slurmadmcli.c +4 −4 Original line number Diff line number Diff line Loading @@ -953,11 +953,11 @@ scord_unregister_job(spank_t sp, scord_plugin_config_t cfg, } // remove_adhoc_storage ADM_remove_adhoc_storage(scord_server, adhoc_storage); ADM_terminate_adhoc_storage(scord_server, adhoc_storage); // ADM_remove_adhoc_storage(scord_server, adhoc_storage); // remove all the files (this should be done on all the nodes.. TODO) remove_dir_content(adhoc_path); rmdir(adhoc_path); // remove job ADM_remove_job(scord_server, scord_job); Loading Loading
plugins/adhoc_services.d/gekkofs.sh +16 −8 Original line number Diff line number Diff line #!/usr/bin/bash echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID # If GKFS_DAEMON is not defined then define it here if [ -z "$GKFS_DAEMON" ]; then GKFS_DAEMON=/home/rnou/iodeps/bin/gkfs_daemon fi # If LIBGKFS_HOSTS_FILE is not defined then define it here if [ -z "$LIBGKFS_HOSTS_FILE" ]; then LIBGKFS_HOSTS_FILE=/tmp/gekkofs/gkfs_hosts.txt fi if [ "$1" == "start" ]; then echo "Starting GEKKOFS" nodes=$3 num_nodes=$(echo $nodes | awk -F, '{print NF}') # If num_nodes is greater than 40, we are on the testing environment if [ $num_nodes -gt 40 ]; then # If num_nodes is 50, we are on the testing environment if [ $num_nodes -eq 50 ]; then exit 0 fi workdir=$5 datadir=$7 mountdir=$9 unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "mkdir -p $mountdir; mkdir -p $datadir" srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=4 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" & srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "mkdir -p $mountdir; mkdir -p $datadir" srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "$GKFS_DAEMON --rootdir $datadir --mountdir $mountdir -H $LIBGKFS_HOSTS_FILE" & sleep 4 elif [ "$1" == "stop" ]; then echo "Stopping GEKKOFS" nodes=$3 num_nodes=$(echo $nodes | awk -F, '{print NF}') # If num_nodes is greater than 40, we are on the testing environment if [ $num_nodes -gt 40 ]; then # If num_nodes is 50, we are on the testing environment if [ $num_nodes -eq 50 ]; then exit 0 fi unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "pkill -9 gkfs_daemon" srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "pkill -9 gkfs_daemon" elif [ "$1" == "expand" ]; then echo "Expand command" elif [ "$1" == "shrink" ]; then Loading
plugins/slurm/scord_prolog.sh.in +15 −4 Original line number Diff line number Diff line Loading @@ -176,11 +176,21 @@ CARGO_ID=$(echo "cargo_$SLURM_JOB_ID.$SLURM_JOB_UID" | sha256sum | awk '{ print CARGO_CONFIG_FILE=$CARGO_CONFIG_DIRECTORY/$CARGO_ID.cfg CARGO_MASTER_ADDRESS="$SCORDCTL_PROTO://$ADDRESS:$CARGO_PORT" CARGO_INSTANCE_NAME=$(systemd-escape --template cargo@.service "$CARGO_ID") # This will fail always as we do not have the job registered in this moment if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID" else #if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then # echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID" #else CARGO_NUM_NODES=${#hostnames[@]} #fi # If LIBGKFS_HOSTS_FILE is nor defined then do it if [ -z "$LIBGKFS_HOSTS_FILE" ]; then LIBGKFS_HOSTS_FILE=/tmp/gekkofs/gkfs_hosts.txt fi # if number of CARGO_NUM_NODES is below 2, use 2, they will be colocated if [ $CARGO_NUM_NODES -lt 2 ]; then CARGO_NUM_NODES=2 fi cat <<EOT >>"$CARGO_CONFIG_FILE" Loading @@ -188,6 +198,7 @@ CARGO_ID=$CARGO_ID CARGO_HOSTS=$hostnames_csv CARGO_NUM_NODES=$CARGO_NUM_NODES CARGO_ADDRESS=$CARGO_MASTER_ADDRESS LIBGKFS_HOSTS_FILE=$LIBGKFS_HOSTS_FILE EOT CUID=$(id -u $SLURM_JOB_USER) chown "$SLURM_JOB_USER":"$SLURM_JOB_GROUP" "$CARGO_CONFIG_FILE" Loading
plugins/slurm/slurmadmcli.c +4 −4 Original line number Diff line number Diff line Loading @@ -953,11 +953,11 @@ scord_unregister_job(spank_t sp, scord_plugin_config_t cfg, } // remove_adhoc_storage ADM_remove_adhoc_storage(scord_server, adhoc_storage); ADM_terminate_adhoc_storage(scord_server, adhoc_storage); // ADM_remove_adhoc_storage(scord_server, adhoc_storage); // remove all the files (this should be done on all the nodes.. TODO) remove_dir_content(adhoc_path); rmdir(adhoc_path); // remove job ADM_remove_job(scord_server, scord_job); Loading