Skip to content
Snippets Groups Projects
Commit 9ed93496 authored by Ramon Nou's avatar Ramon Nou
Browse files

Solving root installation defines

parent 6152a5f2
No related branches found
No related tags found
1 merge request: !126 Resolve "Root issues and stage-out with SPANK"
Pipeline #4451 failed
#!/usr/bin/bash
# Announce the invocation together with the host name and Slurm job id.
echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID
# Fall back to the built-in daemon path when GKFS_DAEMON is unset or empty.
: "${GKFS_DAEMON:=/home/rnou/iodeps/bin/gkfs_daemon}"
# Fall back to the built-in hosts file when LIBGKFS_HOSTS_FILE is unset or empty.
: "${LIBGKFS_HOSTS_FILE:=/tmp/gekkofs/gkfs_hosts.txt}"
if [ "$1" == "start" ]; then
echo "Starting GEKKOFS"
nodes=$3
num_nodes=$(echo $nodes | awk -F, '{print NF}')
# If num_nodes is greater than 40, we are on the testing environment
if [ $num_nodes -gt 40 ]; then
# If num_nodes is 50, we are on the testing environment
if [ $num_nodes -eq 50 ]; then
exit 0
fi
workdir=$5
datadir=$7
mountdir=$9
unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE
srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "mkdir -p $mountdir; mkdir -p $datadir"
srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=4 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" &
srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "mkdir -p $mountdir; mkdir -p $datadir"
srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "$GKFS_DAEMON --rootdir $datadir --mountdir $mountdir -H $LIBGKFS_HOSTS_FILE" &
sleep 4
elif [ "$1" == "stop" ]; then
echo "Stopping GEKKOFS"
nodes=$3
num_nodes=$(echo $nodes | awk -F, '{print NF}')
# If num_nodes is greater than 40, we are on the testing environment
if [ $num_nodes -gt 40 ]; then
# If num_nodes is 50, we are on the testing environment
if [ $num_nodes -eq 50 ]; then
exit 0
fi
unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE
srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "pkill -9 gkfs_daemon"
srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "pkill -9 gkfs_daemon"
elif [ "$1" == "expand" ]; then
echo "Expand command"
elif [ "$1" == "shrink" ]; then
......
......@@ -176,11 +176,21 @@ CARGO_ID=$(echo "cargo_$SLURM_JOB_ID.$SLURM_JOB_UID" | sha256sum | awk '{ print
CARGO_CONFIG_FILE=$CARGO_CONFIG_DIRECTORY/$CARGO_ID.cfg
CARGO_MASTER_ADDRESS="$SCORDCTL_PROTO://$ADDRESS:$CARGO_PORT"
CARGO_INSTANCE_NAME=$(systemd-escape --template cargo@.service "$CARGO_ID")
# This always fails because the job is not yet registered at this point
#if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then
# echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID"
#else
CARGO_NUM_NODES=${#hostnames[@]}
#fi
# If LIBGKFS_HOSTS_FILE is not defined, set it to the default location
if [ -z "$LIBGKFS_HOSTS_FILE" ]; then
LIBGKFS_HOSTS_FILE=/tmp/gekkofs/gkfs_hosts.txt
fi
if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then
echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID"
else
CARGO_NUM_NODES=${#hostnames[@]}
# If CARGO_NUM_NODES is below 2, use 2; they will be colocated
if [ $CARGO_NUM_NODES -lt 2 ]; then
CARGO_NUM_NODES=2
fi
cat <<EOT >>"$CARGO_CONFIG_FILE"
......@@ -188,6 +198,7 @@ CARGO_ID=$CARGO_ID
CARGO_HOSTS=$hostnames_csv
CARGO_NUM_NODES=$CARGO_NUM_NODES
CARGO_ADDRESS=$CARGO_MASTER_ADDRESS
LIBGKFS_HOSTS_FILE=$LIBGKFS_HOSTS_FILE
EOT
CUID=$(id -u $SLURM_JOB_USER)
chown "$SLURM_JOB_USER":"$SLURM_JOB_GROUP" "$CARGO_CONFIG_FILE"
......
......@@ -953,11 +953,11 @@ scord_unregister_job(spank_t sp, scord_plugin_config_t cfg,
}
// remove_adhoc_storage
ADM_remove_adhoc_storage(scord_server, adhoc_storage);
// remove all the files (this should be done on all the nodes.. TODO)
ADM_terminate_adhoc_storage(scord_server, adhoc_storage);
// ADM_remove_adhoc_storage(scord_server, adhoc_storage);
// remove all the files (this should be done on all the nodes.. TODO)
remove_dir_content(adhoc_path);
rmdir(adhoc_path);
// remove job
ADM_remove_job(scord_server, scord_job);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment