Commit 9ed93496 authored by Ramon Nou's avatar Ramon Nou
Browse files

Solving root installation defines

parent 6152a5f2
Loading
Loading
Loading
Loading
Loading
+16 −8
Original line number Diff line number Diff line
#!/usr/bin/bash
echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID

# Fall back to built-in defaults when the caller's environment leaves a
# setting unset or empty; values supplied by the caller are kept untouched.
#   GKFS_DAEMON        - path of the gkfs_daemon executable to launch
#   LIBGKFS_HOSTS_FILE - path of the GekkoFS hosts file
: "${GKFS_DAEMON:=/home/rnou/iodeps/bin/gkfs_daemon}"
: "${LIBGKFS_HOSTS_FILE:=/tmp/gekkofs/gkfs_hosts.txt}"

if [ "$1" == "start" ]; then
    echo "Starting GEKKOFS"

    nodes=$3
    num_nodes=$(echo $nodes | awk -F, '{print NF}')
    # If num_nodes is greater than 40, we are on the testing environment
    if [ $num_nodes -gt 40 ]; then
    # If num_nodes is 50, we are on the testing environment
    if [ $num_nodes -eq 50 ]; then
        exit 0
    fi
    workdir=$5
    datadir=$7
    mountdir=$9
    unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE
    srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "mkdir -p $mountdir; mkdir -p $datadir" 
    srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=4 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" &
    
    srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "mkdir -p $mountdir; mkdir -p $datadir" 
    srun -N $num_nodes -n $num_nodes --oversubscribe --overlap --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "$GKFS_DAEMON --rootdir $datadir --mountdir $mountdir -H $LIBGKFS_HOSTS_FILE" &
    sleep 4
elif [ "$1" == "stop" ]; then
    echo "Stopping GEKKOFS"
    
    nodes=$3
    num_nodes=$(echo $nodes | awk -F, '{print NF}')
    # If num_nodes is greater than 40, we are on the testing environment
    if [ $num_nodes -gt 40 ]; then
    # If num_nodes is 50, we are on the testing environment
    if [ $num_nodes -eq 50 ]; then
        exit 0
    fi
    unset SLURM_CPU_BIND SLURM_CPU_BIND_LIST SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_VERBOSE
    srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "pkill -9 gkfs_daemon"
    srun -N $num_nodes -n $num_nodes --overlap --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL /usr/bin/bash -c "pkill -9 gkfs_daemon"
elif [ "$1" == "expand" ]; then
    echo "Expand command"
elif [ "$1" == "shrink" ]; then
+15 −4
Original line number Diff line number Diff line
@@ -176,11 +176,21 @@ CARGO_ID=$(echo "cargo_$SLURM_JOB_ID.$SLURM_JOB_UID" | sha256sum | awk '{ print
CARGO_CONFIG_FILE=$CARGO_CONFIG_DIRECTORY/$CARGO_ID.cfg
CARGO_MASTER_ADDRESS="$SCORDCTL_PROTO://$ADDRESS:$CARGO_PORT"
CARGO_INSTANCE_NAME=$(systemd-escape --template cargo@.service "$CARGO_ID")
# This will always fail, as the job is not yet registered at this point

if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then
  echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID"
else
#if ! CARGO_NUM_NODES=$(@SCORD_QUERY_PROGRAM@ -s @SCORD_SERVICE_ADDRESS@ "$SLURM_JOB_ID" | grep io_procs | awk '{ print $2 }'); then
#  echo "Failed to determine the number of I/O processes for job $SLURM_JOB_ID"
#else
CARGO_NUM_NODES=${#hostnames[@]}
#fi
# If LIBGKFS_HOSTS_FILE is not defined, set it to the default location
if [ -z "$LIBGKFS_HOSTS_FILE" ]; then
    LIBGKFS_HOSTS_FILE=/tmp/gekkofs/gkfs_hosts.txt
fi

# If CARGO_NUM_NODES is below 2, use 2; they will be colocated
if [ $CARGO_NUM_NODES -lt 2 ]; then
    CARGO_NUM_NODES=2
fi

cat <<EOT >>"$CARGO_CONFIG_FILE"
@@ -188,6 +198,7 @@ CARGO_ID=$CARGO_ID
CARGO_HOSTS=$hostnames_csv
CARGO_NUM_NODES=$CARGO_NUM_NODES
CARGO_ADDRESS=$CARGO_MASTER_ADDRESS
LIBGKFS_HOSTS_FILE=$LIBGKFS_HOSTS_FILE
EOT
CUID=$(id -u $SLURM_JOB_USER)
chown "$SLURM_JOB_USER":"$SLURM_JOB_GROUP" "$CARGO_CONFIG_FILE"
+4 −4
Original line number Diff line number Diff line
@@ -953,11 +953,11 @@ scord_unregister_job(spank_t sp, scord_plugin_config_t cfg,
    }

    // remove_adhoc_storage

    ADM_remove_adhoc_storage(scord_server, adhoc_storage);
    ADM_terminate_adhoc_storage(scord_server, adhoc_storage);
    // ADM_remove_adhoc_storage(scord_server, adhoc_storage);
    //  remove all the files (this should be done on all the nodes.. TODO)
    remove_dir_content(adhoc_path);

    rmdir(adhoc_path);
    // remove job
    ADM_remove_job(scord_server, scord_job);