Commit bd93f8a9 authored by Ramon Nou's avatar Ramon Nou
Browse files

Fix GEKKOFS script and join environments

parent 2c7f8b8a
Loading
Loading
Loading
Loading
Loading
+22 −20
Original line number Diff line number Diff line
#!/usr/bin/env bash
echo "GEKKOFS Script Called"
#!/usr/bin/bash
echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID

# NOTE(review): this listing appears to interleave removed and added lines of
# a diff (two shebangs, csh-style `if (...) then` next to the bash
# `if [ ... ]; then`, `$var = value` next to `var=value`) — confirm which
# lines belong to the current revision before running it.

# example of a script that can be called by the adhoc service
# [2023-11-23 09:37:32.583868] [scord-ctl] [2199567] [info] rpc => id: 0 name: "ADM_deploy_adhoc_storage" from: "ofi+tcp;ofi_rxm://127.0.0.1:52000" body: {uuid: "gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5", type: ADM_ADHOC_STORAGE_GEKKOFS, resources: {nodes: [{hostname: "broadwell-001", type: regular}, {hostname: "broadwell-002", type: regular}, {hostname: "broadwell-003", type: regular}, {hostname: "broadwell-004", type: regular}]}}
# option: start --hosts "broadwell-001,broadwell-002,broadwell-003,broadwell-004" --workdir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5 --datadir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5/data --mountdir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5/mnt

# Positional arguments, following the example invocation above:
#   $1 = command ("start" or "stop")
#   $3 = comma-separated host list (value of --hosts)
#   $5 = working directory        (value of --workdir)
#   $7 = data directory           (value of --datadir)
#   $9 = mount directory          (value of --mountdir)
# The option names themselves ($2, $4, $6, $8) are never inspected, so the
# options must be passed in exactly this order.

# code to count the number of elements in a comma separated list environment variable (called $nodes) and store in $num_nodes

if ($1 == "start") then
if [ "$1" == "start" ]; then
    echo "Starting GEKKOFS"
    $nodes = $3
    # Bring the spack-provided gekkofs and slurm packages into the environment.
    . /beegfs/home/r.nou/spack/share/spack/setup-env.sh
    spack load gekkofs
    spack load slurm@23.02.6
    nodes=$3
    # awk splits on ',' and prints NF-1, i.e. the number of commas in the
    # list, which is one less than the number of hosts (3 for the four-host
    # example above).
    # NOTE(review): confirm that using one node fewer than listed is intended.
    num_nodes=$(echo $nodes | awk -F, '{print NF-1}')
    # If num_nodes is >40, we are on the testing environment
    if ($num_nodes > 40) then
    # If num_nodes is greater than 40, we are on the testing environment
    if [ $num_nodes -gt 40 ]; then
        # Report success without starting any daemon.
        exit 0
    end
    $workdir = $5
    $datadir = $7
    $mountdir = $9
    fi
    workdir=$5
    datadir=$7
    mountdir=$9

    mkdir -p $5
    srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-task=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" & 
else if ($1 == "stop") then
    mkdir -p $workdir
    # Launch one gkfs_daemon task per counted node in the background (&) so
    # the script can return while the daemons keep running.
    /opt/slurm/bin/srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" &
   # presumably gives the daemons time to come up before returning — TODO confirm
   sleep 2
elif [ "$1" == "stop" ]; then
    echo "Stopping GEKKOFS"
    srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-task=1 --export=ALL bash -c "pkill -9 gkfs_daemon"
    . /beegfs/home/r.nou/spack/share/spack/setup-env.sh
    spack load gekkofs
    spack load slurm@23.02.6
    # NOTE(review): num_nodes is only assigned inside the "start" branch; on a
    # separate "stop" invocation it is empty unless inherited from the
    # environment, so -N/-n receive no value here — verify.
    # NOTE(review): this line passes --mem-per-task whereas the
    # /opt/slurm/bin/srun line in the "start" branch passes --mem-per-cpu;
    # check the srun documentation for which option is accepted.
    /opt/slurm/bin/srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-task=1 pkill -9 gkfs_daemon
else
    # Any command other than "start"/"stop" is an error.
    echo "Unknown command"
    exit 1

fi

exit 0
+36 −1
Original line number Diff line number Diff line
@@ -195,6 +195,38 @@ command::as_vector() const {

    return tmp;
}
/**
 * Join two null-terminated environment arrays into a newly allocated one.
 *
 * The entries of `env1` come first, followed by the entries of `env2`. Each
 * entry is duplicated with strdup(), so the result never aliases the inputs.
 * No de-duplication is performed: a variable present in both inputs appears
 * twice in the result, in the order described above.
 *
 * A null `env1` or `env2` is treated as an empty environment instead of being
 * dereferenced (`environ` itself may be null, e.g. after clearenv()).
 *
 * @param env1 First null-terminated array of "NAME=value" strings, or nullptr.
 * @param env2 Second null-terminated array of "NAME=value" strings, or nullptr.
 * @return A null-terminated array allocated with new[] whose entries were
 *         allocated with strdup(). Ownership passes to the caller, who must
 *         free() each entry and delete[] the array if it is to be released.
 */
char**
joinEnvironments(char** env1, const char** env2) {

    // Number of entries before the terminating nullptr; 0 for a null array.
    const auto count_entries = [](const char* const* env) -> size_t {
        size_t n = 0;
        if(env != nullptr) {
            while(env[n] != nullptr) {
                ++n;
            }
        }
        return n;
    };

    const size_t count1 = count_entries(env1);
    const size_t count2 = count_entries(env2);

    // One extra slot for the terminating nullptr expected by execvpe()
    char** combinedEnv = new char*[count1 + count2 + 1];

    // Copy the variables from the first environment
    for(size_t i = 0; i < count1; ++i) {
        combinedEnv[i] = strdup(env1[i]);
    }

    // Copy the variables from the second environment
    for(size_t i = 0; i < count2; ++i) {
        combinedEnv[count1 + i] = strdup(env2[i]);
    }

    // Null-terminate the combined environment
    combinedEnv[count1 + count2] = nullptr;

    return combinedEnv;
}

void
command::exec() const {
@@ -207,8 +239,11 @@ command::exec() const {

    switch(const auto pid = ::fork()) {
        case 0: {

            // Join the environments
            char** combinedEnv = joinEnvironments(environ, envp.get());
            ::execvpe(argv[0], const_cast<char* const*>(argv.get()),
                      const_cast<char* const*>(envp.get()));
                      const_cast<char* const*>(combinedEnv));
            // We cannot use the default logger in the child process because it
            // is not fork-safe, and even though we received a copy of the
            // global logger, it is not valid because the child process does
+6 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#include <net/utilities.hpp>
#include "rpc_server.hpp"

extern char** environ;

using namespace std::literals;

@@ -142,6 +143,9 @@ rpc_server::deploy_adhoc_storage(
        const auto& adhoc_cfg = it->second;

        LOGGER_DEBUG("deploy \"{:e}\" (ID: {})", adhoc_type, adhoc_uuid);
        for (int i = 0; environ[i] != nullptr; ++i) {
        std::cout << environ[i] << std::endl;
    }

        // 1. Create a working directory for the adhoc storage instance
        adhoc_dir = adhoc_cfg.working_directory() / adhoc_uuid;
@@ -172,6 +176,8 @@ rpc_server::deploy_adhoc_storage(

        const auto cmd = adhoc_cfg.startup_command().eval(
                adhoc_uuid, *adhoc_dir, hostnames);
        // Fill environment


        // 4. Execute the startup command
        try {