Loading plugins/adhoc_services.d/gekkofs.sh +22 −20 Original line number Diff line number Diff line #!/usr/bin/env bash echo "GEKKOFS Script Called" #!/usr/bin/bash echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID # example of a script that can be called by the adhoc service # [2023-11-23 09:37:32.583868] [scord-ctl] [2199567] [info] rpc => id: 0 name: "ADM_deploy_adhoc_storage" from: "ofi+tcp;ofi_rxm://127.0.0.1:52000" body: {uuid: "gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5", type: ADM_ADHOC_STORAGE_GEKKOFS, resources: {nodes: [{hostname: "broadwell-001", type: regular}, {hostname: "broadwell-002", type: regular}, {hostname: "broadwell-003", type: regular}, {hostname: "broadwell-004", type: regular}]}} # option: start --hosts "broadwell-001,broadwell-002,broadwell-003,broadwell-004" --workdir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5 --datadir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5/data --mountdir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5/mnt # code to count the number of elements in a comma separated list environment variable (called $nodes) and store in $num_nodes if ($1 == "start") then if [ "$1" == "start" ]; then echo "Starting GEKKOFS" $nodes = $3 . 
/beegfs/home/r.nou/spack/share/spack/setup-env.sh spack load gekkofs spack load slurm@23.02.6 nodes=$3 num_nodes=$(echo $nodes | awk -F, '{print NF-1}') # If num_nodes is >40, we are on the testing environment if ($num_nodes > 40) then # If num_nodes is greater than 40, we are on the testing environment if [ $num_nodes -gt 40 ]; then exit 0 end $workdir = $5 $datadir = $7 $mountdir = $9 fi workdir=$5 datadir=$7 mountdir=$9 mkdir -p $5 srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-task=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" & else if ($1 == "stop") then mkdir -p $workdir /opt/slurm/bin/srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" & sleep 2 elif [ "$1" == "stop" ]; then echo "Stopping GEKKOFS" srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-task=1 --export=ALL bash -c "pkill -9 gkfs_daemon" . 
/beegfs/home/r.nou/spack/share/spack/setup-env.sh spack load gekkofs spack load slurm@23.02.6 /opt/slurm/bin/srun -N $num_nodes -n $num_nodes --oversubscribe --cpus-per-task=1 --mem-per-task=1 pkill -9 gkfs_daemon else echo "Unknown command" exit 1 fi exit 0 src/scord-ctl/command.cpp +36 −1 Original line number Diff line number Diff line Loading @@ -195,6 +195,38 @@ command::as_vector() const { return tmp; } // Function to join two sets of environment variables char** joinEnvironments(char** env1, const char** env2) { // Count the number of variables in each environment int count1 = 0; while(env1[count1] != nullptr) { ++count1; } int count2 = 0; while(env2[count2] != nullptr) { ++count2; } // Allocate memory for the combined environment char** combinedEnv = new char*[count1 + count2 + 1]; // Copy the variables from the first environment for(int i = 0; i < count1; ++i) { combinedEnv[i] = strdup(env1[i]); } // Copy the variables from the second environment for(int i = 0; i < count2; ++i) { combinedEnv[count1 + i] = strdup(env2[i]); } // Null-terminate the combined environment combinedEnv[count1 + count2] = nullptr; return combinedEnv; } void command::exec() const { Loading @@ -207,8 +239,11 @@ command::exec() const { switch(const auto pid = ::fork()) { case 0: { // Join the environments char** combinedEnv = joinEnvironments(environ, envp.get()); ::execvpe(argv[0], const_cast<char* const*>(argv.get()), const_cast<char* const*>(envp.get())); const_cast<char* const*>(combinedEnv)); // We cannot use the default logger in the child process because it // is not fork-safe, and even though we received a copy of the // global logger, it is not valid because the child process does Loading src/scord-ctl/rpc_server.cpp +6 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,7 @@ #include <net/utilities.hpp> #include "rpc_server.hpp" extern char** environ; using namespace std::literals; Loading Loading @@ -142,6 +143,9 @@ rpc_server::deploy_adhoc_storage( const 
auto& adhoc_cfg = it->second; LOGGER_DEBUG("deploy \"{:e}\" (ID: {})", adhoc_type, adhoc_uuid); for (int i = 0; environ[i] != nullptr; ++i) { std::cout << environ[i] << std::endl; } // 1. Create a working directory for the adhoc storage instance adhoc_dir = adhoc_cfg.working_directory() / adhoc_uuid; Loading Loading @@ -172,6 +176,8 @@ rpc_server::deploy_adhoc_storage( const auto cmd = adhoc_cfg.startup_command().eval( adhoc_uuid, *adhoc_dir, hostnames); // Fill environment // 4. Execute the startup command try { Loading Loading
#!/usr/bin/bash
# Start or stop the GekkoFS daemons of an ad-hoc storage instance via Slurm.
#
# The first argument selects the action ("start" or "stop"). For "start" the
# remaining arguments are read BY POSITION (the option names themselves are
# not inspected):
#   $3 = comma separated host list, $5 = workdir, $7 = datadir, $9 = mountdir
#
# Exit status: 0 on "start"/"stop" (the status of srun itself is not checked),
# 1 when the action is unknown.

echo "GEKKOFS Script Called" $HOSTNAME $SLURM_JOBID

# example of a script that can be called by the adhoc service
# [2023-11-23 09:37:32.583868] [scord-ctl] [2199567] [info] rpc => id: 0 name: "ADM_deploy_adhoc_storage" from: "ofi+tcp;ofi_rxm://127.0.0.1:52000" body: {uuid: "gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5", type: ADM_ADHOC_STORAGE_GEKKOFS, resources: {nodes: [{hostname: "broadwell-001", type: regular}, {hostname: "broadwell-002", type: regular}, {hostname: "broadwell-003", type: regular}, {hostname: "broadwell-004", type: regular}]}}
# option: start --hosts "broadwell-001,broadwell-002,broadwell-003,broadwell-004" --workdir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5 --datadir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5/data --mountdir /tmp/gekkofs/gekkofs-JR4ny5xHMhmlwh6KqThfYt71IaoR9cH5/mnt

if [ "$1" == "start" ]; then
    echo "Starting GEKKOFS"

    . /beegfs/home/r.nou/spack/share/spack/setup-env.sh
    spack load gekkofs
    spack load slurm@23.02.6

    nodes=$3
    # NOTE(review): NF-1 is the number of commas, i.e. one LESS than the
    # number of hosts in the list (0 for a single host). Kept as-is because
    # the intent cannot be confirmed from this script alone -- verify.
    num_nodes=$(echo "$nodes" | awk -F, '{print NF-1}')

    # If num_nodes is greater than 40, we are on the testing environment
    if [ "$num_nodes" -gt 40 ]; then
        exit 0
    fi

    workdir=$5
    datadir=$7
    mountdir=$9

    mkdir -p "$workdir"

    # The daemons keep running in the background; give them a moment to come
    # up before returning to the caller.
    /opt/slurm/bin/srun -N "$num_nodes" -n "$num_nodes" --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 --export=ALL bash -c "gkfs_daemon --rootdir $datadir --mountdir $mountdir" &
    sleep 2

elif [ "$1" == "stop" ]; then
    echo "Stopping GEKKOFS"

    . /beegfs/home/r.nou/spack/share/spack/setup-env.sh
    spack load gekkofs
    spack load slurm@23.02.6

    # num_nodes is only assigned by the "start" branch, which runs in a
    # different process. Pass -N/-n only when the variable is actually
    # available (e.g. inherited from the environment) instead of handing srun
    # empty option values; otherwise srun falls back to its own defaults.
    node_args=()
    if [ -n "${num_nodes:-}" ]; then
        node_args=(-N "$num_nodes" -n "$num_nodes")
    fi

    # --mem-per-cpu matches the start branch; srun has no --mem-per-task.
    /opt/slurm/bin/srun "${node_args[@]}" --oversubscribe --cpus-per-task=1 --mem-per-cpu=1 pkill -9 gkfs_daemon

else
    echo "Unknown command"
    exit 1
fi

exit 0
// Join two environment blocks into a single, newly allocated one.
//
// Both arguments are nullptr-terminated arrays of "NAME=value" strings (the
// layout of `environ`, as expected by execvpe()). A null block is treated as
// an empty one.
//
// The result holds, in this order:
//   1. the entries of env1 whose NAME is not defined by env2, and
//   2. every entry of env2.
// Dropping the shadowed env1 entries guarantees that a variable defined in
// both blocks appears once, with the value from env2; keeping both copies
// would leave it to the consumer's lookup strategy which value is seen.
//
// Returns a nullptr-terminated array allocated with new[] whose strings are
// allocated with strdup(). The caller owns both: release every string with
// free() and the array itself with delete[].
char**
joinEnvironments(char** env1, const char** env2) {

    // Number of entries before the terminating nullptr (0 for a null block).
    const auto count_entries = [](const char* const* env) -> size_t {
        size_t n = 0;
        while(env != nullptr && env[n] != nullptr) {
            ++n;
        }
        return n;
    };

    // Length of the NAME part of "NAME=value"; the whole string if there is
    // no '=' in it.
    const auto name_length = [](const char* entry) -> size_t {
        const char* eq = strchr(entry, '=');
        return eq != nullptr ? static_cast<size_t>(eq - entry)
                             : strlen(entry);
    };

    const size_t count1 = count_entries(env1);
    const size_t count2 = count_entries(env2);

    // True if env2 defines a variable with the same NAME as `entry`.
    // Comparing the lengths first keeps "FOO" from matching "FOOBAR".
    const auto overridden = [&](const char* entry) -> bool {
        const size_t len = name_length(entry);
        for(size_t j = 0; j < count2; ++j) {
            if(name_length(env2[j]) == len &&
               strncmp(entry, env2[j], len) == 0) {
                return true;
            }
        }
        return false;
    };

    // Upper bound: nothing shadowed, plus the terminating nullptr.
    char** combinedEnv = new char*[count1 + count2 + 1];
    size_t out = 0;

    for(size_t i = 0; i < count1; ++i) {
        if(!overridden(env1[i])) {
            combinedEnv[out++] = strdup(env1[i]);
        }
    }

    for(size_t i = 0; i < count2; ++i) {
        combinedEnv[out++] = strdup(env2[i]);
    }

    combinedEnv[out] = nullptr;

    return combinedEnv;
}
src/scord-ctl/rpc_server.cpp +6 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,7 @@ #include <net/utilities.hpp> #include "rpc_server.hpp" extern char** environ; using namespace std::literals; Loading Loading @@ -142,6 +143,9 @@ rpc_server::deploy_adhoc_storage( const auto& adhoc_cfg = it->second; LOGGER_DEBUG("deploy \"{:e}\" (ID: {})", adhoc_type, adhoc_uuid); for (int i = 0; environ[i] != nullptr; ++i) { std::cout << environ[i] << std::endl; } // 1. Create a working directory for the adhoc storage instance adhoc_dir = adhoc_cfg.working_directory() / adhoc_uuid; Loading Loading @@ -172,6 +176,8 @@ rpc_server::deploy_adhoc_storage( const auto cmd = adhoc_cfg.startup_command().eval( adhoc_uuid, *adhoc_dir, hostnames); // Fill environment // 4. Execute the startup command try { Loading