Loading CHANGELOG.md +2 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.9.3] - 2024-07 ### New - Added option to scripts/bin/gkfs to start without previous job allocation and walltime parameter ([!206](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/206)). - Added a write size cache to the file system client to reduce potential metadata network bottlenecks during small I/O operations ([!193](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/193)). - The cache is experimental and thus disabled by default. Added the following environment variables. Loading README.md +3 −0 Original line number Diff line number Diff line Loading @@ -166,6 +166,9 @@ Options: --enable-prometheus Enables prometheus output and a corresponding thread. --prometheus-gateway TEXT Defines the prometheus gateway <ip:port> (Default 127.0.0.1:9091). --version Print version and exit. -t,--time TEXT Set a limit on the total run time of the slurm job allocation. Default is 15min. -A,--account TEXT Account for the slurm job (only required for job allocation) -P,--partition TEXT Partition for the slurm job (only required for job allocation) ``` It is possible to run multiple independent GekkoFS instances on the same node. Note, that when these GekkoFS instances Loading scripts/run/gkfs +48 −5 Original line number Diff line number Diff line Loading @@ -92,6 +92,8 @@ write_pid_file() { ####################################### # Starts GekkoFS daemons. 
# Globals: # SLURM_ACCOUNT # SLURM_PARTITION # SLURM_JOB_ID # NODE_NUM # MOUNTDIR Loading Loading @@ -120,8 +122,21 @@ start_daemons() { local srun_proxy_cmd local daemon_execute local proxy_execute # only used when started without slurm job local srun_account_partition # setup if [[ ${USE_SRUN} == true ]]; then if [[ -z ${SLURM_JOB_ID} ]]; then if [[ -z ${SLURM_ACCOUNT} || -z ${SLURM_PARTITION} ]]; then echo -e "${C_AST_RED}ERROR: Slurm Account or Partition not set. Exiting ..." exit 1 fi srun_account_partition="--partition=${SLURM_PARTITION} --account=${SLURM_ACCOUNT}" # TODO remove echo -e "observe: ${srun_account_partition}" else srun_account_partition="" fi # check for daemon first if [[ -n ${DAEMON_NODELIST_} ]]; then if [[ ! -f ${DAEMON_NODELIST_} ]]; then Loading @@ -129,14 +144,14 @@ start_daemons() { exit 1 fi NODE_NUM=$(wc -l < "${DAEMON_NODELIST_}") srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) if [[ -z ${NODE_NUM} ]]; then NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) fi # Setting up base srun cmd srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " fi if [[ ${USE_PROXY} == true ]]; then if [[ -n ${PROXY_NODELIST_} ]]; then Loading @@ -145,9 +160,9 @@ start_daemons() { exit 1 fi NODE_NUM_PROXY=$(wc -l < "${PROXY_NODELIST_}") srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " 
srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " NODE_NUM_PROXY=$NODE_NUM fi fi Loading Loading @@ -189,6 +204,16 @@ start_daemons() { if [[ -n ${DAEMON_AFFINITY_} ]]; then daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" fi # add --time parameter to ARGS if [[ -n ${SRUN_TIME} ]]; then SRUN_DAEMON_ARGS="--time=${SRUN_TIME} ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=${SRUN_TIME} ${SRUN_PROXY_ARGS}" else SRUN_DAEMON_ARGS="--time=00:15:00 ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=00:15:00 ${SRUN_PROXY_ARGS}" fi # final daemon execute COMMAND daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" Loading Loading @@ -515,6 +540,9 @@ help_msg() { -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -v, --verbose Increase verbosity -t, --time Set a limit on the total run time of the slurm job allocation. 
-A, --account Account for the slurm job (only required for job allocation) -P, --partition Partition for the slurm job (only required for job allocation) " } CONFIGPATH="" Loading Loading @@ -644,6 +672,21 @@ while [[ $# -gt 0 ]]; do VERBOSE=true shift # past argument ;; -t | --time) SRUN_TIME=$2 shift # past argument shift # past value ;; -A | --account) SLURM_ACCOUNT=$2 shift # past argument shift # past value ;; -P | --partition) SLURM_PARTITION=$2 shift # past argument shift # past value ;; *) # unknown option POSITIONAL+=("$1") # save it in an array for later shift # past argument Loading scripts/run/gkfs.conf +4 −0 Original line number Diff line number Diff line Loading @@ -44,6 +44,10 @@ SRUN_ARGS="--overlap --ntasks-per-node=1 --overcommit --overlap --oversubscribe SRUN_DAEMON_ARGS="" # Specific srun args for proxy SRUN_PROXY_ARGS="" SRUN_TIME="00:15:00" # srun without job allocation SLURM_ACCOUNT="" SLURM_PARTITION="" # logging GKFS_DAEMON_LOG_LEVEL=trace Loading Loading
CHANGELOG.md +2 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.9.3] - 2024-07 ### New - Added options to scripts/run/gkfs to start GekkoFS without a previous Slurm job allocation (`-A/--account`, `-P/--partition`) and to set a walltime limit (`-t/--time`) ([!206](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/206)). - Added a write size cache to the file system client to reduce potential metadata network bottlenecks during small I/O operations ([!193](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/193)). - The cache is experimental and thus disabled by default. Added the following environment variables. Loading
README.md +3 −0 Original line number Diff line number Diff line Loading @@ -166,6 +166,9 @@ Options: --enable-prometheus Enables prometheus output and a corresponding thread. --prometheus-gateway TEXT Defines the prometheus gateway <ip:port> (Default 127.0.0.1:9091). --version Print version and exit. -t,--time TEXT Set a limit on the total run time of the Slurm job allocation (Default 00:15:00). -A,--account TEXT Account for the Slurm job (only required when starting without an existing job allocation). -P,--partition TEXT Partition for the Slurm job (only required when starting without an existing job allocation). ``` It is possible to run multiple independent GekkoFS instances on the same node. Note that when these GekkoFS instances Loading
scripts/run/gkfs +48 −5 Original line number Diff line number Diff line Loading @@ -92,6 +92,8 @@ write_pid_file() { ####################################### # Starts GekkoFS daemons. # Globals: # SLURM_ACCOUNT # SLURM_PARTITION # SLURM_JOB_ID # NODE_NUM # MOUNTDIR Loading Loading @@ -120,8 +122,21 @@ start_daemons() { local srun_proxy_cmd local daemon_execute local proxy_execute # only used when started without slurm job local srun_account_partition # setup if [[ ${USE_SRUN} == true ]]; then if [[ -z ${SLURM_JOB_ID} ]]; then if [[ -z ${SLURM_ACCOUNT} || -z ${SLURM_PARTITION} ]]; then echo -e "${C_AST_RED}ERROR: Slurm Account or Partition not set. Exiting ..." exit 1 fi srun_account_partition="--partition=${SLURM_PARTITION} --account=${SLURM_ACCOUNT}" # TODO remove echo -e "observe: ${srun_account_partition}" else srun_account_partition="" fi # check for daemon first if [[ -n ${DAEMON_NODELIST_} ]]; then if [[ ! -f ${DAEMON_NODELIST_} ]]; then Loading @@ -129,14 +144,14 @@ start_daemons() { exit 1 fi NODE_NUM=$(wc -l < "${DAEMON_NODELIST_}") srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) if [[ -z ${NODE_NUM} ]]; then NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) fi # Setting up base srun cmd srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " fi if [[ ${USE_PROXY} == true ]]; then if [[ -n ${PROXY_NODELIST_} ]]; then Loading @@ -145,9 +160,9 @@ start_daemons() { exit 1 fi NODE_NUM_PROXY=$(wc -l < 
"${PROXY_NODELIST_}") srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " NODE_NUM_PROXY=$NODE_NUM fi fi Loading Loading @@ -189,6 +204,16 @@ start_daemons() { if [[ -n ${DAEMON_AFFINITY_} ]]; then daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" fi # add --time parameter to ARGS if [[ -n ${SRUN_TIME} ]]; then SRUN_DAEMON_ARGS="--time=${SRUN_TIME} ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=${SRUN_TIME} ${SRUN_PROXY_ARGS}" else SRUN_DAEMON_ARGS="--time=00:15:00 ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=00:15:00 ${SRUN_PROXY_ARGS}" fi # final daemon execute COMMAND daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" Loading Loading @@ -515,6 +540,9 @@ help_msg() { -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -v, --verbose Increase verbosity -t, --time Set a limit on the total run time of the slurm job allocation. 
-A, --account Account for the slurm job (only required for job allocation) -P, --partition Partition for the slurm job (only required for job allocation) " } CONFIGPATH="" Loading Loading @@ -644,6 +672,21 @@ while [[ $# -gt 0 ]]; do VERBOSE=true shift # past argument ;; -t | --time) SRUN_TIME=$2 shift # past argument shift # past value ;; -A | --account) SLURM_ACCOUNT=$2 shift # past argument shift # past value ;; -P | --partition) SLURM_PARTITION=$2 shift # past argument shift # past value ;; *) # unknown option POSITIONAL+=("$1") # save it in an array for later shift # past argument Loading
scripts/run/gkfs.conf +4 −0 Original line number Diff line number Diff line Loading @@ -44,6 +44,10 @@ SRUN_ARGS="--overlap --ntasks-per-node=1 --overcommit --overlap --oversubscribe SRUN_DAEMON_ARGS="" # Specific srun args for proxy SRUN_PROXY_ARGS="" SRUN_TIME="00:15:00" # srun without job allocation SLURM_ACCOUNT="" SLURM_PARTITION="" # logging GKFS_DAEMON_LOG_LEVEL=trace Loading