Loading CHANGELOG.md +2 −0 Original line number Original line Diff line number Diff line Loading @@ -14,6 +14,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.9.3] - 2024-07 ## [0.9.3] - 2024-07 ### New ### New - Added option to scripts/bin/gkfs to start without previous job allocation and walltime parameter ([!206](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/206)). - Added a write size cache to the file system client to reduce potential metadata network bottlenecks during small I/O - Added a write size cache to the file system client to reduce potential metadata network bottlenecks during small I/O operations ([!193](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/193)). operations ([!193](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/193)). - The cache is experimental and thus disabled by default. Added the following environment variables. - The cache is experimental and thus disabled by default. Added the following environment variables. Loading README.md +3 −0 Original line number Original line Diff line number Diff line Loading @@ -166,6 +166,9 @@ Options: --enable-prometheus Enables prometheus output and a corresponding thread. --enable-prometheus Enables prometheus output and a corresponding thread. --prometheus-gateway TEXT Defines the prometheus gateway <ip:port> (Default 127.0.0.1:9091). --prometheus-gateway TEXT Defines the prometheus gateway <ip:port> (Default 127.0.0.1:9091). --version Print version and exit. --version Print version and exit. -t,--time TEXT Set a limit on the total run time of the slurm job allocation. Default is 15min. -A,--account TEXT Account for the slurm job (only required for job allocation) -P,--partition TEXT Partition for the slurm job (only required for job allocation) ``` ``` It is possible to run multiple independent GekkoFS instances on the same node. Note, that when these GekkoFS instances It is possible to run multiple independent GekkoFS instances on the same node. Note, that when these GekkoFS instances Loading scripts/run/gkfs +48 −5 Original line number Original line Diff line number Diff line Loading @@ -92,6 +92,8 @@ write_pid_file() { ####################################### ####################################### # Starts GekkoFS daemons. # Starts GekkoFS daemons. # Globals: # Globals: # SLURM_ACCOUNT # SLURM_PARTITION # SLURM_JOB_ID # SLURM_JOB_ID # NODE_NUM # NODE_NUM # MOUNTDIR # MOUNTDIR Loading Loading @@ -120,8 +122,21 @@ start_daemons() { local srun_proxy_cmd local srun_proxy_cmd local daemon_execute local daemon_execute local proxy_execute local proxy_execute # only used when started without slurm job local srun_account_partition # setup # setup if [[ ${USE_SRUN} == true ]]; then if [[ ${USE_SRUN} == true ]]; then if [[ -z ${SLURM_JOB_ID} ]]; then if [[ -z ${SLURM_ACCOUNT} || -z ${SLURM_PARTITION} ]]; then echo -e "${C_AST_RED}ERROR: Slurm Account or Partition not set. Exiting ..." exit 1 fi srun_account_partition="--partition=${SLURM_PARTITION} --account=${SLURM_ACCOUNT}" # TODO remove echo -e "observe: ${srun_account_partition}" else srun_account_partition="" fi # check for daemon first # check for daemon first if [[ -n ${DAEMON_NODELIST_} ]]; then if [[ -n ${DAEMON_NODELIST_} ]]; then if [[ ! -f ${DAEMON_NODELIST_} ]]; then if [[ ! -f ${DAEMON_NODELIST_} ]]; then Loading @@ -129,14 +144,14 @@ start_daemons() { exit 1 exit 1 fi fi NODE_NUM=$(wc -l < "${DAEMON_NODELIST_}") NODE_NUM=$(wc -l < "${DAEMON_NODELIST_}") srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else else node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) if [[ -z ${NODE_NUM} ]]; then if [[ -z ${NODE_NUM} ]]; then NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) fi fi # Setting up base srun cmd # Setting up base srun cmd srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " fi fi if [[ ${USE_PROXY} == true ]]; then if [[ ${USE_PROXY} == true ]]; then if [[ -n ${PROXY_NODELIST_} ]]; then if [[ -n ${PROXY_NODELIST_} ]]; then Loading @@ -145,9 +160,9 @@ start_daemons() { exit 1 exit 1 fi fi NODE_NUM_PROXY=$(wc -l < "${PROXY_NODELIST_}") NODE_NUM_PROXY=$(wc -l < "${PROXY_NODELIST_}") srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else else srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " NODE_NUM_PROXY=$NODE_NUM NODE_NUM_PROXY=$NODE_NUM fi fi fi fi Loading Loading @@ -189,6 +204,16 @@ start_daemons() { if [[ -n ${DAEMON_AFFINITY_} ]]; then if [[ -n ${DAEMON_AFFINITY_} ]]; then daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" fi fi # add --time parameter to ARGS if [[ -n ${SRUN_TIME} ]]; then SRUN_DAEMON_ARGS="--time=${SRUN_TIME} ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=${SRUN_TIME} ${SRUN_PROXY_ARGS}" else SRUN_DAEMON_ARGS="--time=00:15:00 ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=00:15:00 ${SRUN_PROXY_ARGS}" fi # final daemon execute COMMAND # final daemon execute COMMAND daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" Loading Loading @@ -515,6 +540,9 @@ help_msg() { -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -v, --verbose Increase verbosity -v, --verbose Increase verbosity -t, --time Set a limit on the total run time of the slurm job allocation. -A, --account Account for the slurm job (only required for job allocation) -P, --partition Partition for the slurm job (only required for job allocation) " " } } CONFIGPATH="" CONFIGPATH="" Loading Loading @@ -644,6 +672,21 @@ while [[ $# -gt 0 ]]; do VERBOSE=true VERBOSE=true shift # past argument shift # past argument ;; ;; -t | --time) SRUN_TIME=$2 shift # past argument shift # past value ;; -A | --account) SLURM_ACCOUNT=$2 shift # past argument shift # past value ;; -P | --partition) SLURM_PARTITION=$2 shift # past argument shift # past value ;; *) # unknown option *) # unknown option POSITIONAL+=("$1") # save it in an array for later POSITIONAL+=("$1") # save it in an array for later shift # past argument shift # past argument Loading scripts/run/gkfs.conf +4 −0 Original line number Original line Diff line number Diff line Loading @@ -44,6 +44,10 @@ SRUN_ARGS="--overlap --ntasks-per-node=1 --overcommit --overlap --oversubscribe SRUN_DAEMON_ARGS="" SRUN_DAEMON_ARGS="" # Specific srun args for proxy # Specific srun args for proxy SRUN_PROXY_ARGS="" SRUN_PROXY_ARGS="" SRUN_TIME="00:15:00" # srun without job allocation SLURM_ACCOUNT="" SLURM_PARTITION="" # logging # logging GKFS_DAEMON_LOG_LEVEL=trace GKFS_DAEMON_LOG_LEVEL=trace Loading Loading
CHANGELOG.md +2 −0 Original line number Original line Diff line number Diff line Loading @@ -14,6 +14,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.9.3] - 2024-07 ## [0.9.3] - 2024-07 ### New ### New - Added option to scripts/bin/gkfs to start without previous job allocation and walltime parameter ([!206](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/206)). - Added a write size cache to the file system client to reduce potential metadata network bottlenecks during small I/O - Added a write size cache to the file system client to reduce potential metadata network bottlenecks during small I/O operations ([!193](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/193)). operations ([!193](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/193)). - The cache is experimental and thus disabled by default. Added the following environment variables. - The cache is experimental and thus disabled by default. Added the following environment variables. Loading
README.md +3 −0 Original line number Original line Diff line number Diff line Loading @@ -166,6 +166,9 @@ Options: --enable-prometheus Enables prometheus output and a corresponding thread. --enable-prometheus Enables prometheus output and a corresponding thread. --prometheus-gateway TEXT Defines the prometheus gateway <ip:port> (Default 127.0.0.1:9091). --prometheus-gateway TEXT Defines the prometheus gateway <ip:port> (Default 127.0.0.1:9091). --version Print version and exit. --version Print version and exit. -t,--time TEXT Set a limit on the total run time of the slurm job allocation. Default is 15min. -A,--account TEXT Account for the slurm job (only required for job allocation) -P,--partition TEXT Partition for the slurm job (only required for job allocation) ``` ``` It is possible to run multiple independent GekkoFS instances on the same node. Note, that when these GekkoFS instances It is possible to run multiple independent GekkoFS instances on the same node. Note, that when these GekkoFS instances Loading
scripts/run/gkfs +48 −5 Original line number Original line Diff line number Diff line Loading @@ -92,6 +92,8 @@ write_pid_file() { ####################################### ####################################### # Starts GekkoFS daemons. # Starts GekkoFS daemons. # Globals: # Globals: # SLURM_ACCOUNT # SLURM_PARTITION # SLURM_JOB_ID # SLURM_JOB_ID # NODE_NUM # NODE_NUM # MOUNTDIR # MOUNTDIR Loading Loading @@ -120,8 +122,21 @@ start_daemons() { local srun_proxy_cmd local srun_proxy_cmd local daemon_execute local daemon_execute local proxy_execute local proxy_execute # only used when started without slurm job local srun_account_partition # setup # setup if [[ ${USE_SRUN} == true ]]; then if [[ ${USE_SRUN} == true ]]; then if [[ -z ${SLURM_JOB_ID} ]]; then if [[ -z ${SLURM_ACCOUNT} || -z ${SLURM_PARTITION} ]]; then echo -e "${C_AST_RED}ERROR: Slurm Account or Partition not set. Exiting ..." exit 1 fi srun_account_partition="--partition=${SLURM_PARTITION} --account=${SLURM_ACCOUNT}" # TODO remove echo -e "observe: ${srun_account_partition}" else srun_account_partition="" fi # check for daemon first # check for daemon first if [[ -n ${DAEMON_NODELIST_} ]]; then if [[ -n ${DAEMON_NODELIST_} ]]; then if [[ ! -f ${DAEMON_NODELIST_} ]]; then if [[ ! -f ${DAEMON_NODELIST_} ]]; then Loading @@ -129,14 +144,14 @@ start_daemons() { exit 1 exit 1 fi fi NODE_NUM=$(wc -l < "${DAEMON_NODELIST_}") NODE_NUM=$(wc -l < "${DAEMON_NODELIST_}") srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status --nodelist=${DAEMON_NODELIST_} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else else node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2) if [[ -z ${NODE_NUM} ]]; then if [[ -z ${NODE_NUM} ]]; then NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l) fi fi # Setting up base srun cmd # Setting up base srun cmd srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_daemon_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " fi fi if [[ ${USE_PROXY} == true ]]; then if [[ ${USE_PROXY} == true ]]; then if [[ -n ${PROXY_NODELIST_} ]]; then if [[ -n ${PROXY_NODELIST_} ]]; then Loading @@ -145,9 +160,9 @@ start_daemons() { exit 1 exit 1 fi fi NODE_NUM_PROXY=$(wc -l < "${PROXY_NODELIST_}") NODE_NUM_PROXY=$(wc -l < "${PROXY_NODELIST_}") srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status --nodelist=${PROXY_NODELIST_} --ntasks=${NODE_NUM_PROXY} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " else else srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${SRUN_ARGS} " srun_proxy_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --cpus-per-task=${CPUS_PER_TASK} ${srun_account_partition} ${SRUN_ARGS} " NODE_NUM_PROXY=$NODE_NUM NODE_NUM_PROXY=$NODE_NUM fi fi fi fi Loading Loading @@ -189,6 +204,16 @@ start_daemons() { if [[ -n ${DAEMON_AFFINITY_} ]]; then if [[ -n ${DAEMON_AFFINITY_} ]]; then daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" fi fi # add --time parameter to ARGS if [[ -n ${SRUN_TIME} ]]; then SRUN_DAEMON_ARGS="--time=${SRUN_TIME} ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=${SRUN_TIME} ${SRUN_PROXY_ARGS}" else SRUN_DAEMON_ARGS="--time=00:15:00 ${SRUN_DAEMON_ARGS}" SRUN_PROXY_ARGS="--time=00:15:00 ${SRUN_PROXY_ARGS}" fi # final daemon execute COMMAND # final daemon execute COMMAND daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" Loading Loading @@ -515,6 +540,9 @@ help_msg() { -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -v, --verbose Increase verbosity -v, --verbose Increase verbosity -t, --time Set a limit on the total run time of the slurm job allocation. -A, --account Account for the slurm job (only required for job allocation) -P, --partition Partition for the slurm job (only required for job allocation) " " } } CONFIGPATH="" CONFIGPATH="" Loading Loading @@ -644,6 +672,21 @@ while [[ $# -gt 0 ]]; do VERBOSE=true VERBOSE=true shift # past argument shift # past argument ;; ;; -t | --time) SRUN_TIME=$2 shift # past argument shift # past value ;; -A | --account) SLURM_ACCOUNT=$2 shift # past argument shift # past value ;; -P | --partition) SLURM_PARTITION=$2 shift # past argument shift # past value ;; *) # unknown option *) # unknown option POSITIONAL+=("$1") # save it in an array for later POSITIONAL+=("$1") # save it in an array for later shift # past argument shift # past argument Loading
scripts/run/gkfs.conf +4 −0 Original line number Original line Diff line number Diff line Loading @@ -44,6 +44,10 @@ SRUN_ARGS="--overlap --ntasks-per-node=1 --overcommit --overlap --oversubscribe SRUN_DAEMON_ARGS="" SRUN_DAEMON_ARGS="" # Specific srun args for proxy # Specific srun args for proxy SRUN_PROXY_ARGS="" SRUN_PROXY_ARGS="" SRUN_TIME="00:15:00" # srun without job allocation SLURM_ACCOUNT="" SLURM_PARTITION="" # logging # logging GKFS_DAEMON_LOG_LEVEL=trace GKFS_DAEMON_LOG_LEVEL=trace Loading