Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/bin/bash
# Global configuration for the GekkoFS daemon launcher.
# Values come from three places, in increasing precedence: gkfs.conf (sourced
# below), the defaults assigned here, and the command-line options parsed
# further down in this file.

# Exported so that every child process started by this script inherits them.
# NOTE(review): presumably libfabric/PSM2 provider settings — confirm.
export FI_PSM2_DISCONNECT=1
export PSM2_MULTI_EP=1
# Physical (symlink-free, because of `pwd -P`) directory containing this script.
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
# The configuration file is expected to sit next to the script.
CONFIGPATH="${SCRIPTDIR}/gkfs.conf"
# Pulls in the DAEMON_* / LIBGKFS_* / GKFS_* settings used throughout this file.
source "$CONFIGPATH"
# The assignments below run after the config is sourced, so they override any
# variable of the same name that the config may have set.
VERBOSE=false
NODE_NUM=1
# Working copies of config values; -m / -r on the command line replace them.
MOUNTDIR=${DAEMON_MOUNTDIR}
ROOTDIR=${DAEMON_ROOTDIR}
# Hosts file handed to the daemon via -H; its line count is polled during
# startup to detect when all daemons are up.
HOSTSFILE=${LIBGKFS_HOSTS_FILE}
# Default: number of "processor" entries listed in /proc/cpuinfo.
CPUS_PER_TASK=$(grep -c ^processor /proc/cpuinfo)
# Extra daemon arguments; -a on the command line replaces them.
ARGS=${DAEMON_ARGS}
USE_SRUN=false
RUN_FOREGROUND=false
#######################################
# Block until the hosts file lists one line per expected daemon.
# Globals:
#   NODE_NUM  (read) expected number of daemons; 1 is used when it is empty
#   HOSTSFILE (read) file whose line count is polled
# Outputs:
#   Prints a failure message to stdout when giving up.
# Returns:
#   0 once the line count matches; exits the script with status 1 after more
#   than 600 polls (2 seconds apart).
#######################################
wait_for_gkfs_daemons() {
    local expected="${NODE_NUM:-1}"
    local attempts=0
    local registered

    # Initial grace period before the first look at the hosts file.
    sleep 2
    while true; do
        # A missing/unreadable hosts file yields an empty string, which the
        # arithmetic expansion below turns into 0.
        registered=$(wc -l "${HOSTSFILE}" 2> /dev/null | awk '{print $1}')
        [ $((${registered} + 0)) -eq "${expected}" ] && break
        sleep 2
        attempts=$((attempts + 1))
        if [ ${attempts} -gt 600 ]; then
            echo "Server failed to start. Exiting ..."
            exit 1
        fi
    done
}
#######################################
# Append a pid to the pid file, first pruning entries of dead processes.
# Globals:
#   DAEMON_PID_FILE (read) path of the pid file; "<path>.swp" is used as scratch
#   VERBOSE         (read) print a progress message when "true"
# Arguments:
#   $1 - pid to record
# Outputs:
#   Progress message on stdout in verbose mode.
#######################################
create_pid_file() {
    local pid_file=${DAEMON_PID_FILE}
    local pid=${1}
    local pid_file_tmp
    local line

    if [[ $VERBOSE == true ]]; then
        echo "Creating pid file at ${pid_file} with pid ${pid} ..."
    fi
    # An existing pid file means other daemons may still be running: keep only
    # the entries whose process is still alive.
    if [[ -e ${pid_file} ]]; then
        pid_file_tmp=${pid_file}.swp
        # start from an empty scratch file
        : > "${pid_file_tmp}"
        # `|| [[ -n ${line} ]]` keeps a final entry that lacks a trailing
        # newline; without it a live pid on such a line would be dropped.
        while IFS= read -r line || [[ -n ${line} ]]; do
            # blank lines carry no pid; skip them instead of handing ps an
            # empty argument
            [[ -n ${line} ]] || continue
            if ps -p "${line}" > /dev/null 2>&1; then
                # process with this pid is still running
                echo "${line}" >> "${pid_file_tmp}"
            fi
        done < "${pid_file}"
        # replace the pid file with the pruned list
        mv "${pid_file_tmp}" "${pid_file}"
    fi
    echo "${pid}" >> "${pid_file}"
}
#######################################
# Launch the daemon(s), wait until they have registered in the hosts file and
# then either stay in the foreground or record the launcher pid.
# Globals (read):
#   USE_SRUN, SLURM_JOB_ID, CPUS_PER_TASK, MOUNTDIR, ROOTDIR, HOSTSFILE, ARGS,
#   DAEMON_BIN, DAEMON_NUMACTL, DAEMON_CPUNODEBIND, DAEMON_MEMBIND,
#   GKFS_DAEMON_LOG_PATH, GKFS_DAEMON_LOG_LEVEL, RUN_FOREGROUND, VERBOSE
# Globals (written):
#   NODE_NUM - filled from the Slurm node list when empty and srun is used
# Outputs:
#   Progress messages on stdout.
# Foreground mode:
#   Prompts for 'q' and then sends SIGINT to the launched process. When the
#   key cannot be read and stdout is not a terminal (e.g. output redirected to
#   a file), the function waits for the launched process instead of
#   re-prompting in an endless loop.
#######################################
start_daemon() {
    local node_list
    local srun_cmd
    local daemon_cmd
    local daemon_execute
    local daemon_pid
    local k

    # setup
    if [[ ${USE_SRUN} == true ]]; then
        node_list=$(scontrol show job "${SLURM_JOB_ID}" | grep " NodeList=" | cut -d "=" -f2)
        if [[ -z ${NODE_NUM} ]]; then
            NODE_NUM=$(scontrol show hostname "${node_list}" | wc -l)
        fi
        # Base srun command; the trailing space separates it from the daemon
        # command appended below.
        srun_cmd="srun --disable-status -N ${NODE_NUM} --ntasks=${NODE_NUM} --ntasks-per-node=1 --overcommit --contiguous --cpus-per-task=${CPUS_PER_TASK} --oversubscribe --mem=0 "
    fi

    if [[ ${VERBOSE} == true ]]; then
        echo "### mountdir: ${MOUNTDIR}"
        echo "### rootdir: ${ROOTDIR}"
        echo "### node_num: ${NODE_NUM}"
        echo "### args: ${ARGS}"
        echo "### cpus_per_task: ${CPUS_PER_TASK}"
        echo "# Cleaning host file ..."
    fi
    # A stale hosts file would make the startup wait succeed too early; it is
    # fine if the file does not exist.
    rm -f -- "${HOSTSFILE}" 2> /dev/null

    # Base daemon command
    daemon_cmd="${DAEMON_BIN} -r ${ROOTDIR} -m ${MOUNTDIR} -H ${HOSTSFILE} ${ARGS}"
    # Optional NUMA pinning
    if [[ ${DAEMON_NUMACTL} == true ]]; then
        daemon_cmd="numactl --cpunodebind=${DAEMON_CPUNODEBIND} --membind=${DAEMON_MEMBIND} ${daemon_cmd}"
    fi
    # Final command line (srun_cmd is empty unless --srun was given)
    daemon_execute="${srun_cmd}${daemon_cmd}"
    if [[ ${VERBOSE} == true ]]; then
        echo "### Full execute DAEMON command:"
        echo "##### $daemon_execute"
    fi

    # Hand the log settings to the daemon through its environment. The explicit
    # self-assignment exports an empty value even when the variable is unset.
    export GKFS_DAEMON_LOG_PATH="${GKFS_DAEMON_LOG_PATH}"
    export GKFS_DAEMON_LOG_LEVEL="${GKFS_DAEMON_LOG_LEVEL}"

    echo "Starting daemons ..."
    # Intentionally unquoted: the command string must be split into words.
    # shellcheck disable=SC2086
    ${daemon_execute} &
    daemon_pid=$!
    wait_for_gkfs_daemons
    echo "Running ..."

    if [[ ${RUN_FOREGROUND} != true ]]; then
        create_pid_file "${daemon_pid}"
        return
    fi

    echo "Press 'q' to exit"
    while : ; do
        # The key is read from the descriptor behind stdout (the terminal).
        if ! read -r -n 1 k <&1; then
            if [[ ! -t 1 ]]; then
                # Nothing can ever be read here, so re-prompting would spin
                # forever. Stay in the foreground by waiting for the launched
                # process to finish instead.
                echo "No interactive input available. Waiting for daemons to exit ..."
                wait "${daemon_pid}"
                break
            fi
        fi
        if [[ $k = q ]]; then
            echo
            echo "Shutting down ..."
            if [[ -n ${daemon_pid} ]]; then
                kill -s SIGINT "${daemon_pid}" &
                wait "${daemon_pid}"
            fi
            break
        else
            echo "Press 'q' to exit"
        fi
    done
}
#######################################
# Send SIGINT to every live process listed in the pid file, then delete it.
# Globals:
#   DAEMON_PID_FILE (read) path of the pid file
#   VERBOSE         (read) print progress messages when "true"
# Outputs:
#   Progress messages on stdout; a notice when no pid file exists.
# Notes:
#   Each process is given at most one second to exit before the next entry is
#   handled; the pid file is removed regardless.
#######################################
stop_daemons() {
    local pid_file=${DAEMON_PID_FILE}
    local line

    if [[ ! -e ${pid_file} ]]; then
        echo "No pid file found -> no daemon running. Exiting ..."
        return
    fi

    # `|| [[ -n ${line} ]]` also handles a final entry without a trailing
    # newline; otherwise that daemon would never be signalled although the pid
    # file is removed afterwards.
    while IFS= read -r line || [[ -n ${line} ]]; do
        # blank lines carry no pid
        [[ -n ${line} ]] || continue
        if ps -p "${line}" > /dev/null 2>&1; then
            if [[ $VERBOSE == true ]]; then
                echo "Stopping daemon with pid ${line}"
            fi
            kill -s SIGINT "${line}" &
            if [[ $VERBOSE == true ]]; then
                echo "Waiting for daemons to exit ..."
            fi
            # `tail --pid` returns as soon as the process is gone; `timeout`
            # caps the wait at one second.
            timeout 1 tail --pid="${line}" -f /dev/null
        fi
    done < "${pid_file}"
    rm -- "${pid_file}"
}
# Print the one-screen usage synopsis to stdout, framed by a blank line above
# and below.
usage_short() {
    printf '%s\n' \
        '' \
        'usage: gkfs.sh [-h] [-r/--rootdir <config>] [-m/--mountdir <config>] [-n/--numnodes <jobsize>] [-f/--foreground <false>]' \
        '[-a/--args <daemon_args>] [--srun <false>] [-c/--cpuspertask <64>] [-v/--verbose <false>]' \
        '{start,stop}' \
        ''
}
# Show the help text (-h/--help); it consists solely of the short usage
# synopsis.
help_msg() { usage_short; }
# parse input
#######################################
# Parse command-line options into the global settings.
# Globals (written):
#   ROOTDIR, MOUNTDIR, NODE_NUM, ARGS, CPUS_PER_TASK - from value options
#   USE_SRUN, RUN_FOREGROUND, VERBOSE                - set to "true" by flags
#   POSITIONAL - array collecting every argument that is not a known option
# Arguments:
#   The script's command-line arguments.
# Outputs:
#   Help text for -h/--help; an error plus usage when a value is missing.
# Exits:
#   0 after printing help; 1 when a value-taking option is the last argument.
#######################################
parse_args() {
    local key
    while [[ $# -gt 0 ]]; do
        key="$1"
        # Value-taking options must be followed by something. An explicitly
        # empty value ("") is still accepted; only a missing one is rejected,
        # instead of silently storing an empty setting.
        case ${key} in
            -r | --rootdir | -m | --mountdir | -n | --numnodes | -a | --args | -c | --cpuspertask)
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: option ${key} requires a value"
                    usage_short
                    exit 1
                fi
                ;;
        esac
        case ${key} in
            -r | --rootdir)
                ROOTDIR=$2
                shift 2 # past argument and value
                ;;
            -m | --mountdir)
                MOUNTDIR=$2
                shift 2 # past argument and value
                ;;
            -n | --numnodes)
                NODE_NUM=$2
                shift 2 # past argument and value
                ;;
            -a | --args)
                ARGS=$2
                shift 2 # past argument and value
                ;;
            -c | --cpuspertask)
                CPUS_PER_TASK=$2
                shift 2 # past argument and value
                ;;
            --srun)
                USE_SRUN=true
                shift # past argument
                ;;
            -f | --foreground)
                RUN_FOREGROUND=true
                shift # past argument
                ;;
            -v | --verbose)
                VERBOSE=true
                shift # past argument
                ;;
            -h | --help)
                help_msg
                exit
                ;;
            *) # unknown option
                POSITIONAL+=("$1") # save it in an array for later
                shift # past argument
                ;;
        esac
    done
}

POSITIONAL=()
parse_args "$@"
set -- "${POSITIONAL[@]}" # restore positional parameters
# positional arguments
# The first remaining argument selects the action to perform.
if [[ $# -eq 0 ]]; then
    echo "ERROR: Positional arguments missing."
    usage_short
    exit 1
fi
command="${1}"
# Match the command exactly. A substring check would let values such as
# "restart" or "stopx" pass validation and then silently do nothing.
case "${command}" in
    start)
        start_daemon
        ;;
    stop)
        stop_daemons
        ;;
    *)
        echo "ERROR: command ${command} not supported"
        usage_short
        exit 1
        ;;
esac
if [[ $VERBOSE == true ]]; then
    echo "Nothing left to do. Exiting :)"
fi