Commit dad26735 authored by Alberto Miranda's avatar Alberto Miranda ♨️
Browse files

Merge branch 'dep_scripts_update' into 'master'

Minor dependency script updates:

* adding more optional versions to the dependency scripts: RocksDB and syscall_intercept with glibc3 fix as more systems are updated to >glibc3
* Cloning libfabric instead of downloading the tarball. This is because it is configured with a specific autotools version which is not available on all systems. Cloning allows generating `configure` dynamically.
* daemon metadata backend now links to Dynamic Loader which is required by newer systems (not sure when this happened) and newer rocksdb versions
* removing python startup scripts since they are no longer supported and are confusing if part of the git repo

See merge request !57
parents 08f454d0 d55bfd3e
Loading
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -331,6 +331,8 @@ if check_dependency "ofi" "${DEP_CONFIG[@]}"; then
        #libfabric
        CURR=${SOURCE}/libfabric
        prepare_build_dir ${CURR}
        cd ${CURR}
        ./autogen.sh
        cd ${CURR}/build
        OFI_CONFIG="../configure --prefix=${INSTALL} --enable-tcp=yes"
        if check_dependency "verbs" "${DEP_CONFIG[@]}"; then
+17 −11
Original line number Diff line number Diff line
@@ -17,8 +17,8 @@ MOGON1_DEPS=(
)

MOGON2_DEPS=(
    "zstd" "lz4" "snappy" "capstone" "ofi" "mercury" "argobots" "margo" "rocksdb"
    "syscall_intercept" "date"
    "zstd" "lz4" "snappy" "capstone" "ofi-experimental" "mercury" "argobots" "margo" "rocksdb-experimental"
    "syscall_intercept-glibc3" "date" "psm2"
)

DIRECT_DEPS=(
@@ -119,7 +119,7 @@ clonedeps() {
    fi
    # fix the version
    cd "${SOURCE}/${FOLDER}" && git checkout -qf ${COMMIT}
    echo "${ACTION} ${FOLDER} [$COMMIT]"
    echo "${ACTION} '${REPO}' to '${FOLDER}' with commit '[${COMMIT}]' and flags '${GIT_FLAGS}'"

    # apply patch if provided
    if [[ -n "${PATCH}" ]]; then
@@ -150,7 +150,7 @@ wgetdeps() {
    curl ${COMMON_CURL_FLAGS} "$URL" || error_exit "Failed to download ${URL}" $?
    tar -xf "$FILENAME" --directory "${SOURCE}/${FOLDER}" --strip-components=1
    rm -f "$FILENAME"
    echo "Downloaded ${FOLDER}"
    echo "Downloaded '${URL}' to '${FOLDER}'"
}

usage_short() {
@@ -323,13 +323,13 @@ fi
# get libfabric
if [ "${NA_LAYER}" == "ofi" ] || [ "${NA_LAYER}" == "all" ]; then
    if check_dependency "ofi-experimental" "${DEP_CONFIG[@]}"; then
        wgetdeps "libfabric" "https://github.com/ofiwg/libfabric/releases/download/v1.9.1/libfabric-1.9.1.tar.bz2" &
        clonedeps "libfabric" "https://github.com/ofiwg/libfabric.git" "" "-b v1.9.1" &
    elif check_dependency "ofi-verbs" "${DEP_CONFIG[@]}"; then
        # libibverbs 1.2.1-1 used on mogon 1i (installed on system) which is linked to libfabric
        # libfabric 1.8: random RPCs fail to be sent. libfabric 1.9: the RPC client cannot be started when in an MPI environment
        wgetdeps "libfabric" "https://github.com/ofiwg/libfabric/releases/download/v1.7.2/libfabric-1.7.2.tar.gz" &
        clonedeps "libfabric" "https://github.com/ofiwg/libfabric.git" "" "-b v1.7.2" &
    elif check_dependency "ofi" "${DEP_CONFIG[@]}"; then
        wgetdeps "libfabric" "https://github.com/ofiwg/libfabric/releases/download/v1.8.1/libfabric-1.8.1.tar.bz2" &
        clonedeps "libfabric" "https://github.com/ofiwg/libfabric.git" "" "-b v1.8.1" &
    fi
fi

@@ -350,15 +350,21 @@ fi

# get rocksdb
if check_dependency "rocksdb" "${DEP_CONFIG[@]}"; then
    if check_dependency "rocksdb-experimental" "${DEP_CONFIG[@]}"; then
        wgetdeps "rocksdb" "https://github.com/facebook/rocksdb/archive/v6.11.4.tar.gz" &
    else
        wgetdeps "rocksdb" "https://github.com/facebook/rocksdb/archive/v6.2.2.tar.gz" &
elif check_dependency "rocksdb-experimental" "${DEP_CONFIG[@]}"; then
    wgetdeps "rocksdb" "https://github.com/facebook/rocksdb/archive/v6.7.3.tar.gz" &
    fi
fi

# get syscall_intercept
if check_dependency "syscall_intercept" "${DEP_CONFIG[@]}"; then
    if check_dependency "syscall_intercept-glibc3" "${DEP_CONFIG[@]}"; then
        clonedeps "syscall_intercept" "https://github.com/GBuella/syscall_intercept" "ea124fb4ab9eb56bc22a0e94f2b90928c7a88e8c" "-b add_endbr64_and_lea" "syscall_intercept.patch" &
    else
        clonedeps "syscall_intercept" "https://github.com/pmem/syscall_intercept.git" "cc3412a2ad39f2e26cc307d5b155232811d7408e" "" "syscall_intercept.patch" &
    fi
fi

# get date
if check_dependency "date" "${DEP_CONFIG[@]}"; then

scripts/shutdown_gkfs.py

deleted100755 → 0
+0 −164
Original line number Diff line number Diff line
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function
import argparse
import time

import os

from util import util

__author__ = "Marc-Andre Vef"
__email__ = "vef@uni-mainz.de"

# NOTE: a ``global`` statement at module level has no effect in Python; these
# lines only enumerate the module-wide settings used by the functions below.
# The actual values are assigned in the ``__main__`` block before use.
global PRETEND
global PSSH_PATH
global WAITTIME
global PSSH_HOSTFILE_PATH


def check_dependencies():
    """Locate a parallel-ssh executable and store its path in ``PSSH_PATH``.

    Runs ``which pssh`` and, if that prints nothing, ``which parallel-ssh``.
    The first non-empty result is written to the module-level ``PSSH_PATH``.
    If neither lookup yields a path, an error message is printed and the
    script terminates with exit status 1.
    """
    global PSSH_PATH
    for executable in ('pssh', 'parallel-ssh'):
        # Close the pipe deterministically instead of leaking it until GC.
        with os.popen('which %s' % executable) as pipe:
            pssh_path = pipe.read().strip()
        if pssh_path != '':
            PSSH_PATH = pssh_path
            return
    print('[ERR] parallel-ssh/pssh executable cannot be found. Please add it to the parameter list')
    # Raise SystemExit directly; the ``exit`` helper only exists when the
    # ``site`` module has been loaded.
    raise SystemExit(1)


def _report_pssh_failure(line):
    """Print an error banner naming the host of a pssh ``FAILURE`` line, then the line itself."""
    host = line[line.find('FAILURE'):].strip().split(' ')[1]
    print('------------------------- ERROR pssh -- Host "{}" -------------------------'.format(host))
    print(line)


def shutdown_system(daemon_pid_path, nodelist, sigkill):
    """Shuts down GekkoFS on specified nodes.

    Reads the module-level settings ``PSSH_PATH``, ``PRETEND``, ``WAITTIME``
    and ``PSSH_HOSTFILE_PATH``. When ``PRETEND`` is true the commands are only
    printed and nothing is executed.

    Args:
        daemon_pid_path (str): Path to daemon pid file
        nodelist (str): Comma-separated list of nodes where daemons run, or a
            path to an existing nodefile
        sigkill (bool): If true force kills daemons (SIGKILL instead of SIGTERM)

    Raises:
        SystemExit: with status 1 if the pssh hostfile cannot be created, if
            pssh reports a failure, or if the daemon logs do not confirm a
            clean shutdown; with status 0 right after a successful force kill
            (no log check is performed in that case).
    """
    # get absolute paths
    daemon_pid_path = os.path.realpath(os.path.expanduser(daemon_pid_path))
    # An existing path is treated as a nodefile, anything else as a host list.
    nodefile = os.path.exists(nodelist)
    if nodefile and not util.create_pssh_hostfile(nodelist, PSSH_HOSTFILE_PATH):
        raise SystemExit(1)
    # Compare by value: ``is ''`` tests object identity and is not guaranteed
    # to hold for equal strings.
    if PSSH_PATH == '':
        check_dependencies()
    # set pssh arguments
    if nodefile:
        pssh = '%s -O StrictHostKeyChecking=no -i -h "%s"' % (PSSH_PATH, PSSH_HOSTFILE_PATH)
    else:
        pssh = '%s -O StrictHostKeyChecking=no -i -H "%s"' % (PSSH_PATH, nodelist.replace(',', ' '))
    signal_name = 'SIGKILL' if sigkill else 'SIGTERM'
    # NOTE(review): in a Python literal ``\"`` is just ``"``, so the quotes
    # around the pid file path are not escaped for the shell - confirm intent.
    cmd_str = '%s "pkill -%s --pidfile \"%s\""' % (pssh, signal_name, daemon_pid_path)
    if PRETEND:
        print('Pretending: {}'.format(cmd_str))
    else:
        print('Running: {}'.format(cmd_str))
        err = False
        for line in util.exec_shell(cmd_str, True):
            if 'FAILURE' in line.strip()[:30]:
                err = True
                _report_pssh_failure(line)
        if err:
            print('[ERR] with pssh. Aborting...')
            raise SystemExit(1)
        if sigkill:
            print('pssh daemon launch successfully executed. FS daemons have been force killed ...')
            # A successful force kill is not an error: exit with status 0.
            raise SystemExit(0)
        print('pssh daemon launch successfully executed. Checking for FS shutdown errors ...\n')

    if not PRETEND:
        print('Give it some time ({} second) to finish up ...'.format(WAITTIME))
        for i in range(WAITTIME):
            print('{}\r'.format(WAITTIME - i))
            time.sleep(1)
    print('Checking logs ...\n')

    cmd_chk_str = '%s "tail -4 /tmp/gkfs_daemon.log"' % pssh
    if PRETEND:
        print('Pretending: {}'.format(cmd_chk_str))
        return

    print('Running: {}'.format(cmd_chk_str))
    err = False
    fs_err = False
    for line in util.exec_shell(cmd_chk_str, True):
        if line == '':
            continue
        # Same marker as the banner helper searches for ('FAILURE'); the
        # mixed-case 'Failure' never matched what the helper then sliced on.
        if 'FAILURE' in line.strip()[:30]:
            err = True
            _report_pssh_failure(line)
        elif 'All services shut down.' not in line[line.strip().find('\n') + 1:]:
            # The daemon log tail lacks the clean-shutdown marker.
            fs_err = True
            print('------------------------- WARN pssh -- Host "{}" -------------------------'.format(
                line.strip().split(' ')[3].split('\n')[0]))
            print('{}'.format(line[line.find('\n') + 1:]))

    if err or fs_err:
        print('[WARN] while checking fs logs. Something might went wrong when shutting down')
        raise SystemExit(1)
    print('pssh logging check successfully executed. Looks prime.')


if __name__ == "__main__":
    # Command-line interface of the shutdown script.
    cli = argparse.ArgumentParser(
        description='This script stops GekkoFS on multiple nodes',
        formatter_class=argparse.RawTextHelpFormatter)

    # Required positionals: pid file location and the target nodes.
    cli.add_argument('daemonpidpath', type=str, help='path to the daemon pid file')
    cli.add_argument(
        'nodelist', type=str,
        help='''list of nodes where the file system is launched. This can be a comma-separated list
                             or a path to a nodefile (one node per line)''')

    # Optional switches.
    cli.add_argument('-p', '--pretend', action='store_true',
                     help='Output launch command and do not actually execute it')
    cli.add_argument('-9', '--sigkill', action='store_true', help='Force kill daemons')
    cli.add_argument('-P', '--pssh', metavar='<PSSH_PATH>', type=str, default='',
                     help='Path to parallel-ssh/pssh. Defaults to /usr/bin/{parallel-ssh,pssh}')
    cli.add_argument('-J', '--jobid', metavar='<JOBID>', type=str, default='',
                     help='Jobid for cluster batch system. Used for a unique hostfile used for pssh.')
    cli.add_argument('-H', '--pssh_hostfile', metavar='<pssh_hostfile>', type=str,
                     default='/tmp/hostfile_pssh',
                     help='This script creates a hostfile to pass to MPI. This variable defines the path. \n'
                          'Defaults to /tmp/hostfile_pssh')
    opts = cli.parse_args()

    # Publish the parsed options as the module-wide settings read by the
    # functions above.
    PRETEND = opts.pretend is True
    # A job id, when given, is appended so concurrent jobs get distinct hostfiles.
    PSSH_HOSTFILE_PATH = (opts.pssh_hostfile if opts.jobid == ''
                          else '%s_%s' % (opts.pssh_hostfile, opts.jobid))
    PSSH_PATH = opts.pssh
    WAITTIME = 5
    shutdown_system(opts.daemonpidpath, opts.nodelist, opts.sigkill)

    print('\nNothing left to do; exiting. :)')

scripts/startup_gkfs.py

deleted100755 → 0
+0 −212
Original line number Diff line number Diff line
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function
import argparse
import time

import os

from util import util

__author__ = "Marc-Andre Vef"
__email__ = "vef@uni-mainz.de"

# NOTE: a ``global`` statement at module level has no effect in Python; these
# lines only enumerate the module-wide settings used by the functions below.
# The actual values are assigned in the ``__main__`` block before use.
global PRETEND
global PSSH_PATH
global WAITTIME
global PSSH_HOSTFILE_PATH


def check_dependencies():
    """Locate a parallel-ssh executable and store its path in ``PSSH_PATH``.

    Runs ``which pssh`` and, if that prints nothing, ``which parallel-ssh``.
    The first non-empty result is written to the module-level ``PSSH_PATH``.
    If neither lookup yields a path, an error message is printed and the
    script terminates with exit status 1.
    """
    global PSSH_PATH
    for executable in ('pssh', 'parallel-ssh'):
        # Close the pipe deterministically instead of leaking it until GC.
        with os.popen('which %s' % executable) as pipe:
            pssh_path = pipe.read().strip()
        if pssh_path != '':
            PSSH_PATH = pssh_path
            return
    print('[ERR] parallel-ssh/pssh executable cannot be found. Please add it to the parameter list')
    # Raise SystemExit directly; the ``exit`` helper only exists when the
    # ``site`` module has been loaded.
    raise SystemExit(1)


def _report_pssh_failure(line):
    """Print an error banner naming the host of a pssh ``FAILURE`` line, then the line itself."""
    host = line[line.find('FAILURE'):].strip().split(' ')[1]
    print('------------------------- ERROR pssh -- Host "{}" -------------------------'.format(host))
    print(line)


def init_system(daemon_path, rootdir, metadir, mountdir, nodelist, cleanroot, numactl):
    """Initializes GekkoFS on specified nodes.

    Reads the module-level settings ``PSSH_PATH``, ``PRETEND``, ``WAITTIME``
    and ``PSSH_HOSTFILE_PATH``. When ``PRETEND`` is true the commands are only
    printed and nothing is executed.

    Args:
        daemon_path (str): Path to daemon executable
        rootdir (str): Path to root directory for fs data
        metadir (str): Path to metadata directory where metadata is stored;
            an empty string means ``rootdir`` is used
        mountdir (str): Path to the mount directory of the file system
        nodelist (str): Comma-separated list of nodes where daemons need to be
            launched, or a path to an existing nodefile
        cleanroot (bool): if True, root and metadir is cleaned before daemon init
        numactl (str): numactl arguments for daemon init (empty for none)

    Raises:
        SystemExit: with status 1 if the pssh hostfile cannot be created, if
            pssh reports a failure, or if the daemon logs contain errors.
    """
    # get absolute paths
    daemon_path = os.path.realpath(os.path.expanduser(daemon_path))
    mountdir = os.path.realpath(os.path.expanduser(mountdir))
    rootdir = os.path.realpath(os.path.expanduser(rootdir))
    # Replace metadir with rootdir if only rootdir is given
    metadir = os.path.realpath(os.path.expanduser(metadir)) if metadir else rootdir
    # An existing path is treated as a nodefile, anything else as a host list.
    nodefile = os.path.exists(nodelist)
    if nodefile and not util.create_pssh_hostfile(nodelist, PSSH_HOSTFILE_PATH):
        raise SystemExit(1)
    # Compare by value: ``is ''`` tests object identity and is not guaranteed
    # to hold for equal strings.
    if PSSH_PATH == '':
        check_dependencies()
    # set pssh arguments
    if nodefile:
        pssh = '%s -O StrictHostKeyChecking=no -i -h "%s"' % (PSSH_PATH, PSSH_HOSTFILE_PATH)
    else:
        pssh = '%s -O StrictHostKeyChecking=no -i -H "%s"' % (PSSH_PATH, nodelist.replace(',', ' '))

    # clean root and metadata dir if needed
    if cleanroot:
        cmd_rm_str = '%s "rm -rf %s/* %s/* && truncate -s 0 /tmp/gkfs_daemon.log /tmp/gkfs_preload.log"' \
                     % (pssh, rootdir, metadir)
        if PRETEND:
            print('Pretending: {}'.format(cmd_rm_str))
        else:
            print('Running: {}'.format(cmd_rm_str))
            err = False
            for line in util.exec_shell(cmd_rm_str, True):
                if 'FAILURE' in line.strip()[:30]:
                    err = True
                    _report_pssh_failure(line)
            if err:
                print('[ERR] with pssh. Aborting!')
                raise SystemExit(1)
            print('pssh daemon launch successfully executed. Root and Metadata dir are cleaned.\n')

    # Start daemons. Only the launcher prefix (optional numactl) and the host
    # option differ between the variants, so build the command from one template.
    launcher = 'numactl %s %s' % (numactl, daemon_path) if numactl else daemon_path
    host_option = '--hostfile' if nodefile else '--hosts'
    cmd_str = '%s "nohup %s -r %s -i %s -m %s %s %s > /tmp/gkfs_daemon.log 2>&1 &"' \
              % (pssh, launcher, rootdir, metadir, mountdir, host_option, nodelist)

    if PRETEND:
        print('Pretending: {}'.format(cmd_str))
    else:
        print('Running: {}'.format(cmd_str))
        err = False
        for line in util.exec_shell(cmd_str, True):
            if 'FAILURE' in line.strip()[:30]:
                err = True
                _report_pssh_failure(line)
        if err:
            print('[ERR] with pssh. Aborting. Please run shutdown_gkfs.py to shut down orphan daemons!')
            raise SystemExit(1)
        print('pssh daemon launch successfully executed. Checking for FS startup errors ...\n')

    if not PRETEND:
        print('Give it some time ({} second) to startup ...'.format(WAITTIME))
        for i in range(WAITTIME):
            print('{}\r'.format(WAITTIME - i))
            time.sleep(1)

    # Check logs for errors
    cmd_chk_str = '%s "head -5 /tmp/gkfs_daemon.log"' % pssh
    if PRETEND:
        print('Pretending: {}'.format(cmd_chk_str))
        return

    print('Running: {}'.format(cmd_chk_str))
    err = False
    fs_err = False
    for line in util.exec_shell(cmd_chk_str, True):
        # Same marker as the banner helper searches for ('FAILURE'); the
        # mixed-case 'Failure' never matched what the helper then sliced on.
        if 'FAILURE' in line.strip()[:30]:
            err = True
            _report_pssh_failure(line)
            continue
        # check for errors in log
        log_text = line[line.strip().find('\n') + 1:]
        if '[E]' in log_text or 'Assertion `err\'' in log_text:
            fs_err = True
            print('------------------------- ERROR pssh -- Host "{}" -------------------------'.format(
                line.strip().split(' ')[3].split('\n')[0]))
            print('{}'.format(line[line.find('\n') + 1:]))

    if err or fs_err:
        print('[ERR] while checking fs logs. Aborting. Please run shutdown_gkfs.py to shut down orphan daemons!')
        raise SystemExit(1)
    print('pssh logging check successfully executed. Looks prime.')


if __name__ == "__main__":
    # Command-line interface of the startup script.
    cli = argparse.ArgumentParser(
        description='This script launches GekkoFS on multiple nodes',
        formatter_class=argparse.RawTextHelpFormatter)

    # Required positionals, in the order they must be given.
    cli.add_argument('daemonpath', type=str, help='path to the daemon executable')
    cli.add_argument('rootdir', type=str,
                     help='path to the root directory where all data will be stored')
    cli.add_argument('mountdir', type=str,
                     help='path to the mount directory of the file system')
    cli.add_argument('nodelist', type=str,
                     help='list of nodes where the file system is launched. This can be a comma-separated list\n'
                          'or a path to a nodefile (one node per line)')

    # Optional switches.
    cli.add_argument('-i', '--metadir', metavar='<METADIR_PATH>', type=str, default='',
                     help='Path to separate metadir directory where metadata is stored. \n'
                          'If not set, rootdir will be used instead.')
    cli.add_argument('-p', '--pretend', action='store_true',
                     help='Output launch command and do not actually execute it')
    cli.add_argument('-P', '--pssh', metavar='<PSSH_PATH>', type=str, default='',
                     help='Path to parallel-ssh/pssh. Defaults to /usr/bin/{parallel-ssh,pssh}')
    cli.add_argument('-J', '--jobid', metavar='<JOBID>', type=str, default='',
                     help='Jobid for cluster batch system. Used for a unique hostfile used for pssh.')
    cli.add_argument('-c', '--cleanroot', action='store_true',
                     help='Removes contents of root and metadata directory before starting daemon. Be careful!')
    cli.add_argument('-n', '--numactl', metavar='<numactl_args>', type=str, default='',
                     help='If daemon should be pinned to certain cores, set numactl arguments here.')
    cli.add_argument('-H', '--pssh_hostfile', metavar='<pssh_hostfile>', type=str,
                     default='/tmp/hostfile_pssh',
                     help='This script creates a hostfile to pass to MPI. This variable defines the path. \n'
                          'Defaults to /tmp/hostfile_pssh')
    opts = cli.parse_args()

    # Publish the parsed options as the module-wide settings read by the
    # functions above.
    PRETEND = bool(opts.pretend)
    # A job id, when given, is appended so concurrent jobs get distinct hostfiles.
    PSSH_HOSTFILE_PATH = (opts.pssh_hostfile if opts.jobid == ''
                          else '%s_%s' % (opts.pssh_hostfile, opts.jobid))
    PSSH_PATH = opts.pssh
    WAITTIME = 5
    init_system(opts.daemonpath, opts.rootdir, opts.metadir, opts.mountdir, opts.nodelist,
                opts.cleanroot, opts.numactl)

    print('\nNothing left to do; exiting. :)')

scripts/util/__init__.py

deleted100644 → 0
+0 −0

Empty file deleted.

Loading