Commit 1c24c047 authored by Marc Vef's avatar Marc Vef
Browse files

Merge branch 'opa_support'

parents a2776cd4 c3524a46
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ struct FsConfig {

    // rpc infos
    std::map<uint64_t, std::string> hosts;
    std::map<std::string, std::string> sys_hostfile;
    uint64_t host_id; // my host number
    size_t host_size;
    std::string rpc_port;
@@ -114,6 +115,8 @@ std::string daemon_register_path(int pid);

bool get_daemon_auxiliaries();

bool read_system_hostfile();

bool get_addr_by_hostid(uint64_t hostid, hg_addr_t& svr_addr);

size_t get_rpc_node(const std::string& to_hash);
+22 −17
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ optional arguments:
				defaults to 'all'
	-c <CLUSTER>, --cluster <CLUSTER>
				additional configurations for specific compute clusters
				supported clusters: {mogon1,fh2}
				supported clusters: {mogon1,mogon2,fh2}
	-j <COMPILE_CORES>, --compilecores <COMPILE_CORES>
				number of cores that are used to compile the depdencies
				defaults to number of available cores
@@ -115,7 +115,7 @@ else
    exit
fi
if [[ "${CLUSTER}" != "" ]]; then
	if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) ]]; then
	if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) || ( "${CLUSTER}" == "mogon2" ) ]]; then
		echo CLUSTER  = "${CLUSTER}"
    else
        echo "${CLUSTER} cluster configuration is invalid. Exiting ..."
@@ -138,7 +138,7 @@ echo "Install path = '$2'";
mkdir -p ${SOURCE}

# Set cluster dependencies first
if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) ]]; then
if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) || ( "${CLUSTER}" == "mogon2" ) ]]; then
    # get libtool
    echo "############################################################ Installing:  libtool"
    CURR=${SOURCE}/libtool
@@ -223,6 +223,8 @@ fi

if [ "$NA_LAYER" == "ofi" ] || [ "$NA_LAYER" == "all" ]; then
    USE_OFI="-DNA_USE_OFI:BOOL=ON"
    # Mogon2 already has libfabric installed in a version that Mercury supports.
    if [[ ("${CLUSTER}" != "mogon2") ]]; then
        echo "############################################################ Installing:  LibFabric"
        #libfabric
        CURR=${SOURCE}/libfabric
@@ -235,6 +237,7 @@ if [ "$NA_LAYER" == "ofi" ] || [ "$NA_LAYER" == "all" ]; then
        make install || exit 1
        make check || exit 1
    fi
fi

echo "############################################################ Installing:  Mercury"

@@ -242,9 +245,11 @@ echo "############################################################ Installing:
CURR=${SOURCE}/mercury
prepare_build_dir ${CURR}
cd ${CURR}
if [ "$NA_LAYER" == "cci" ] || [ "$NA_LAYER" == "all" ]; then
    # patch cci verbs addr lookup error handling
    echo "########## Applying cci addr lookup error handling patch"
    git apply ${PATCH_DIR}/mercury_cci_verbs_lookup.patch || exit 1
fi
cd ${CURR}/build
# XXX Note: USE_EAGER_BULK is temporarily disabled due to bugs in Mercury with smaller amounts of data
# Apparently this is fixed in the new Mercury version. TODO check if it works now
+5 −10
Original line number Diff line number Diff line
@@ -171,22 +171,17 @@ if [ "${NA_LAYER}" == "cci" ] || [ "${NA_LAYER}" == "all" ]; then
fi
# get libfabric
if [ "${NA_LAYER}" == "ofi" ] || [ "${NA_LAYER}" == "all" ]; then
    clonedeps "libfabric" "git clone https://github.com/ofiwg/libfabric" "tags/v1.5.2"
    clonedeps "libfabric" "git clone https://github.com/ofiwg/libfabric" "tags/v1.5.3"
fi
# get Mercury
clonedeps "mercury" "git clone --recurse-submodules https://github.com/mercury-hpc/mercury" "2a7369db11c7d9e962a8d59f1852a04c4ff57694"
clonedeps "mercury" "git clone --recurse-submodules https://github.com/mercury-hpc/mercury" "6c82baf7819a553b6b8235fbe7c180989a1e17fe"
# get Argobots
clonedeps "argobots" "git clone -b dev-get-dev-basic https://github.com/carns/argobots.git" "fd6ae0f4613187a2c73fceee8a2718d54719bcab"
clonedeps "argobots" "git clone -b dev-get-dev-basic https://github.com/carns/argobots.git" "78ceea28ed44faca12cf8ea7f5687b894c66a8c4"
# get Argobots-snoozer
clonedeps "abt-snoozer" "git clone https://xgitlab.cels.anl.gov/sds/abt-snoozer.git" "3d9240eda290bfb89f08a5673cebd888194a4bd7"
# get Margo
clonedeps "margo" "git clone https://xgitlab.cels.anl.gov/sds/margo.git" "30521c87a5d6b531b894877f40f0f34ef8ecd541"
clonedeps "margo" "git clone https://xgitlab.cels.anl.gov/sds/margo.git" "3f9fe3a13392af1ba6df1b3d3bb16503da6b627d"
# get rocksdb
clonedeps "rocksdb" "git clone https://github.com/facebook/rocksdb" "tags/v5.8"

# Commit values used before 17.01.2018
# Mercury: afd70055d21a6df2faefe38d5f6ce1ae11f365a5
# Argobots: a5a6b2036c75ad05804ccb72d2fe31cea1bfef88
# Margo: 68ef7f14178e9066cf38846d90d451e00aaca61d
clonedeps "rocksdb" "git clone https://github.com/facebook/rocksdb" "tags/v5.10.2"

echo "Nothing left to do. Exiting."
+6 −1
Original line number Diff line number Diff line
@@ -141,7 +141,8 @@ bool init_margo_client(Margo_mode mode, const string na_plugin) {
    // Init Mercury layer (must be finalized when finished)
    hg_class_t* hg_class;
    hg_context_t* hg_context;
    hg_class = HG_Init(na_plugin.c_str(), HG_FALSE);
    // Note: localhost should not be required and actually doesn't do anything. But it is required for OFI for Mercury to start
    hg_class = HG_Init((na_plugin + "://localhost"s).c_str(), HG_FALSE);
    if (hg_class == nullptr) {
        ld_logger->error("{}() HG_Init() Failed to init Mercury client layer", __func__);
        return false;
@@ -217,6 +218,10 @@ void init_ld_environment_() {
        ld_logger->error("{}() Unable to initialize Margo RPC client.", __func__);
        exit(EXIT_FAILURE);
    }
    if (!read_system_hostfile()) {
        ld_logger->error("{}() Unable to read system hostfile /etc/hosts for address mapping.", __func__);
        exit(EXIT_FAILURE);
    }
    ld_logger->info("{}() Environment initialization successful.", __func__);
}

+49 −7
Original line number Diff line number Diff line
@@ -3,6 +3,8 @@

#include <dirent.h>
#include <fstream>
#include <iterator>
#include <sstream>

using namespace std;

@@ -250,6 +252,33 @@ bool get_daemon_auxiliaries() {
    return ret;
}

/**
 * Read /etc/hosts and put the hostname - ip association into a map in fs config.
 * We are working with hostnames but some network layers (such as Omnipath) do not look into /etc/hosts.
 * Hence, we have to store the mapping ourselves.
 *
 * Each non-comment line is expected to be `<ip> <name> [<alias> ...]`. Only names that contain
 * HOSTNAME_SUFFIX are stored. If a name is listed more than once, the first entry wins.
 * On success the previous content of fs_config->sys_hostfile is replaced.
 * @return false if /etc/hosts could not be opened, true otherwise
 */
bool read_system_hostfile() {
    std::ifstream hostfile("/etc/hosts");
    if (!hostfile.is_open())
        return false;
    std::map<std::string, std::string> sys_hostfile;
    std::string line;
    while (std::getline(hostfile, line)) {
        // Everything from '#' to the end of the line is a comment. Cutting it here covers full-line
        // comments (also indented ones) as well as trailing comments behind the host names, so that
        // words of a comment are never registered as host aliases.
        const auto comment_pos = line.find('#');
        if (comment_pos != std::string::npos)
            line.erase(comment_pos);
        std::istringstream iss(line);
        // The first whitespace separated token is the ip address; blank lines yield no token at all
        std::string ip;
        if (!(iss >> ip))
            continue;
        // All remaining tokens are the canonical host name and its aliases
        std::string name;
        while (iss >> name) {
            if (name.find(HOSTNAME_SUFFIX) != std::string::npos)
                sys_hostfile.emplace(name, ip);
        }
    }
    // Hand the finished map over without copying it
    fs_config->sys_hostfile.swap(sys_hostfile);
    ld_logger->info("{}() /etc/hosts successfully mapped into ADA-FS", __func__);
    return true;
}

/**
 * Creates an abstract rpc address for a given hostid and puts it into an address cache map
 * @param hostid
@@ -263,21 +292,34 @@ bool get_addr_by_hostid(const uint64_t hostid, hg_addr_t& svr_addr) {
        //found
        return true;
    } else {
        ld_logger->trace("not found in lrucache");
        // not found, manual lookup and add address mapping to LRU cache
        auto hostname = RPC_PROTOCOL + "://"s + fs_config->hosts.at(hostid) + HOSTNAME_SUFFIX + ":"s +
                        fs_config->rpc_port; // convert hostid to hostname and port
        ld_logger->trace("generated hostname {} with rpc_port {}", hostname, fs_config->rpc_port);
        ld_logger->trace("not found in lrucache");
        string remote_addr;
        // Try to get the ip of remote addr. If it cannot be found, use hostname
        // first get the hostname with the hostid
        auto hostname = fs_config->hosts.at(hostid) + HOSTNAME_SUFFIX;
        // then get the ip address from /etc/hosts which is mapped to the sys_hostfile map
        if (fs_config->sys_hostfile.count(hostname) == 1) {
            auto remote_ip = fs_config->sys_hostfile.at(hostname);
            remote_addr = RPC_PROTOCOL + "://"s + remote_ip + ":"s + fs_config->rpc_port;
        }
        // fallback hostname to use for lookup
        if (remote_addr.empty()) {
            remote_addr = RPC_PROTOCOL + "://"s + hostname + ":"s +
                          fs_config->rpc_port; // convert hostid to remote_addr and port
        }
        ld_logger->trace("generated remote_addr {} for hostname {} with rpc_port {}",
                         remote_addr, hostname, fs_config->rpc_port);
        // try to look up 3 times before erroring out
        hg_return_t ret;
        // TODO If this solution is somewhat helpful, write a more versatile solution
        for (unsigned int i = 0; i < 3; i++) {
            ret = margo_addr_lookup(ld_margo_rpc_id, hostname.c_str(), &svr_addr);
            ret = margo_addr_lookup(ld_margo_rpc_id, remote_addr.c_str(), &svr_addr);
            if (ret != HG_SUCCESS) {
                // still not working after 5 tries.
                if (i == 4) {
                    ld_logger->error("{}() Unable to lookup address {} from host {}", __func__,
                                     hostname, fs_config->hosts.at(fs_config->host_id));
                                     remote_addr, fs_config->hosts.at(fs_config->host_id));
                    return false;
                }
                // Wait a second then try again
@@ -288,7 +330,7 @@ bool get_addr_by_hostid(const uint64_t hostid, hg_addr_t& svr_addr) {
        }
        if (svr_addr == HG_ADDR_NULL) {
            ld_logger->error("{}() looked up address is NULL for address {} from host {}", __func__,
                             hostname, fs_config->hosts.at(fs_config->host_id));
                             remote_addr, fs_config->hosts.at(fs_config->host_id));
            return false;
        }
        rpc_address_cache.insert(hostid, svr_addr);