Loading ifs/include/preload/preload_util.hpp +3 −0 Original line number Diff line number Diff line Loading @@ -29,6 +29,7 @@ struct FsConfig { // rpc infos std::map<uint64_t, std::string> hosts; std::map<std::string, std::string> sys_hostfile; uint64_t host_id; // my host number size_t host_size; std::string rpc_port; Loading Loading @@ -114,6 +115,8 @@ std::string daemon_register_path(int pid); bool get_daemon_auxiliaries(); bool read_system_hostfile(); bool get_addr_by_hostid(uint64_t hostid, hg_addr_t& svr_addr); size_t get_rpc_node(const std::string& to_hash); Loading ifs/scripts/compile_dep.sh +22 −17 Original line number Diff line number Diff line Loading @@ -25,7 +25,7 @@ optional arguments: defaults to 'all' -c <CLUSTER>, --cluster <CLUSTER> additional configurations for specific compute clusters supported clusters: {mogon1,fh2} supported clusters: {mogon1,mogon2,fh2} -j <COMPILE_CORES>, --compilecores <COMPILE_CORES> number of cores that are used to compile the depdencies defaults to number of available cores Loading Loading @@ -115,7 +115,7 @@ else exit fi if [[ "${CLUSTER}" != "" ]]; then if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) ]]; then if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) || ( "${CLUSTER}" == "mogon2" ) ]]; then echo CLUSTER = "${CLUSTER}" else echo "${CLUSTER} cluster configuration is invalid. Exiting ..." Loading @@ -138,7 +138,7 @@ echo "Install path = '$2'"; mkdir -p ${SOURCE} # Set cluster dependencies first if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) ]]; then if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) || ( "${CLUSTER}" == "mogon2" ) ]]; then # get libtool echo "############################################################ Installing: libtool" CURR=${SOURCE}/libtool Loading Loading @@ -223,6 +223,8 @@ fi if [ "$NA_LAYER" == "ofi" ] || [ "$NA_LAYER" == "all" ]; then USE_OFI="-DNA_USE_OFI:BOOL=ON" # Mogon2 already has libfabric installed in a version that Mercury supports. if [[ ("${CLUSTER}" != "mogon2") ]]; then echo "############################################################ Installing: LibFabric" #libfabric CURR=${SOURCE}/libfabric Loading @@ -235,6 +237,7 @@ if [ "$NA_LAYER" == "ofi" ] || [ "$NA_LAYER" == "all" ]; then make install || exit 1 make check || exit 1 fi fi echo "############################################################ Installing: Mercury" Loading @@ -242,9 +245,11 @@ echo "############################################################ Installing: CURR=${SOURCE}/mercury prepare_build_dir ${CURR} cd ${CURR} if [ "$NA_LAYER" == "cci" ] || [ "$NA_LAYER" == "all" ]; then # patch cci verbs addr lookup error handling echo "########## Applying cci addr lookup error handling patch" git apply ${PATCH_DIR}/mercury_cci_verbs_lookup.patch || exit 1 fi cd ${CURR}/build # XXX Note: USE_EAGER_BULK is temporarily disabled due to bugs in Mercury with smaller amounts of data # Apparantly this is fixed in the new Mercury version. TODO check if it works now Loading ifs/scripts/dl_dep.sh +5 −10 Original line number Diff line number Diff line Loading @@ -171,22 +171,17 @@ if [ "${NA_LAYER}" == "cci" ] || [ "${NA_LAYER}" == "all" ]; then fi # get libfabric if [ "${NA_LAYER}" == "ofi" ] || [ "${NA_LAYER}" == "all" ]; then clonedeps "libfabric" "git clone https://github.com/ofiwg/libfabric" "tags/v1.5.2" clonedeps "libfabric" "git clone https://github.com/ofiwg/libfabric" "tags/v1.5.3" fi # get Mercury clonedeps "mercury" "git clone --recurse-submodules https://github.com/mercury-hpc/mercury" "2a7369db11c7d9e962a8d59f1852a04c4ff57694" clonedeps "mercury" "git clone --recurse-submodules https://github.com/mercury-hpc/mercury" "6c82baf7819a553b6b8235fbe7c180989a1e17fe" # get Argobots clonedeps "argobots" "git clone -b dev-get-dev-basic https://github.com/carns/argobots.git" "fd6ae0f4613187a2c73fceee8a2718d54719bcab" clonedeps "argobots" "git clone -b dev-get-dev-basic https://github.com/carns/argobots.git" "78ceea28ed44faca12cf8ea7f5687b894c66a8c4" # get Argobots-snoozer clonedeps "abt-snoozer" "git clone https://xgitlab.cels.anl.gov/sds/abt-snoozer.git" "3d9240eda290bfb89f08a5673cebd888194a4bd7" # get Margo clonedeps "margo" "git clone https://xgitlab.cels.anl.gov/sds/margo.git" "30521c87a5d6b531b894877f40f0f34ef8ecd541" clonedeps "margo" "git clone https://xgitlab.cels.anl.gov/sds/margo.git" "3f9fe3a13392af1ba6df1b3d3bb16503da6b627d" # get rocksdb clonedeps "rocksdb" "git clone https://github.com/facebook/rocksdb" "tags/v5.8" # Commit values used before 17.01.2018 # Mercury: afd70055d21a6df2faefe38d5f6ce1ae11f365a5 # Argobots: a5a6b2036c75ad05804ccb72d2fe31cea1bfef88 # Margo: 68ef7f14178e9066cf38846d90d451e00aaca61d clonedeps "rocksdb" "git clone https://github.com/facebook/rocksdb" "tags/v5.10.2" echo "Nothing left to do. Exiting." ifs/src/preload/preload.cpp +6 −1 Original line number Diff line number Diff line Loading @@ -141,7 +141,8 @@ bool init_margo_client(Margo_mode mode, const string na_plugin) { // Init Mercury layer (must be finalized when finished) hg_class_t* hg_class; hg_context_t* hg_context; hg_class = HG_Init(na_plugin.c_str(), HG_FALSE); // Note: localhost should not be required and actually doesn't do anything. But it is required for OFI for Mercury to start hg_class = HG_Init((na_plugin + "://localhost"s).c_str(), HG_FALSE); if (hg_class == nullptr) { ld_logger->error("{}() HG_Init() Failed to init Mercury client layer", __func__); return false; Loading Loading @@ -217,6 +218,10 @@ void init_ld_environment_() { ld_logger->error("{}() Unable to initialize Margo RPC client.", __func__); exit(EXIT_FAILURE); } if (!read_system_hostfile()) { ld_logger->error("{}() Unable to read system hostfile /etc/hosts for address mapping.", __func__); exit(EXIT_FAILURE); } ld_logger->info("{}() Environment initialization successful.", __func__); } Loading ifs/src/preload/preload_util.cpp +49 −7 Original line number Diff line number Diff line Loading @@ -3,6 +3,8 @@ #include <dirent.h> #include <fstream> #include <iterator> #include <sstream> using namespace std; Loading Loading @@ -250,6 +252,33 @@ bool get_daemon_auxiliaries() { return ret; } /** * Read /etc/hosts and put hostname - ip association into a map in fs config. * We are working with hostnames but some network layers (such as Omnipath) does not look into /etc/hosts. * Hence, we have to store the mapping ourselves. * @return success */ bool read_system_hostfile() { ifstream hostfile("/etc/hosts"); if (!hostfile.is_open()) return false; string line; map<string, string> sys_hostfile; while (getline(hostfile, line)) { if (line.empty() || line == "\n" || line.at(0) == '#') continue; std::istringstream iss(line); std::vector<string> tmp_list((istream_iterator<string>(iss)), istream_iterator<string>()); for (unsigned int i = 1; i < tmp_list.size(); i++) { if (tmp_list[i].find(HOSTNAME_SUFFIX) != string::npos) sys_hostfile.insert(make_pair(tmp_list[i], tmp_list[0])); } } fs_config->sys_hostfile = sys_hostfile; ld_logger->info("{}() /etc/hosts successfully mapped into ADA-FS", __func__); return true; } /** * Creates an abstract rpc address for a given hostid and puts it into an address cache map * @param hostid Loading @@ -263,21 +292,34 @@ bool get_addr_by_hostid(const uint64_t hostid, hg_addr_t& svr_addr) { //found return true; } else { ld_logger->trace("not found in lrucache"); // not found, manual lookup and add address mapping to LRU cache auto hostname = RPC_PROTOCOL + "://"s + fs_config->hosts.at(hostid) + HOSTNAME_SUFFIX + ":"s + fs_config->rpc_port; // convert hostid to hostname and port ld_logger->trace("generated hostname {} with rpc_port {}", hostname, fs_config->rpc_port); ld_logger->trace("not found in lrucache"); string remote_addr; // Try to get the ip of remote addr. If it cannot be found, use hostname // first get the hostname with the hostid auto hostname = fs_config->hosts.at(hostid) + HOSTNAME_SUFFIX; // then get the ip address from /etc/hosts which is mapped to the sys_hostfile map if (fs_config->sys_hostfile.count(hostname) == 1) { auto remote_ip = fs_config->sys_hostfile.at(hostname); remote_addr = RPC_PROTOCOL + "://"s + remote_ip + ":"s + fs_config->rpc_port; } // fallback hostname to use for lookup if (remote_addr.empty()) { remote_addr = RPC_PROTOCOL + "://"s + hostname + ":"s + fs_config->rpc_port; // convert hostid to remote_addr and port } ld_logger->trace("generated remote_addr {} for hostname {} with rpc_port {}", remote_addr, hostname, fs_config->rpc_port); // try to look up 3 times before erroring out hg_return_t ret; // TODO If this is solution is somewhat helpful, write a more versatile solution for (unsigned int i = 0; i < 3; i++) { ret = margo_addr_lookup(ld_margo_rpc_id, hostname.c_str(), &svr_addr); ret = margo_addr_lookup(ld_margo_rpc_id, remote_addr.c_str(), &svr_addr); if (ret != HG_SUCCESS) { // still not working after 5 tries. if (i == 4) { ld_logger->error("{}() Unable to lookup address {} from host {}", __func__, hostname, fs_config->hosts.at(fs_config->host_id)); remote_addr, fs_config->hosts.at(fs_config->host_id)); return false; } // Wait a second then try again Loading @@ -288,7 +330,7 @@ bool get_addr_by_hostid(const uint64_t hostid, hg_addr_t& svr_addr) { } if (svr_addr == HG_ADDR_NULL) { ld_logger->error("{}() looked up address is NULL for address {} from host {}", __func__, hostname, fs_config->hosts.at(fs_config->host_id)); remote_addr, fs_config->hosts.at(fs_config->host_id)); return false; } rpc_address_cache.insert(hostid, svr_addr); Loading Loading
ifs/include/preload/preload_util.hpp +3 −0 Original line number Diff line number Diff line Loading @@ -29,6 +29,7 @@ struct FsConfig { // rpc infos std::map<uint64_t, std::string> hosts; std::map<std::string, std::string> sys_hostfile; uint64_t host_id; // my host number size_t host_size; std::string rpc_port; Loading Loading @@ -114,6 +115,8 @@ std::string daemon_register_path(int pid); bool get_daemon_auxiliaries(); bool read_system_hostfile(); bool get_addr_by_hostid(uint64_t hostid, hg_addr_t& svr_addr); size_t get_rpc_node(const std::string& to_hash); Loading
ifs/scripts/compile_dep.sh +22 −17 Original line number Diff line number Diff line Loading @@ -25,7 +25,7 @@ optional arguments: defaults to 'all' -c <CLUSTER>, --cluster <CLUSTER> additional configurations for specific compute clusters supported clusters: {mogon1,fh2} supported clusters: {mogon1,mogon2,fh2} -j <COMPILE_CORES>, --compilecores <COMPILE_CORES> number of cores that are used to compile the depdencies defaults to number of available cores Loading Loading @@ -115,7 +115,7 @@ else exit fi if [[ "${CLUSTER}" != "" ]]; then if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) ]]; then if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) || ( "${CLUSTER}" == "mogon2" ) ]]; then echo CLUSTER = "${CLUSTER}" else echo "${CLUSTER} cluster configuration is invalid. Exiting ..." Loading @@ -138,7 +138,7 @@ echo "Install path = '$2'"; mkdir -p ${SOURCE} # Set cluster dependencies first if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) ]]; then if [[ ( "${CLUSTER}" == "mogon1" ) || ( "${CLUSTER}" == "fh2" ) || ( "${CLUSTER}" == "mogon2" ) ]]; then # get libtool echo "############################################################ Installing: libtool" CURR=${SOURCE}/libtool Loading Loading @@ -223,6 +223,8 @@ fi if [ "$NA_LAYER" == "ofi" ] || [ "$NA_LAYER" == "all" ]; then USE_OFI="-DNA_USE_OFI:BOOL=ON" # Mogon2 already has libfabric installed in a version that Mercury supports. if [[ ("${CLUSTER}" != "mogon2") ]]; then echo "############################################################ Installing: LibFabric" #libfabric CURR=${SOURCE}/libfabric Loading @@ -235,6 +237,7 @@ if [ "$NA_LAYER" == "ofi" ] || [ "$NA_LAYER" == "all" ]; then make install || exit 1 make check || exit 1 fi fi echo "############################################################ Installing: Mercury" Loading @@ -242,9 +245,11 @@ echo "############################################################ Installing: CURR=${SOURCE}/mercury prepare_build_dir ${CURR} cd ${CURR} if [ "$NA_LAYER" == "cci" ] || [ "$NA_LAYER" == "all" ]; then # patch cci verbs addr lookup error handling echo "########## Applying cci addr lookup error handling patch" git apply ${PATCH_DIR}/mercury_cci_verbs_lookup.patch || exit 1 fi cd ${CURR}/build # XXX Note: USE_EAGER_BULK is temporarily disabled due to bugs in Mercury with smaller amounts of data # Apparantly this is fixed in the new Mercury version. TODO check if it works now Loading
ifs/scripts/dl_dep.sh +5 −10 Original line number Diff line number Diff line Loading @@ -171,22 +171,17 @@ if [ "${NA_LAYER}" == "cci" ] || [ "${NA_LAYER}" == "all" ]; then fi # get libfabric if [ "${NA_LAYER}" == "ofi" ] || [ "${NA_LAYER}" == "all" ]; then clonedeps "libfabric" "git clone https://github.com/ofiwg/libfabric" "tags/v1.5.2" clonedeps "libfabric" "git clone https://github.com/ofiwg/libfabric" "tags/v1.5.3" fi # get Mercury clonedeps "mercury" "git clone --recurse-submodules https://github.com/mercury-hpc/mercury" "2a7369db11c7d9e962a8d59f1852a04c4ff57694" clonedeps "mercury" "git clone --recurse-submodules https://github.com/mercury-hpc/mercury" "6c82baf7819a553b6b8235fbe7c180989a1e17fe" # get Argobots clonedeps "argobots" "git clone -b dev-get-dev-basic https://github.com/carns/argobots.git" "fd6ae0f4613187a2c73fceee8a2718d54719bcab" clonedeps "argobots" "git clone -b dev-get-dev-basic https://github.com/carns/argobots.git" "78ceea28ed44faca12cf8ea7f5687b894c66a8c4" # get Argobots-snoozer clonedeps "abt-snoozer" "git clone https://xgitlab.cels.anl.gov/sds/abt-snoozer.git" "3d9240eda290bfb89f08a5673cebd888194a4bd7" # get Margo clonedeps "margo" "git clone https://xgitlab.cels.anl.gov/sds/margo.git" "30521c87a5d6b531b894877f40f0f34ef8ecd541" clonedeps "margo" "git clone https://xgitlab.cels.anl.gov/sds/margo.git" "3f9fe3a13392af1ba6df1b3d3bb16503da6b627d" # get rocksdb clonedeps "rocksdb" "git clone https://github.com/facebook/rocksdb" "tags/v5.8" # Commit values used before 17.01.2018 # Mercury: afd70055d21a6df2faefe38d5f6ce1ae11f365a5 # Argobots: a5a6b2036c75ad05804ccb72d2fe31cea1bfef88 # Margo: 68ef7f14178e9066cf38846d90d451e00aaca61d clonedeps "rocksdb" "git clone https://github.com/facebook/rocksdb" "tags/v5.10.2" echo "Nothing left to do. Exiting."
ifs/src/preload/preload.cpp +6 −1 Original line number Diff line number Diff line Loading @@ -141,7 +141,8 @@ bool init_margo_client(Margo_mode mode, const string na_plugin) { // Init Mercury layer (must be finalized when finished) hg_class_t* hg_class; hg_context_t* hg_context; hg_class = HG_Init(na_plugin.c_str(), HG_FALSE); // Note: localhost should not be required and actually doesn't do anything. But it is required for OFI for Mercury to start hg_class = HG_Init((na_plugin + "://localhost"s).c_str(), HG_FALSE); if (hg_class == nullptr) { ld_logger->error("{}() HG_Init() Failed to init Mercury client layer", __func__); return false; Loading Loading @@ -217,6 +218,10 @@ void init_ld_environment_() { ld_logger->error("{}() Unable to initialize Margo RPC client.", __func__); exit(EXIT_FAILURE); } if (!read_system_hostfile()) { ld_logger->error("{}() Unable to read system hostfile /etc/hosts for address mapping.", __func__); exit(EXIT_FAILURE); } ld_logger->info("{}() Environment initialization successful.", __func__); } Loading
ifs/src/preload/preload_util.cpp +49 −7 Original line number Diff line number Diff line Loading @@ -3,6 +3,8 @@ #include <dirent.h> #include <fstream> #include <iterator> #include <sstream> using namespace std; Loading Loading @@ -250,6 +252,33 @@ bool get_daemon_auxiliaries() { return ret; } /** * Read /etc/hosts and put hostname - ip association into a map in fs config. * We are working with hostnames but some network layers (such as Omnipath) does not look into /etc/hosts. * Hence, we have to store the mapping ourselves. * @return success */ bool read_system_hostfile() { ifstream hostfile("/etc/hosts"); if (!hostfile.is_open()) return false; string line; map<string, string> sys_hostfile; while (getline(hostfile, line)) { if (line.empty() || line == "\n" || line.at(0) == '#') continue; std::istringstream iss(line); std::vector<string> tmp_list((istream_iterator<string>(iss)), istream_iterator<string>()); for (unsigned int i = 1; i < tmp_list.size(); i++) { if (tmp_list[i].find(HOSTNAME_SUFFIX) != string::npos) sys_hostfile.insert(make_pair(tmp_list[i], tmp_list[0])); } } fs_config->sys_hostfile = sys_hostfile; ld_logger->info("{}() /etc/hosts successfully mapped into ADA-FS", __func__); return true; } /** * Creates an abstract rpc address for a given hostid and puts it into an address cache map * @param hostid Loading @@ -263,21 +292,34 @@ bool get_addr_by_hostid(const uint64_t hostid, hg_addr_t& svr_addr) { //found return true; } else { ld_logger->trace("not found in lrucache"); // not found, manual lookup and add address mapping to LRU cache auto hostname = RPC_PROTOCOL + "://"s + fs_config->hosts.at(hostid) + HOSTNAME_SUFFIX + ":"s + fs_config->rpc_port; // convert hostid to hostname and port ld_logger->trace("generated hostname {} with rpc_port {}", hostname, fs_config->rpc_port); ld_logger->trace("not found in lrucache"); string remote_addr; // Try to get the ip of remote addr. If it cannot be found, use hostname // first get the hostname with the hostid auto hostname = fs_config->hosts.at(hostid) + HOSTNAME_SUFFIX; // then get the ip address from /etc/hosts which is mapped to the sys_hostfile map if (fs_config->sys_hostfile.count(hostname) == 1) { auto remote_ip = fs_config->sys_hostfile.at(hostname); remote_addr = RPC_PROTOCOL + "://"s + remote_ip + ":"s + fs_config->rpc_port; } // fallback hostname to use for lookup if (remote_addr.empty()) { remote_addr = RPC_PROTOCOL + "://"s + hostname + ":"s + fs_config->rpc_port; // convert hostid to remote_addr and port } ld_logger->trace("generated remote_addr {} for hostname {} with rpc_port {}", remote_addr, hostname, fs_config->rpc_port); // try to look up 3 times before erroring out hg_return_t ret; // TODO If this is solution is somewhat helpful, write a more versatile solution for (unsigned int i = 0; i < 3; i++) { ret = margo_addr_lookup(ld_margo_rpc_id, hostname.c_str(), &svr_addr); ret = margo_addr_lookup(ld_margo_rpc_id, remote_addr.c_str(), &svr_addr); if (ret != HG_SUCCESS) { // still not working after 5 tries. if (i == 4) { ld_logger->error("{}() Unable to lookup address {} from host {}", __func__, hostname, fs_config->hosts.at(fs_config->host_id)); remote_addr, fs_config->hosts.at(fs_config->host_id)); return false; } // Wait a second then try again Loading @@ -288,7 +330,7 @@ bool get_addr_by_hostid(const uint64_t hostid, hg_addr_t& svr_addr) { } if (svr_addr == HG_ADDR_NULL) { ld_logger->error("{}() looked up address is NULL for address {} from host {}", __func__, hostname, fs_config->hosts.at(fs_config->host_id)); remote_addr, fs_config->hosts.at(fs_config->host_id)); return false; } rpc_address_cache.insert(hostid, svr_addr); Loading