From b32a29cac7a5e49be3bcc9595fb09376a2b6efa5 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 17 May 2024 12:11:46 +0200 Subject: [PATCH 01/17] Add RPC client to daemon --- include/daemon/classes/rpc_data.hpp | 17 +++++++- src/daemon/classes/rpc_data.cpp | 15 +++++++ src/daemon/daemon.cpp | 64 +++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/include/daemon/classes/rpc_data.hpp b/include/daemon/classes/rpc_data.hpp index c98b7d3ad..d81ecfdae 100644 --- a/include/daemon/classes/rpc_data.hpp +++ b/include/daemon/classes/rpc_data.hpp @@ -41,6 +41,10 @@ class Distributor; namespace daemon { +struct margo_client_ids { + hg_id_t test_rpc_id; +}; + class RPCData { private: @@ -50,6 +54,8 @@ private: // contexts that were created at init time margo_instance_id server_rpc_mid_; margo_instance_id proxy_server_rpc_mid_; + margo_instance_id client_rpc_mid_; + margo_client_ids rpc_client_ids_{}; // Argobots I/O pools and execution streams ABT_pool io_pool_; @@ -83,7 +89,16 @@ public: proxy_server_rpc_mid(); void - proxy_server_rpc_mid(margo_instance* proxy_server_rpc_mid); + proxy_server_rpc_mid(margo_instance* client_rpc_mid); + + margo_instance* + client_rpc_mid(); + + void + client_rpc_mid(margo_instance* client_rpc_mid); + + margo_client_ids& + rpc_client_ids(); ABT_pool io_pool() const; diff --git a/src/daemon/classes/rpc_data.cpp b/src/daemon/classes/rpc_data.cpp index cf8ba8d87..024f409c8 100644 --- a/src/daemon/classes/rpc_data.cpp +++ b/src/daemon/classes/rpc_data.cpp @@ -54,6 +54,21 @@ RPCData::proxy_server_rpc_mid(margo_instance* proxy_server_rpc_mid) { RPCData::proxy_server_rpc_mid_ = proxy_server_rpc_mid; } +margo_instance* +RPCData::client_rpc_mid() { + return client_rpc_mid_; +} + +void +RPCData::client_rpc_mid(margo_instance* client_rpc_mid) { + RPCData::client_rpc_mid_ = client_rpc_mid; +} + +margo_client_ids& +RPCData::rpc_client_ids() { + return rpc_client_ids_; +} + ABT_pool RPCData::io_pool() const { return io_pool_; diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 073fb60a1..f0f55acf8 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -235,6 +235,52 @@ init_rpc_server() { register_server_rpcs(mid); } +/** + * @brief Registers RPC handlers to a given Margo instance. + * @internal + * Registering is done by associating a Margo instance id (mid) with the RPC + * name and its handler function including defined input/out structs + * @endinternal + * @param margo_instance_id + */ +void +register_client_rpcs(margo_instance_id mid) { + // TODO + RPC_DATA->rpc_client_ids().test_rpc_id = MARGO_REGISTER( + mid, gkfs::rpc::tag::fs_config, void, rpc_config_out_t, NULL); +} + +/** + * @brief Initializes the daemon RPC client. + * @throws std::runtime_error on failure + */ +void +init_rpc_client() { + struct hg_init_info hg_options = HG_INIT_INFO_INITIALIZER; + hg_options.auto_sm = GKFS_DATA->use_auto_sm() ? 
HG_TRUE : HG_FALSE; + hg_options.stats = HG_FALSE; + if(gkfs::rpc::protocol::ofi_psm2 == GKFS_DATA->rpc_protocol()) + hg_options.na_init_info.progress_mode = NA_NO_BLOCK; + // Start Margo (this will also initialize Argobots and Mercury internally) + auto margo_config = "{}"; + struct margo_init_info args = {nullptr}; + args.json_config = margo_config; + args.hg_init_info = &hg_options; + auto* mid = margo_init_ext(GKFS_DATA->bind_addr().c_str(), + MARGO_CLIENT_MODE, &args); + + if(mid == MARGO_INSTANCE_NULL) { + throw runtime_error("Failed to initialize the Margo RPC client"); + } + + GKFS_DATA->spdlogger()->info( + "{}() RPC client initialization successful for protocol {}", + __func__, GKFS_DATA->bind_addr()); + + RPC_DATA->client_rpc_mid(mid); + register_client_rpcs(mid); +} + void register_proxy_server_rpcs(margo_instance_id mid) { MARGO_REGISTER(mid, gkfs::rpc::tag::get_chunk_stat, rpc_chunk_stat_in_t, @@ -464,6 +510,18 @@ init_environment() { if(!GKFS_DATA->hosts_file().empty()) { gkfs::utils::populate_hosts_file(); } + + // Init margo client + GKFS_DATA->spdlogger()->debug("{}() Initializing RPC client: '{}'", + __func__, GKFS_DATA->bind_addr()); + try { + init_rpc_client(); + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to initialize RPC client: {}", __func__, e.what()); + throw; + } + GKFS_DATA->spdlogger()->info("Startup successful. Daemon is ready."); } @@ -524,6 +582,12 @@ destroy_enviroment() { GKFS_DATA->spdlogger()->info("{}() Closing metadata DB", __func__); GKFS_DATA->close_mdb(); + if(RPC_DATA->client_rpc_mid() != nullptr) { + GKFS_DATA->spdlogger()->info("{}() Finalizing margo RPC client ...", + __func__); + margo_finalize(RPC_DATA->client_rpc_mid()); + } + // Delete rootdir/metadir if requested if(!keep_rootdir) { -- GitLab From 1c07cec87b8388f619cde7e9a84bdb50954bc17a Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 17 May 2024 17:54:54 +0200 Subject: [PATCH 02/17] Separating running FS instance from extension --- scripts/run/gkfs | 1 + src/client/preload_util.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/run/gkfs b/scripts/run/gkfs index 502f3bc3b..07c6a4937 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -43,6 +43,7 @@ wait_for_gkfs_daemons() { exit 1 fi done + echo "# End of current FS instance" >> "${HOSTSFILE}" } ####################################### # Creates a pid file for a given pid. If pid file exists, we check if its pids are still valid. 
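The launcher change above appends a "# End of current FS instance" marker to the hosts file once all daemons of an instance are up; the client-side parser in the preload_util.cpp hunk below skips such comment lines, so daemons added by a later extension can simply be appended after the marker. A minimal sketch of that parsing rule, assuming markers are always full lines starting with '#' (the helper name below is illustrative and not part of the patch):

    #include <fstream>
    #include <string>
    #include <vector>

    // Illustrative helper mirroring the skip rule added to load_hostfile():
    // lines beginning with '#' delimit FS instances and carry no host data.
    std::vector<std::string>
    read_host_lines(const std::string& path) {
        std::vector<std::string> hosts;
        std::ifstream lf(path);
        std::string line;
        while(std::getline(lf, line)) {
            if(line.empty() || line.front() == '#') // instance marker or blank
                continue;
            hosts.push_back(line);
        }
        return hosts;
    }
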
diff --git a/src/client/preload_util.cpp b/src/client/preload_util.cpp index 7c769b1cb..247e44819 100644 --- a/src/client/preload_util.cpp +++ b/src/client/preload_util.cpp @@ -161,6 +161,10 @@ load_hostfile(const std::string& path) { string uri; std::smatch match; while(getline(lf, line)) { + // if line starts with #, it indicates the end of current FS instance + // It is therefore skipped + if(line[0] == '#') + continue; if(!regex_match(line, match, line_re)) { LOG(ERROR, "Unrecognized line format: [path: '{}', line: '{}']", @@ -510,4 +514,4 @@ lookup_proxy_addr() { CTX->proxy_host(addr); } -} // namespace gkfs::utils +} // namespace gkfs::utils \ No newline at end of file -- GitLab From 3f1b02cf59105688d26c5f9846ff1b86297332ac Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Wed, 3 Jul 2024 10:27:24 +0200 Subject: [PATCH 03/17] Adding client skeleton for expansion --- include/client/CMakeLists.txt | 1 + include/client/rpc/forward_malleability.hpp | 45 +++ include/client/rpc/rpc_types.hpp | 379 +++++++++++++++++++- include/client/user_functions.hpp | 28 +- include/common/common_defs.hpp | 7 + include/common/rpc/distributor.hpp | 10 +- include/common/rpc/rpc_types.hpp | 4 + src/client/CMakeLists.txt | 2 + src/client/malleability.cpp | 55 +++ src/client/rpc/forward_malleability.cpp | 222 ++++++++++++ src/client/rpc/forward_metadata.cpp | 4 +- src/common/rpc/distributor.cpp | 8 +- 12 files changed, 734 insertions(+), 31 deletions(-) create mode 100644 include/client/rpc/forward_malleability.hpp create mode 100644 src/client/malleability.cpp create mode 100644 src/client/rpc/forward_malleability.cpp diff --git a/include/client/CMakeLists.txt b/include/client/CMakeLists.txt index 6815fc953..e4ce8983c 100644 --- a/include/client/CMakeLists.txt +++ b/include/client/CMakeLists.txt @@ -72,6 +72,7 @@ target_sources( rpc/forward_management.hpp rpc/forward_metadata.hpp rpc/forward_data.hpp + rpc/forward_malleability.hpp syscalls/args.hpp syscalls/decoder.hpp syscalls/errno.hpp diff --git a/include/client/rpc/forward_malleability.hpp b/include/client/rpc/forward_malleability.hpp new file mode 100644 index 000000000..770a8a5f1 --- /dev/null +++ b/include/client/rpc/forward_malleability.hpp @@ -0,0 +1,45 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS' POSIX interface. + + GekkoFS' POSIX interface is free software: you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + GekkoFS' POSIX interface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with GekkoFS' POSIX interface. If not, see + . 
+ + SPDX-License-Identifier: LGPL-3.0-or-later +*/ + +#ifndef GEKKOFS_CLIENT_FORWARD_MALLEABILITY_HPP +#define GEKKOFS_CLIENT_FORWARD_MALLEABILITY_HPP + +namespace gkfs::malleable::rpc { + +int +forward_expand_start(int old_server_conf, int new_server_conf); + +int +forward_expand_status(); + +int +forward_expand_finalize(); +} // namespace gkfs::malleable::rpc + +#endif // GEKKOFS_CLIENT_FORWARD_MALLEABILITY_HPP diff --git a/include/client/rpc/rpc_types.hpp b/include/client/rpc/rpc_types.hpp index 485619e60..b58f59699 100644 --- a/include/client/rpc/rpc_types.hpp +++ b/include/client/rpc/rpc_types.hpp @@ -63,7 +63,9 @@ hg_proc_void_t(hg_proc_t proc, void* data) { } // namespace hermes::detail -namespace gkfs::rpc { +namespace gkfs { + +namespace rpc { //============================================================================== // definitions for fs_config @@ -125,7 +127,8 @@ struct fs_config { explicit input(const hermes::detail::hg_void_t& other) {} - explicit operator hermes::detail::hg_void_t() { + explicit + operator hermes::detail::hg_void_t() { return {}; } }; @@ -311,7 +314,8 @@ struct create { explicit input(const rpc_mk_node_in_t& other) : m_path(other.path), m_mode(other.mode) {} - explicit operator rpc_mk_node_in_t() { + explicit + operator rpc_mk_node_in_t() { return {m_path.c_str(), m_mode}; } @@ -420,7 +424,8 @@ struct stat { explicit input(const rpc_path_only_in_t& other) : m_path(other.path) {} - explicit operator rpc_path_only_in_t() { + explicit + operator rpc_path_only_in_t() { return {m_path.c_str()}; } @@ -546,7 +551,8 @@ struct remove_metadata { explicit input(const rpc_rm_node_in_t& other) : m_path(other.path), m_rm_dir(other.rm_dir) {} - explicit operator rpc_rm_node_in_t() { + explicit + operator rpc_rm_node_in_t() { return {m_path.c_str(), m_rm_dir}; } @@ -678,7 +684,8 @@ struct decr_size { explicit input(const rpc_trunc_in_t& other) : m_path(other.path), m_length(other.length) {} - explicit operator rpc_trunc_in_t() { + explicit + operator rpc_trunc_in_t() { return {m_path.c_str(), m_length}; } @@ -885,7 +892,8 @@ struct update_metadentry { m_atime_flag(other.atime_flag), m_mtime_flag(other.mtime_flag), m_ctime_flag(other.ctime_flag) {} - explicit operator rpc_update_metadentry_in_t() { + explicit + operator rpc_update_metadentry_in_t() { return {m_path.c_str(), m_nlink, m_mode, m_uid, m_gid, m_size, m_blocks, m_atime, m_mtime, m_ctime, m_nlink_flag, m_mode_flag, @@ -1013,7 +1021,8 @@ struct get_metadentry_size { explicit input(const rpc_path_only_in_t& other) : m_path(other.path) {} - explicit operator rpc_path_only_in_t() { + explicit + operator rpc_path_only_in_t() { return {m_path.c_str()}; } @@ -1148,7 +1157,8 @@ struct update_metadentry_size { : m_path(other.path), m_size(other.size), m_offset(other.offset), m_append(other.append) {} - explicit operator rpc_update_metadentry_size_in_t() { + explicit + operator rpc_update_metadentry_size_in_t() { return {m_path.c_str(), m_size, m_offset, m_append}; } @@ -1276,7 +1286,8 @@ struct mk_symlink { explicit input(const rpc_mk_symlink_in_t& other) : m_path(other.path), m_target_path(other.target_path) {} - explicit operator rpc_mk_symlink_in_t() { + explicit + operator rpc_mk_symlink_in_t() { return {m_path.c_str(), m_target_path.c_str()}; } @@ -1387,7 +1398,8 @@ struct remove_data { explicit input(const rpc_rm_node_in_t& other) : m_path(other.path) {} - explicit operator rpc_rm_node_in_t() { + explicit + operator rpc_rm_node_in_t() { return {m_path.c_str()}; } @@ -1553,7 +1565,8 @@ struct write_data { 
m_total_chunk_size(other.total_chunk_size), m_buffers(other.bulk_handle) {} - explicit operator rpc_write_data_in_t() { + explicit + operator rpc_write_data_in_t() { return {m_path.c_str(), m_offset, m_host_id, m_host_size, m_wbitset.c_str(), m_chunk_n, m_chunk_start, m_chunk_end, m_total_chunk_size, @@ -1738,7 +1751,8 @@ struct read_data { m_total_chunk_size(other.total_chunk_size), m_buffers(other.bulk_handle) {} - explicit operator rpc_read_data_in_t() { + explicit + operator rpc_read_data_in_t() { return {m_path.c_str(), m_offset, m_host_id, m_host_size, m_wbitset.c_str(), m_chunk_n, m_chunk_start, m_chunk_end, m_total_chunk_size, @@ -1872,7 +1886,8 @@ struct trunc_data { explicit input(const rpc_trunc_in_t& other) : m_path(other.path), m_length(other.length) {} - explicit operator rpc_trunc_in_t() { + explicit + operator rpc_trunc_in_t() { return { m_path.c_str(), m_length, @@ -1991,7 +2006,8 @@ struct get_dirents { explicit input(const rpc_get_dirents_in_t& other) : m_path(other.path), m_buffers(other.bulk_handle) {} - explicit operator rpc_get_dirents_in_t() { + explicit + operator rpc_get_dirents_in_t() { return {m_path.c_str(), hg_bulk_t(m_buffers)}; } @@ -2116,7 +2132,8 @@ struct get_dirents_extended { explicit input(const rpc_get_dirents_in_t& other) : m_path(other.path), m_buffers(other.bulk_handle) {} - explicit operator rpc_get_dirents_in_t() { + explicit + operator rpc_get_dirents_in_t() { return {m_path.c_str(), hg_bulk_t(m_buffers)}; } @@ -2235,7 +2252,8 @@ struct chunk_stat { explicit input(const rpc_chunk_stat_in_t& other) : m_dummy(other.dummy) {} - explicit operator rpc_chunk_stat_in_t() { + explicit + operator rpc_chunk_stat_in_t() { return {m_dummy}; } @@ -3693,8 +3711,333 @@ struct get_dirents_extended_proxy { size_t m_dirents_size; }; }; +} // namespace rpc +namespace malleable::rpc { + +//============================================================================== +// definitions for expand_start +struct expand_start { + + // forward declarations of public input/output types for this RPC + class input; + + class output; + + // traits used so that the engine knows what to do with the RPC + using self_type = expand_start; + using handle_type = hermes::rpc_handle; + using input_type = input; + using output_type = output; + using mercury_input_type = rpc_expand_start_in_t; + using mercury_output_type = rpc_err_out_t; + + // RPC public identifier + // (N.B: we reuse the same IDs assigned by Margo so that the daemon + // understands Hermes RPCs) + constexpr static const uint64_t public_id = 50; + + // RPC internal Mercury identifier + constexpr static const hg_id_t mercury_id = 0; + + // RPC name + constexpr static const auto name = gkfs::rpc::malleable::tag::expand_start; + + // requires response? 
+ constexpr static const auto requires_response = true; + + // Mercury callback to serialize input arguments + constexpr static const auto mercury_in_proc_cb = + HG_GEN_PROC_NAME(rpc_expand_start_in_t); + + // Mercury callback to serialize output arguments + constexpr static const auto mercury_out_proc_cb = + HG_GEN_PROC_NAME(rpc_err_out_t); + + class input { + + template + friend hg_return_t + hermes::detail::post_to_mercury(ExecutionContext*); + + public: + input(const uint32_t old_server_conf, uint32_t new_server_conf) + : m_old_server_conf(old_server_conf), + m_new_server_conf(new_server_conf) {} + + input(input&& rhs) = default; + + input(const input& other) = default; + + input& + operator=(input&& rhs) = default; + + input& + operator=(const input& other) = default; + + uint32_t + old_server_conf() const { + return m_old_server_conf; + } + + uint32_t + new_server_conf() const { + return m_new_server_conf; + } + + explicit input(const rpc_expand_start_in_t& other) + : m_old_server_conf(other.old_server_conf), + m_new_server_conf(other.new_server_conf) {} + + explicit + operator rpc_expand_start_in_t() { + return {m_old_server_conf, m_new_server_conf}; + } + + private: + uint32_t m_old_server_conf; + uint32_t m_new_server_conf; + }; + + class output { + + template + friend hg_return_t + hermes::detail::post_to_mercury(ExecutionContext*); + + public: + output() : m_err() {} + + output(int32_t err) : m_err(err) {} + + output(output&& rhs) = default; + + output(const output& other) = default; + + output& + operator=(output&& rhs) = default; + + output& + operator=(const output& other) = default; + + explicit output(const rpc_err_out_t& out) { + m_err = out.err; + } + + int32_t + err() const { + return m_err; + } + + private: + int32_t m_err; + }; +}; + +//============================================================================== +// definitions for expand_status +struct expand_status { + + // forward declarations of public input/output types for this RPC + class input; + + class output; + + // traits used so that the engine knows what to do with the RPC + using self_type = expand_status; + using handle_type = hermes::rpc_handle; + using input_type = input; + using output_type = output; + using mercury_input_type = hermes::detail::hg_void_t; + using mercury_output_type = rpc_err_out_t; + + // RPC public identifier + // (N.B: we reuse the same IDs assigned by Margo so that the daemon + // understands Hermes RPCs) + constexpr static const uint64_t public_id = 51; + + // RPC internal Mercury identifier + constexpr static const hg_id_t mercury_id = 0; + + // RPC name + constexpr static const auto name = gkfs::rpc::malleable::tag::expand_status; + + // requires response? 
+ constexpr static const auto requires_response = true; + + // Mercury callback to serialize input arguments + constexpr static const auto mercury_in_proc_cb = + hermes::detail::hg_proc_void_t; + + // Mercury callback to serialize output arguments + constexpr static const auto mercury_out_proc_cb = + HG_GEN_PROC_NAME(rpc_err_out_t); + + class input { + + template + friend hg_return_t + hermes::detail::post_to_mercury(ExecutionContext*); + + public: + input() {} + + input(input&& rhs) = default; + + input(const input& other) = default; + + input& + operator=(input&& rhs) = default; + + input& + operator=(const input& other) = default; + + explicit input(const hermes::detail::hg_void_t& other) {} + + explicit + operator hermes::detail::hg_void_t() { + return {}; + } + }; + + class output { + + template + friend hg_return_t + hermes::detail::post_to_mercury(ExecutionContext*); + + public: + output() : m_err() {} + + output(int32_t err) : m_err(err) {} + + output(output&& rhs) = default; + + output(const output& other) = default; + + output& + operator=(output&& rhs) = default; + + output& + operator=(const output& other) = default; + + explicit output(const rpc_err_out_t& out) { + m_err = out.err; + } + + int32_t + err() const { + return m_err; + } + + private: + int32_t m_err; + }; +}; + +//============================================================================== +// definitions for expand_finalize +struct expand_finalize { + + // forward declarations of public input/output types for this RPC + class input; + + class output; + + // traits used so that the engine knows what to do with the RPC + using self_type = expand_finalize; + using handle_type = hermes::rpc_handle; + using input_type = input; + using output_type = output; + using mercury_input_type = hermes::detail::hg_void_t; + using mercury_output_type = rpc_err_out_t; + + // RPC public identifier + // (N.B: we reuse the same IDs assigned by Margo so that the daemon + // understands Hermes RPCs) + constexpr static const uint64_t public_id = 52; + + // RPC internal Mercury identifier + constexpr static const hg_id_t mercury_id = 0; + + // RPC name + constexpr static const auto name = + gkfs::rpc::malleable::tag::expand_finalize; + + // requires response? 
+ constexpr static const auto requires_response = true; + + // Mercury callback to serialize input arguments + constexpr static const auto mercury_in_proc_cb = + hermes::detail::hg_proc_void_t; + + // Mercury callback to serialize output arguments + constexpr static const auto mercury_out_proc_cb = + HG_GEN_PROC_NAME(rpc_err_out_t); + + class input { + + template + friend hg_return_t + hermes::detail::post_to_mercury(ExecutionContext*); + + public: + input() {} + + input(input&& rhs) = default; + + input(const input& other) = default; + + input& + operator=(input&& rhs) = default; + + input& + operator=(const input& other) = default; + + explicit input(const hermes::detail::hg_void_t& other) {} + + explicit + operator hermes::detail::hg_void_t() { + return {}; + } + }; + + class output { + + template + friend hg_return_t + hermes::detail::post_to_mercury(ExecutionContext*); + + public: + output() : m_err() {} + + output(int32_t err) : m_err(err) {} + + output(output&& rhs) = default; + + output(const output& other) = default; + + output& + operator=(output&& rhs) = default; + + output& + operator=(const output& other) = default; + + explicit output(const rpc_err_out_t& out) { + m_err = out.err; + } + + int32_t + err() const { + return m_err; + } + + private: + int32_t m_err; + }; +}; -} // namespace gkfs::rpc +} // namespace malleable::rpc +} // namespace gkfs #endif // GKFS_RPCS_TYPES_HPP diff --git a/include/client/user_functions.hpp b/include/client/user_functions.hpp index cc87b386e..92d2826e2 100644 --- a/include/client/user_functions.hpp +++ b/include/client/user_functions.hpp @@ -40,7 +40,8 @@ extern "C" { struct linux_dirent64; -namespace gkfs::syscall { +namespace gkfs { +namespace syscall { int gkfs_open(const std::string& path, mode_t mode, int flags); @@ -77,7 +78,30 @@ gkfs_remove(const std::string& path); std::vector gkfs_get_file_list(const std::string& path); -} // namespace gkfs::syscall +} // namespace syscall +namespace malleable { + +/** + * @brief Start an expansion of the file system + * @param old_server_conf old number of nodes + * @param new_server_conf new number of nodes + * @return error code + */ +int +expand_start(int old_server_conf, int new_server_conf); + +/** + * @brief Check for the current status of the expansion process + * @return 0 when finished, positive numbers indicate how many daemons + * are still redistributing data + */ +int +expand_status(); + +int +expand_finalize(); +} // namespace malleable +} // namespace gkfs extern "C" int diff --git a/include/common/common_defs.hpp b/include/common/common_defs.hpp index 206ab7de1..fd914b3ba 100644 --- a/include/common/common_defs.hpp +++ b/include/common/common_defs.hpp @@ -78,8 +78,15 @@ constexpr auto client_proxy_get_dirents_extended = // Specific RPCs between daemon and proxy constexpr auto proxy_daemon_write = "proxy_daemon_rpc_srv_write_data"; constexpr auto proxy_daemon_read = "proxy_daemon_rpc_srv_read_data"; + } // namespace tag +namespace malleable::tag { +constexpr auto expand_start = "rpc_srv_expand_start"; +constexpr auto expand_status = "rpc_srv_expand_status"; +constexpr auto expand_finalize = "rpc_srv_expand_finalize"; +} // namespace malleable::tag + namespace protocol { constexpr auto na_sm = "na+sm"; constexpr auto ofi_sockets = "ofi+sockets"; diff --git a/include/common/rpc/distributor.hpp b/include/common/rpc/distributor.hpp index 68ac5ded0..53fed0200 100644 --- a/include/common/rpc/distributor.hpp +++ b/include/common/rpc/distributor.hpp @@ -64,7 +64,7 @@ public: 
locate_file_metadata(const std::string& path, const int num_copy) const = 0; virtual std::vector - locate_directory_metadata(const std::string& path) const = 0; + locate_directory_metadata() const = 0; }; @@ -99,7 +99,7 @@ public: const int num_copy) const override; std::vector - locate_directory_metadata(const std::string& path) const override; + locate_directory_metadata() const override; }; class LocalOnlyDistributor : public Distributor { @@ -125,7 +125,7 @@ public: const int num_copy) const override; std::vector - locate_directory_metadata(const std::string& path) const override; + locate_directory_metadata() const override; }; class ForwarderDistributor : public Distributor { @@ -157,7 +157,7 @@ public: const int num_copy) const override; std::vector - locate_directory_metadata(const std::string& path) const override; + locate_directory_metadata() const override; }; /* @@ -210,7 +210,7 @@ public: const int num_copy) const override; std::vector - locate_directory_metadata(const std::string& path) const override; + locate_directory_metadata() const override; }; } // namespace gkfs::rpc diff --git a/include/common/rpc/rpc_types.hpp b/include/common/rpc/rpc_types.hpp index 40fc20005..9b09dbbcb 100644 --- a/include/common/rpc/rpc_types.hpp +++ b/include/common/rpc/rpc_types.hpp @@ -164,5 +164,9 @@ MERCURY_GEN_PROC(rpc_proxy_get_dirents_in_t, ((hg_const_string_t) (path))((int32_t) (server))( (hg_bulk_t) (bulk_handle))) +// malleability + +MERCURY_GEN_PROC(rpc_expand_start_in_t, + ((uint32_t) (old_server_conf))((uint32_t) (new_server_conf))) #endif // LFS_RPC_TYPES_HPP diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt index d2415233d..e2a53a082 100644 --- a/src/client/CMakeLists.txt +++ b/src/client/CMakeLists.txt @@ -67,12 +67,14 @@ target_sources( preload.cpp preload_context.cpp preload_util.cpp + malleability.cpp rpc/rpc_types.cpp rpc/forward_data.cpp rpc/forward_data_proxy.cpp rpc/forward_management.cpp rpc/forward_metadata.cpp rpc/forward_metadata_proxy.cpp + rpc/forward_malleability.cpp syscalls/detail/syscall_info.c syscalls/util.S ) diff --git a/src/client/malleability.cpp b/src/client/malleability.cpp new file mode 100644 index 000000000..156b9a079 --- /dev/null +++ b/src/client/malleability.cpp @@ -0,0 +1,55 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS' POSIX interface. + + GekkoFS' POSIX interface is free software: you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + GekkoFS' POSIX interface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with GekkoFS' POSIX interface. If not, see + . 
+ + SPDX-License-Identifier: LGPL-3.0-or-later +*/ + +#include +#include +#include + +namespace gkfs::malleable { + +int +expand_start(int old_server_conf, int new_server_conf) { + LOG(INFO, "{}() Expand operation started", __func__); + return gkfs::malleable::rpc::forward_expand_start(old_server_conf, + new_server_conf); +} + +int +expand_status() { + LOG(INFO, "{}() Expand operation status", __func__); + return gkfs::malleable::rpc::forward_expand_status(); +} + +int +expand_finalize() { + LOG(INFO, "{}() Expand operation finalize", __func__); + return gkfs::malleable::rpc::forward_expand_finalize(); +} + +} // namespace gkfs::malleable \ No newline at end of file diff --git a/src/client/rpc/forward_malleability.cpp b/src/client/rpc/forward_malleability.cpp new file mode 100644 index 000000000..ec02a8170 --- /dev/null +++ b/src/client/rpc/forward_malleability.cpp @@ -0,0 +1,222 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS' POSIX interface. + + GekkoFS' POSIX interface is free software: you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + GekkoFS' POSIX interface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with GekkoFS' POSIX interface. If not, see + . + + SPDX-License-Identifier: LGPL-3.0-or-later +*/ +#include +#include +#include +#include +#include +#include + +namespace gkfs::malleable::rpc { + +int +forward_expand_start(int old_server_conf, int new_server_conf) { + LOG(INFO, "{}() enter", __func__); + auto const targets = CTX->distributor()->locate_directory_metadata(); + + auto err = 0; + // send async RPCs + std::vector> handles; + + for(std::size_t i = 0; i < targets.size(); ++i) { + + // Setup rpc input parameters for each host + auto endp = CTX->hosts().at(targets[i]); + + gkfs::malleable::rpc::expand_start::input in(old_server_conf, + new_server_conf); + + try { + LOG(DEBUG, "{}() Sending RPC to host: '{}'", __func__, targets[i]); + handles.emplace_back( + ld_network_service + ->post(endp, + in)); + } catch(const std::exception& ex) { + LOG(ERROR, + "{}() Unable to send non-blocking forward_expand_start() [peer: {}] err '{}'", + __func__, targets[i], ex.what()); + err = EBUSY; + break; // we need to gather responses from already sent RPCS + } + } + + LOG(INFO, "{}() send expand_start rpc to '{}' targets", __func__, + targets.size()); + + // wait for RPC responses + for(std::size_t i = 0; i < handles.size(); ++i) { + + gkfs::malleable::rpc::expand_start::output out; + + try { + out = handles[i].get().at(0); + + if(out.err() != 0) { + LOG(ERROR, + "{}() Failed to retrieve dir entries from host '{}'. 
Error '{}'", + __func__, targets[i], strerror(out.err())); + err = out.err(); + // We need to gather all responses before exiting + continue; + } + } catch(const std::exception& ex) { + LOG(ERROR, + "{}() Failed to get rpc output.. [target host: {}] err '{}'", + __func__, targets[i], ex.what()); + err = EBUSY; + // We need to gather all responses before exiting + continue; + } + } + return err; +} + +int +forward_expand_status() { + LOG(INFO, "{}() enter", __func__); + auto const targets = CTX->distributor()->locate_directory_metadata(); + + auto err = 0; + // send async RPCs + std::vector> + handles; + + for(std::size_t i = 0; i < targets.size(); ++i) { + + // Setup rpc input parameters for each host + auto endp = CTX->hosts().at(targets[i]); + + try { + LOG(DEBUG, "{}() Sending RPC to host: '{}'", __func__, targets[i]); + handles.emplace_back( + ld_network_service + ->post(endp)); + } catch(const std::exception& ex) { + LOG(ERROR, + "{}() Unable to send non-blocking forward_expand_status() [peer: {}] err '{}'", + __func__, targets[i], ex.what()); + err = EBUSY; + break; // we need to gather responses from already sent RPCS + } + } + + LOG(INFO, "{}() send expand_status rpc to '{}' targets", __func__, + targets.size()); + + // wait for RPC responses + for(std::size_t i = 0; i < handles.size(); ++i) { + + gkfs::malleable::rpc::expand_status::output out; + + try { + out = handles[i].get().at(0); + + if(out.err() != 0) { + LOG(ERROR, + "{}() Failed to retrieve dir entries from host '{}'. Error '{}'", + __func__, targets[i], strerror(out.err())); + err = out.err(); + // We need to gather all responses before exiting + continue; + } + } catch(const std::exception& ex) { + LOG(ERROR, + "{}() Failed to get rpc output.. [target host: {}] err '{}'", + __func__, targets[i], ex.what()); + err = EBUSY; + // We need to gather all responses before exiting + continue; + } + } + return err; +} + +int +forward_expand_finalize() { + LOG(INFO, "{}() enter", __func__); + auto const targets = CTX->distributor()->locate_directory_metadata(); + + auto err = 0; + // send async RPCs + std::vector> + handles; + + for(std::size_t i = 0; i < targets.size(); ++i) { + + // Setup rpc input parameters for each host + auto endp = CTX->hosts().at(targets[i]); + + try { + LOG(DEBUG, "{}() Sending RPC to host: '{}'", __func__, targets[i]); + handles.emplace_back( + ld_network_service + ->post( + endp)); + } catch(const std::exception& ex) { + LOG(ERROR, + "{}() Unable to send non-blocking forward_expand_finalize() [peer: {}] err '{}'", + __func__, targets[i], ex.what()); + err = EBUSY; + break; // we need to gather responses from already sent RPCS + } + } + + LOG(INFO, "{}() send expand_finalize rpc to '{}' targets", __func__, + targets.size()); + + // wait for RPC responses + for(std::size_t i = 0; i < handles.size(); ++i) { + + gkfs::malleable::rpc::expand_finalize::output out; + + try { + out = handles[i].get().at(0); + + if(out.err() != 0) { + LOG(ERROR, + "{}() Failed to retrieve dir entries from host '{}'. Error '{}'", + __func__, targets[i], strerror(out.err())); + err = out.err(); + // We need to gather all responses before exiting + continue; + } + } catch(const std::exception& ex) { + LOG(ERROR, + "{}() Failed to get rpc output.. 
[target host: {}] err '{}'", + __func__, targets[i], ex.what()); + err = EBUSY; + // We need to gather all responses before exiting + continue; + } + } + return err; +} + +} // namespace gkfs::malleable::rpc \ No newline at end of file diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index c5c309573..4cfe6f60a 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -642,7 +642,7 @@ forward_get_dirents(const string& path) { LOG(DEBUG, "{}() enter for path '{}'", __func__, path) - auto const targets = CTX->distributor()->locate_directory_metadata(path); + auto const targets = CTX->distributor()->locate_directory_metadata(); /* preallocate receiving buffer. The actual size is not known yet. * @@ -793,7 +793,7 @@ forward_get_dirents_single(const string& path, int server) { LOG(DEBUG, "{}() enter for path '{}'", __func__, path) - auto const targets = CTX->distributor()->locate_directory_metadata(path); + auto const targets = CTX->distributor()->locate_directory_metadata(); /* preallocate receiving buffer. The actual size is not known yet. * diff --git a/src/common/rpc/distributor.cpp b/src/common/rpc/distributor.cpp index ad558b9e9..b17a09862 100644 --- a/src/common/rpc/distributor.cpp +++ b/src/common/rpc/distributor.cpp @@ -78,7 +78,7 @@ SimpleHashDistributor::locate_file_metadata(const string& path, } ::vector -SimpleHashDistributor::locate_directory_metadata(const string& path) const { +SimpleHashDistributor::locate_directory_metadata() const { return all_hosts_; } @@ -108,7 +108,7 @@ LocalOnlyDistributor::locate_file_metadata(const string& path, } ::vector -LocalOnlyDistributor::locate_directory_metadata(const string& path) const { +LocalOnlyDistributor::locate_directory_metadata() const { return {localhost_}; } @@ -150,7 +150,7 @@ ForwarderDistributor::locate_file_metadata(const std::string& path, std::vector -ForwarderDistributor::locate_directory_metadata(const std::string& path) const { +ForwarderDistributor::locate_directory_metadata() const { return all_hosts_; } @@ -282,7 +282,7 @@ GuidedDistributor::locate_file_metadata(const string& path, ::vector -GuidedDistributor::locate_directory_metadata(const string& path) const { +GuidedDistributor::locate_directory_metadata() const { return all_hosts_; } -- GitLab From 08da8a319ea674bce2512cbd5f4fe4a26586f7ec Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Wed, 22 May 2024 00:41:18 +0200 Subject: [PATCH 04/17] Adding daemon boiler plate for expansion --- include/client/rpc/rpc_types.hpp | 6 +- include/client/user_functions.hpp | 4 + include/common/common_defs.hpp | 12 +- include/daemon/classes/fs_data.hpp | 26 ++++- include/daemon/handler/rpc_defs.hpp | 9 ++ src/daemon/CMakeLists.txt | 1 + src/daemon/classes/fs_data.cpp | 36 ++++++ src/daemon/daemon.cpp | 7 ++ src/daemon/handler/srv_malleability.cpp | 146 ++++++++++++++++++++++++ 9 files changed, 237 insertions(+), 10 deletions(-) create mode 100644 src/daemon/handler/srv_malleability.cpp diff --git a/include/client/rpc/rpc_types.hpp b/include/client/rpc/rpc_types.hpp index b58f59699..b84c7c179 100644 --- a/include/client/rpc/rpc_types.hpp +++ b/include/client/rpc/rpc_types.hpp @@ -3740,7 +3740,7 @@ struct expand_start { constexpr static const hg_id_t mercury_id = 0; // RPC name - constexpr static const auto name = gkfs::rpc::malleable::tag::expand_start; + constexpr static const auto name = gkfs::malleable::rpc::tag::expand_start; // requires response? 
constexpr static const auto requires_response = true; @@ -3859,7 +3859,7 @@ struct expand_status { constexpr static const hg_id_t mercury_id = 0; // RPC name - constexpr static const auto name = gkfs::rpc::malleable::tag::expand_status; + constexpr static const auto name = gkfs::malleable::rpc::tag::expand_status; // requires response? constexpr static const auto requires_response = true; @@ -3961,7 +3961,7 @@ struct expand_finalize { // RPC name constexpr static const auto name = - gkfs::rpc::malleable::tag::expand_finalize; + gkfs::malleable::rpc::tag::expand_finalize; // requires response? constexpr static const auto requires_response = true; diff --git a/include/client/user_functions.hpp b/include/client/user_functions.hpp index 92d2826e2..5f96ab3c1 100644 --- a/include/client/user_functions.hpp +++ b/include/client/user_functions.hpp @@ -98,6 +98,10 @@ expand_start(int old_server_conf, int new_server_conf); int expand_status(); +/** + * @brief Finalize the expansion process + * @return error code + */ int expand_finalize(); } // namespace malleable diff --git a/include/common/common_defs.hpp b/include/common/common_defs.hpp index fd914b3ba..034d250f1 100644 --- a/include/common/common_defs.hpp +++ b/include/common/common_defs.hpp @@ -81,12 +81,6 @@ constexpr auto proxy_daemon_read = "proxy_daemon_rpc_srv_read_data"; } // namespace tag -namespace malleable::tag { -constexpr auto expand_start = "rpc_srv_expand_start"; -constexpr auto expand_status = "rpc_srv_expand_status"; -constexpr auto expand_finalize = "rpc_srv_expand_finalize"; -} // namespace malleable::tag - namespace protocol { constexpr auto na_sm = "na+sm"; constexpr auto ofi_sockets = "ofi+sockets"; @@ -111,6 +105,12 @@ constexpr auto all_remote_protocols = {ofi_sockets, ofi_tcp, ofi_verbs, } // namespace protocol } // namespace gkfs::rpc +namespace gkfs::malleable::rpc::tag { +constexpr auto expand_start = "rpc_srv_expand_start"; +constexpr auto expand_status = "rpc_srv_expand_status"; +constexpr auto expand_finalize = "rpc_srv_expand_finalize"; +} // namespace gkfs::malleable::rpc::tag + namespace gkfs::config::syscall::stat { // Number 512-byte blocks allocated as it is in the linux kernel (struct_stat.h) constexpr auto st_nblocksize = 512; diff --git a/include/daemon/classes/fs_data.hpp b/include/daemon/classes/fs_data.hpp index 4cde4170e..67b559a62 100644 --- a/include/daemon/classes/fs_data.hpp +++ b/include/daemon/classes/fs_data.hpp @@ -56,7 +56,9 @@ namespace daemon { class FsData { private: - FsData() = default; + FsData(); + + ~FsData(); // logger std::shared_ptr spdlogger_; @@ -104,6 +106,16 @@ private: // Prometheus std::string prometheus_gateway_ = gkfs::config::stats::prometheus_gateway; + // Malleability + // maintenance mode is used to prevent new RPCs to the filesystem and + // indicates for clients: try again. 
Is set to true when redist is running + bool maintenance_mode_ = false; + ABT_mutex maintenance_mode_mutex_; + // redist_running_ indicates to client that redistribution is running + bool redist_running_ = false; + ABT_thread redist_thread_; + + public: static FsData* getInstance() { @@ -284,6 +296,18 @@ public: void prometheus_gateway(const std::string& prometheus_gateway_); + + bool + maintenance_mode() const; + + void + maintenance_mode(bool maintenance_mode); + + bool + redist_running() const; + + void + redist_running(bool redist_running); }; diff --git a/include/daemon/handler/rpc_defs.hpp b/include/daemon/handler/rpc_defs.hpp index 371094966..f234a65fd 100644 --- a/include/daemon/handler/rpc_defs.hpp +++ b/include/daemon/handler/rpc_defs.hpp @@ -84,4 +84,13 @@ DECLARE_MARGO_RPC_HANDLER(proxy_rpc_srv_read) DECLARE_MARGO_RPC_HANDLER(proxy_rpc_srv_write) +// malleability + +DECLARE_MARGO_RPC_HANDLER(rpc_srv_expand_start) + +DECLARE_MARGO_RPC_HANDLER(rpc_srv_expand_status) + +DECLARE_MARGO_RPC_HANDLER(rpc_srv_expand_finalize) + + #endif // GKFS_DAEMON_RPC_DEFS_HPP diff --git a/src/daemon/CMakeLists.txt b/src/daemon/CMakeLists.txt index 89af71c76..d02be0259 100644 --- a/src/daemon/CMakeLists.txt +++ b/src/daemon/CMakeLists.txt @@ -48,6 +48,7 @@ target_sources( classes/rpc_data.cpp handler/srv_metadata.cpp handler/srv_management.cpp + handler/srv_malleability.cpp PUBLIC ${CMAKE_SOURCE_DIR}/include/config.hpp ${CMAKE_SOURCE_DIR}/include/version.hpp.in ) diff --git a/src/daemon/classes/fs_data.cpp b/src/daemon/classes/fs_data.cpp index 909c26fe9..7542c7397 100644 --- a/src/daemon/classes/fs_data.cpp +++ b/src/daemon/classes/fs_data.cpp @@ -33,6 +33,14 @@ namespace gkfs::daemon { +FsData::FsData() { + ABT_mutex_create(&maintenance_mode_mutex_); +} + +FsData::~FsData() { + ABT_mutex_free(&maintenance_mode_mutex_); +} + // getter/setter const std::shared_ptr& @@ -314,4 +322,32 @@ FsData::prometheus_gateway(const std::string& prometheus_gateway) { FsData::prometheus_gateway_ = prometheus_gateway; } +bool +FsData::maintenance_mode() const { + return maintenance_mode_; +} + +void +FsData::maintenance_mode(bool maintenance_mode) { + ABT_mutex_lock(maintenance_mode_mutex_); + if(maintenance_mode && maintenance_mode_) { + auto err_str = + "Critical error: Maintenance mode enabled twice, e.g., due to multiple expand requests. 
This is not a allowed and should not happen."; + spdlogger()->error(err_str); + throw std::runtime_error(err_str); + } + maintenance_mode_ = maintenance_mode; + ABT_mutex_unlock(maintenance_mode_mutex_); +} + +bool +FsData::redist_running() const { + return redist_running_; +} + +void +FsData::redist_running(bool redist_running) { + redist_running_ = redist_running; +} + } // namespace gkfs::daemon diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index f0f55acf8..796883069 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -176,6 +176,13 @@ register_server_rpcs(margo_instance_id mid) { rpc_srv_truncate); MARGO_REGISTER(mid, gkfs::rpc::tag::get_chunk_stat, rpc_chunk_stat_in_t, rpc_chunk_stat_out_t, rpc_srv_get_chunk_stat); + // malleability + MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::expand_start, + rpc_expand_start_in_t, rpc_err_out_t, rpc_srv_expand_start); + MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::expand_status, void, + rpc_err_out_t, rpc_srv_expand_status); + MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::expand_finalize, void, + rpc_err_out_t, rpc_srv_expand_status); } /** diff --git a/src/daemon/handler/srv_malleability.cpp b/src/daemon/handler/srv_malleability.cpp new file mode 100644 index 000000000..294505407 --- /dev/null +++ b/src/daemon/handler/srv_malleability.cpp @@ -0,0 +1,146 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS' POSIX interface. + + GekkoFS' POSIX interface is free software: you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of the License, + or (at your option) any later version. + + GekkoFS' POSIX interface is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with GekkoFS' POSIX interface. If not, see + . 
+ + SPDX-License-Identifier: LGPL-3.0-or-later +*/ +#include +#include + +#include + +extern "C" { +#include +} + +using namespace std; + +namespace { + +hg_return_t +rpc_srv_expand_start(hg_handle_t handle) { + rpc_expand_start_in_t in; + rpc_err_out_t out; + + auto ret = margo_get_input(handle, &in); + if(ret != HG_SUCCESS) + GKFS_DATA->spdlogger()->error( + "{}() Failed to retrieve input from handle", __func__); + assert(ret == HG_SUCCESS); + GKFS_DATA->spdlogger()->debug( + "{}() Got RPC with old conf '{}' new conf '{}'", __func__, + in.old_server_conf, in.new_server_conf); + // if maintenance mode is already set, daemon is blown up as it is not + // allowed + GKFS_DATA->maintenance_mode(true); + GKFS_DATA->redist_running(true); + + // TODO branch off here with ABT thread and start redistribution + // auto abt_err = ABT_thread_create(RPC_DATA->io_pool(), nullptr, + // &task_args_[idx], &abt_tasks_[idx]); + + + // try { + out.err = 0; + // } catch(const std::exception& e) { + // GKFS_DATA->spdlogger()->error("{}() Failed to start expansion: + // '{}'", + // __func__, e.what()); + // out.err = -1; + // } + + GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, + out.err); + auto hret = margo_respond(handle, &out); + if(hret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); + } + + // Destroy handle when finished + margo_free_input(handle, &in); + margo_destroy(handle); + return HG_SUCCESS; +} + +hg_return_t +rpc_srv_expand_status(hg_handle_t handle) { + rpc_err_out_t out; + + GKFS_DATA->spdlogger()->debug("{}() Got RPC ", __func__); + + try { + // return 1 if redistribution is running, 0 otherwise. + out.err = GKFS_DATA->redist_running() ? 1 : 0; + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to check status for expansion: '{}'", __func__, + e.what()); + out.err = -1; + } + + GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, + out.err); + auto hret = margo_respond(handle, &out); + if(hret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); + } + + // Destroy handle when finished + margo_destroy(handle); + return HG_SUCCESS; +} + +hg_return_t +rpc_srv_expand_finalize(hg_handle_t handle) { + rpc_err_out_t out; + GKFS_DATA->spdlogger()->debug("{}() Got RPC ", __func__); + try { + GKFS_DATA->maintenance_mode(false); + out.err = 0; + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error("{}() Failed to finalize expansion: '{}'", + __func__, e.what()); + out.err = -1; + } + + GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, + out.err); + auto hret = margo_respond(handle, &out); + if(hret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); + } + + // Destroy handle when finished + margo_destroy(handle); + return HG_SUCCESS; +} + +} // namespace + +DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_start) + +DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_status) + +DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_finalize) -- GitLab From 2f2381342571259ea94c68588ee68ee31454955f Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Wed, 22 May 2024 23:56:04 +0200 Subject: [PATCH 05/17] Adding daemon MalleableManager base class --- include/daemon/CMakeLists.txt | 1 + include/daemon/classes/fs_data.hpp | 14 +- .../daemon/malleability/malleable_manager.hpp | 55 +++++ src/daemon/CMakeLists.txt | 1 + src/daemon/backend/CMakeLists.txt | 2 +- src/daemon/classes/fs_data.cpp | 12 + src/daemon/daemon.cpp | 16 ++ 
src/daemon/handler/srv_malleability.cpp | 35 +-- src/daemon/malleability/malleable_manager.cpp | 229 ++++++++++++++++++ 9 files changed, 339 insertions(+), 26 deletions(-) create mode 100644 include/daemon/malleability/malleable_manager.hpp create mode 100644 src/daemon/malleability/malleable_manager.cpp diff --git a/include/daemon/CMakeLists.txt b/include/daemon/CMakeLists.txt index 26466ba16..e1d393824 100644 --- a/include/daemon/CMakeLists.txt +++ b/include/daemon/CMakeLists.txt @@ -36,6 +36,7 @@ target_sources( classes/rpc_data.hpp handler/rpc_defs.hpp handler/rpc_util.hpp + malleability/malleable_manager.hpp ) if(GKFS_ENABLE_AGIOS) diff --git a/include/daemon/classes/fs_data.hpp b/include/daemon/classes/fs_data.hpp index 67b559a62..a59cd184b 100644 --- a/include/daemon/classes/fs_data.hpp +++ b/include/daemon/classes/fs_data.hpp @@ -51,6 +51,10 @@ namespace utils { class Stats; } +namespace malleable { +class MalleableManager; +} + namespace daemon { class FsData { @@ -106,15 +110,14 @@ private: // Prometheus std::string prometheus_gateway_ = gkfs::config::stats::prometheus_gateway; - // Malleability // maintenance mode is used to prevent new RPCs to the filesystem and // indicates for clients: try again. Is set to true when redist is running bool maintenance_mode_ = false; ABT_mutex maintenance_mode_mutex_; // redist_running_ indicates to client that redistribution is running bool redist_running_ = false; - ABT_thread redist_thread_; + std::shared_ptr malleable_manager_; public: static FsData* @@ -308,6 +311,13 @@ public: void redist_running(bool redist_running); + + const std::shared_ptr& + malleable_manager() const; + + void + malleable_manager(const std::shared_ptr& + malleable_manager); }; diff --git a/include/daemon/malleability/malleable_manager.hpp b/include/daemon/malleability/malleable_manager.hpp new file mode 100644 index 000000000..eff5baf3b --- /dev/null +++ b/include/daemon/malleability/malleable_manager.hpp @@ -0,0 +1,55 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS. + + GekkoFS is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GekkoFS is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GekkoFS. If not, see . 
+ + SPDX-License-Identifier: GPL-3.0-or-later +*/ +#ifndef GEKKOFS_DAEMON_MALLEABLE_MANAGER_HPP +#define GEKKOFS_DAEMON_MALLEABLE_MANAGER_HPP + +#include + +namespace gkfs::malleable { + +class MalleableManager { +private: + ABT_thread redist_thread_; + + static void + expand_abt(void* _arg); + + void + redistribute_metadata(); + + void + redistribute_data(); + +public: + void + expand_start(int old_server_conf, int new_server_conf); +}; +} // namespace gkfs::malleable + + +#endif // GEKKOFS_MALLEABLE_MANAGER_HPP diff --git a/src/daemon/CMakeLists.txt b/src/daemon/CMakeLists.txt index d02be0259..cbb159a1b 100644 --- a/src/daemon/CMakeLists.txt +++ b/src/daemon/CMakeLists.txt @@ -49,6 +49,7 @@ target_sources( handler/srv_metadata.cpp handler/srv_management.cpp handler/srv_malleability.cpp + malleability/malleable_manager.cpp PUBLIC ${CMAKE_SOURCE_DIR}/include/config.hpp ${CMAKE_SOURCE_DIR}/include/version.hpp.in ) diff --git a/src/daemon/backend/CMakeLists.txt b/src/daemon/backend/CMakeLists.txt index daecf5514..aab7312f3 100644 --- a/src/daemon/backend/CMakeLists.txt +++ b/src/daemon/backend/CMakeLists.txt @@ -27,4 +27,4 @@ ################################################################################ add_subdirectory(metadata) -add_subdirectory(data) +add_subdirectory(data) \ No newline at end of file diff --git a/src/daemon/classes/fs_data.cpp b/src/daemon/classes/fs_data.cpp index 7542c7397..c2d09ffdd 100644 --- a/src/daemon/classes/fs_data.cpp +++ b/src/daemon/classes/fs_data.cpp @@ -350,4 +350,16 @@ FsData::redist_running(bool redist_running) { redist_running_ = redist_running; } +const std::shared_ptr& +FsData::malleable_manager() const { + return malleable_manager_; +} + +void +FsData::malleable_manager( + const std::shared_ptr& + malleable_manager) { + malleable_manager_ = malleable_manager; +} + } // namespace gkfs::daemon diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 796883069..16dbd8d29 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -47,6 +47,7 @@ #include #include #include +#include #include #ifdef GKFS_ENABLE_AGIOS @@ -528,6 +529,21 @@ init_environment() { "{}() Failed to initialize RPC client: {}", __func__, e.what()); throw; } + GKFS_DATA->spdlogger()->debug("{}() RPC client running.", __func__); + GKFS_DATA->spdlogger()->debug("{}() Initializing MalleableManager...", + __func__); + try { + auto malleable_manager = + std::make_shared(); + GKFS_DATA->malleable_manager(malleable_manager); + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to initialize MalleableManager: {}", __func__, + e.what()); + throw; + } + GKFS_DATA->spdlogger()->debug("{}() MalleableManager running.", __func__); + GKFS_DATA->spdlogger()->info("Startup successful. 
Daemon is ready."); } diff --git a/src/daemon/handler/srv_malleability.cpp b/src/daemon/handler/srv_malleability.cpp index 294505407..18da19818 100644 --- a/src/daemon/handler/srv_malleability.cpp +++ b/src/daemon/handler/srv_malleability.cpp @@ -28,6 +28,7 @@ */ #include #include +#include #include @@ -52,24 +53,17 @@ rpc_srv_expand_start(hg_handle_t handle) { GKFS_DATA->spdlogger()->debug( "{}() Got RPC with old conf '{}' new conf '{}'", __func__, in.old_server_conf, in.new_server_conf); - // if maintenance mode is already set, daemon is blown up as it is not - // allowed - GKFS_DATA->maintenance_mode(true); - GKFS_DATA->redist_running(true); - - // TODO branch off here with ABT thread and start redistribution - // auto abt_err = ABT_thread_create(RPC_DATA->io_pool(), nullptr, - // &task_args_[idx], &abt_tasks_[idx]); - - - // try { - out.err = 0; - // } catch(const std::exception& e) { - // GKFS_DATA->spdlogger()->error("{}() Failed to start expansion: - // '{}'", - // __func__, e.what()); - // out.err = -1; - // } + try { + // if maintenance mode is already set, error is thrown -- not allowed + GKFS_DATA->maintenance_mode(true); + GKFS_DATA->malleable_manager()->expand_start(in.old_server_conf, + in.new_server_conf); + out.err = 0; + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error("{}() Failed to start expansion: '{}' ", + __func__, e.what()); + out.err = -1; + } GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, out.err); @@ -77,7 +71,6 @@ rpc_srv_expand_start(hg_handle_t handle) { if(hret != HG_SUCCESS) { GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); } - // Destroy handle when finished margo_free_input(handle, &in); margo_destroy(handle); @@ -87,9 +80,7 @@ rpc_srv_expand_start(hg_handle_t handle) { hg_return_t rpc_srv_expand_status(hg_handle_t handle) { rpc_err_out_t out; - GKFS_DATA->spdlogger()->debug("{}() Got RPC ", __func__); - try { // return 1 if redistribution is running, 0 otherwise. out.err = GKFS_DATA->redist_running() ? 1 : 0; @@ -99,14 +90,12 @@ rpc_srv_expand_status(hg_handle_t handle) { e.what()); out.err = -1; } - GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, out.err); auto hret = margo_respond(handle, &out); if(hret != HG_SUCCESS) { GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); } - // Destroy handle when finished margo_destroy(handle); return HG_SUCCESS; diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp new file mode 100644 index 000000000..440d56177 --- /dev/null +++ b/src/daemon/malleability/malleable_manager.cpp @@ -0,0 +1,229 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS. + + GekkoFS is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GekkoFS is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GekkoFS. If not, see . + + SPDX-License-Identifier: GPL-3.0-or-later +*/ + +#include + +using namespace std; + +namespace gkfs::malleable { + +void +MalleableManager::expand_abt(void* _arg) { + + GKFS_DATA->redist_running(true); + GKFS_DATA->spdlogger()->info("{}() Starting metadata redistribution...", + __func__); + GKFS_DATA->malleable_manager()->redistribute_metadata(); + GKFS_DATA->spdlogger()->info("{}() Metadata redistribution completed.", + __func__); + GKFS_DATA->spdlogger()->info("{}() Starting data redistribution...", + __func__); + GKFS_DATA->malleable_manager()->redistribute_data(); + GKFS_DATA->spdlogger()->info("{}() Data redistribution completed.", + __func__); + GKFS_DATA->redist_running(false); +} + +void +MalleableManager::expand_start(int old_server_conf, int new_server_conf) { + auto abt_err = + ABT_thread_create(RPC_DATA->io_pool(), expand_abt, + ABT_THREAD_ATTR_NULL, nullptr, &redist_thread_); + if(abt_err != ABT_SUCCESS) { + auto err_str = fmt::format( + "MalleableManager::{}() Failed to create ABT thread with abt_err '{}'", + __func__, abt_err); + throw runtime_error(err_str); + } +} +void +MalleableManager::redistribute_metadata() { + // reload_distribution_configuration(); + + // auto mid = RPC_DATA->client_rpc_mid(); + // auto relocate_metadata_id = + // gkfs::rpc::get_rpc_id(mid, gkfs::rpc::tag::relocate_metadata); + // + // auto& distributor = *(GKFS_DATA->distributor()); + // auto hosts = + // dynamic_cast(&distributor) + // ->get_hosts_map(); + // GKFS_DATA->spdlogger()->info("{}() Got host id = {} and parsed {} + // hosts", + // __func__, localhost, hosts.size()); + // + // // Relocate metadata + // for(const auto& [metakey, metavalue] : GKFS_DATA->mdb()->get_all()) { + // if(metakey == "/") { + // continue; + // } + // auto destination = distributor.locate_file_metadata(metakey); + // + // GKFS_DATA->spdlogger()->trace( + // "{}() Metadentry {} : {} {} {}", __func__, metakey, + // metavalue, (destination == localhost ? 
" Stay on " : " -> + // Goto "), destination); + // + // if(destination == localhost) { + // continue; + // } + // // send RPC + // rpc_relocate_metadata_in_t in{}; + // rpc_err_out_t out{}; + // hg_addr_t host_addr{}; + // + // in.key = metakey.c_str(); + // in.value = metavalue.c_str(); + // + // auto ret = margo_addr_lookup(mid, + // hosts[destination].second.c_str(), + // &host_addr); + // assert(ret == HG_SUCCESS); + // + // // let's do this sequential first + // hg_handle_t handle; + // ret = margo_create(mid, host_addr, relocate_metadata_id, &handle); + // assert(ret == HG_SUCCESS); + // + // ret = margo_forward(handle, &in); // blocking + // assert(ret == HG_SUCCESS); + // + // ret = margo_get_output(handle, &out); + // assert(ret == HG_SUCCESS); + // + // // TODO(dauer) catch DB exceptions + // GKFS_DATA->mdb()->remove(in.key); + // + // if(HG_SUCCESS != + // gkfs::rpc::margo_client_cleanup(&handle, &out, &mid, + // &host_addr)) { + // GKFS_DATA->spdlogger()->error("{}() Error during margo + // cleanup", + // __func__); + // } + // } +} + +void +MalleableManager::redistribute_data() { + // Relocate data (chunks) + // auto relocate_chunk_rpc_id = + // gkfs::rpc::get_rpc_id(mid, gkfs::rpc::tag::relocate_chunk); + // for(auto& chunks_dir : + // GKFS_DATA->storage()->chunks_directory_iterator()) { + // if(!chunks_dir.is_directory()) { + // GKFS_DATA->spdlogger()->warn( + // "{}() Expected directory but got something else: {}", + // __func__, chunks_dir.path().string()); + // continue; + // } + // string file_path = GKFS_DATA->storage()->get_file_path( + // chunks_dir.path().filename().string()); + // + // for(auto& chunk_file : fs::directory_iterator(chunks_dir)) { + // if(!chunk_file.is_regular_file()) { + // GKFS_DATA->spdlogger()->warn( + // "{}() Expected regular file but got something + // else: {}", + // __func__, chunk_file.path().string()); + // continue; + // } + // gkfs::rpc::chnk_id_t chunk_id = + // std::stoul(chunk_file.path().filename().string()); + // auto destination = distributor.locate_data(file_path, + // chunk_id); size_t size = chunk_file.file_size(); + // + // GKFS_DATA->spdlogger()->trace( + // "{}() Checking {} chunk: {} size: {} {} {}", __func__, + // file_path, chunk_id, size, + // (destination == localhost ? 
" Stay on" : " -> Goto "), + // destination); + // + // if(destination == localhost) { + // continue; + // } + // + // // prepare bulk + // unique_ptr buf(new char[size]()); + // // read data (blocking) + // hg_size_t bytes_read = GKFS_DATA->storage()->read_chunk( + // file_path, chunk_id, buf.get(), size, 0); + // hg_bulk_t bulk{}; + // char* bufptr = buf.get(); + // auto ret = margo_bulk_create(mid, 1, (void**) &bufptr, + // &bytes_read, + // HG_BULK_READ_ONLY, &bulk); + // assert(ret == HG_SUCCESS); + // + // // send RPC + // rpc_relocate_chunk_in_t in{}; + // rpc_err_out_t out{}; + // hg_addr_t host_addr{}; + // + // in.path = file_path.c_str(); + // in.chunk_id = chunk_id; + // in.bulk_handle = bulk; + // + // ret = margo_addr_lookup(mid, + // hosts[destination].second.c_str(), + // &host_addr); + // assert(ret == HG_SUCCESS); + // + // // let's do this sequential first + // hg_handle_t handle; + // ret = margo_create(mid, host_addr, relocate_chunk_rpc_id, + // &handle); assert(ret == HG_SUCCESS); + // + // ret = margo_forward(handle, &in); // blocking + // assert(ret == HG_SUCCESS); + // + // ret = margo_get_output(handle, &out); + // assert(ret == HG_SUCCESS); + // + // // TODO(dauer) process output + // GKFS_DATA->storage()->remove_chunk(file_path, chunk_id); + // + // // FIXME This can leave behind empty directories, even when + // the + // // whole file is delete later. Three possibilities: + // // 1) Clean them up, but make sure this doesn't break another + // thread + // // creating a new chunk in this directory at the same time. + // // 2) Switch to a flat namespace without directories + // // 3) Ignore and waste some inodes + // + // if(HG_SUCCESS != gkfs::rpc::margo_client_cleanup( + // &handle, &out, &mid, &host_addr, + // &bulk)) { + // cout << "Error during margo cleanup.\n"; + // } + // } + // } +} + +} // namespace gkfs::malleable -- GitLab From 49349524badc7dc87e8a8bc3f1dfa2f8c5d4f6f4 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Thu, 23 May 2024 17:38:19 +0200 Subject: [PATCH 06/17] Added metadata migration --- include/common/common_defs.hpp | 2 + include/common/rpc/rpc_types.hpp | 11 +- include/daemon/CMakeLists.txt | 1 + include/daemon/backend/metadata/db.hpp | 8 +- .../backend/metadata/metadata_backend.hpp | 14 +- .../backend/metadata/parallax_backend.hpp | 2 +- .../backend/metadata/rocksdb_backend.hpp | 5 +- include/daemon/classes/rpc_data.hpp | 27 +- include/daemon/handler/rpc_defs.hpp | 4 + .../daemon/malleability/malleable_manager.hpp | 14 +- .../rpc/forward_redistribution.hpp | 45 +++ src/daemon/CMakeLists.txt | 1 + src/daemon/backend/metadata/db.cpp | 10 +- .../backend/metadata/parallax_backend.cpp | 6 +- .../backend/metadata/rocksdb_backend.cpp | 27 +- src/daemon/classes/rpc_data.cpp | 29 ++ src/daemon/daemon.cpp | 14 +- src/daemon/handler/srv_malleability.cpp | 15 + src/daemon/malleability/malleable_manager.cpp | 285 +++++++++++++----- .../rpc/forward_redistribution.cpp | 82 +++++ 20 files changed, 502 insertions(+), 100 deletions(-) create mode 100644 include/daemon/malleability/rpc/forward_redistribution.hpp create mode 100644 src/daemon/malleability/rpc/forward_redistribution.cpp diff --git a/include/common/common_defs.hpp b/include/common/common_defs.hpp index 034d250f1..430b3106b 100644 --- a/include/common/common_defs.hpp +++ b/include/common/common_defs.hpp @@ -109,6 +109,8 @@ namespace gkfs::malleable::rpc::tag { constexpr auto expand_start = "rpc_srv_expand_start"; constexpr auto expand_status = "rpc_srv_expand_status"; constexpr auto 
expand_finalize = "rpc_srv_expand_finalize"; +constexpr auto migrate_metadata = "rpc_srv_migrate_metadata"; +constexpr auto migrate_data = "rpc_srv_migrate_data"; } // namespace gkfs::malleable::rpc::tag namespace gkfs::config::syscall::stat { diff --git a/include/common/rpc/rpc_types.hpp b/include/common/rpc/rpc_types.hpp index 9b09dbbcb..a7c937a43 100644 --- a/include/common/rpc/rpc_types.hpp +++ b/include/common/rpc/rpc_types.hpp @@ -164,9 +164,18 @@ MERCURY_GEN_PROC(rpc_proxy_get_dirents_in_t, ((hg_const_string_t) (path))((int32_t) (server))( (hg_bulk_t) (bulk_handle))) -// malleability +// malleability client <-> daemon MERCURY_GEN_PROC(rpc_expand_start_in_t, ((uint32_t) (old_server_conf))((uint32_t) (new_server_conf))) +// malleability daemon <-> daemon + +MERCURY_GEN_PROC(rpc_migrate_metadata_in_t, + ((hg_const_string_t) (key))((hg_const_string_t) (value))) + +MERCURY_GEN_PROC(rpc_migrate_data_in_t, + ((hg_const_string_t) (path))((hg_uint64_t) (chunk_id))( + (hg_bulk_t) (bulk_handle))) + #endif // LFS_RPC_TYPES_HPP diff --git a/include/daemon/CMakeLists.txt b/include/daemon/CMakeLists.txt index e1d393824..977c51cbd 100644 --- a/include/daemon/CMakeLists.txt +++ b/include/daemon/CMakeLists.txt @@ -37,6 +37,7 @@ target_sources( handler/rpc_defs.hpp handler/rpc_util.hpp malleability/malleable_manager.hpp + malleability/rpc/forward_redistribution.hpp ) if(GKFS_ENABLE_AGIOS) diff --git a/include/daemon/backend/metadata/db.hpp b/include/daemon/backend/metadata/db.hpp index f1f629e0a..00131cb2c 100644 --- a/include/daemon/backend/metadata/db.hpp +++ b/include/daemon/backend/metadata/db.hpp @@ -164,8 +164,14 @@ public: * @brief Iterate over complete database, note ONLY used for debugging and * is therefore unused. */ - void + void* iterate_all() const; + + /** + * @brief Returns an estimated db size, i.e., number of KV pairs + */ + uint64_t + db_size() const; }; } // namespace gkfs::metadata diff --git a/include/daemon/backend/metadata/metadata_backend.hpp b/include/daemon/backend/metadata/metadata_backend.hpp index 84f7a8f41..584590354 100644 --- a/include/daemon/backend/metadata/metadata_backend.hpp +++ b/include/daemon/backend/metadata/metadata_backend.hpp @@ -73,8 +73,11 @@ public: virtual std::vector> get_dirents_extended(const std::string& dir) const = 0; - virtual void + virtual void* iterate_all() const = 0; + + virtual uint64_t + db_size() const = 0; }; template @@ -137,9 +140,14 @@ public: return static_cast(*this).get_dirents_extended_impl(dir); } - void + void* iterate_all() const { - static_cast(*this).iterate_all_impl(); + return static_cast(*this).iterate_all_impl(); + } + + uint64_t + db_size() const { + return static_cast(*this).db_size_impl(); } }; diff --git a/include/daemon/backend/metadata/parallax_backend.hpp b/include/daemon/backend/metadata/parallax_backend.hpp index eda56690d..c351c0fe6 100644 --- a/include/daemon/backend/metadata/parallax_backend.hpp +++ b/include/daemon/backend/metadata/parallax_backend.hpp @@ -190,7 +190,7 @@ public: * Code example for iterating all entries in KV store. This is for debug * only as it is too expensive */ - void + void* iterate_all_impl() const; }; diff --git a/include/daemon/backend/metadata/rocksdb_backend.hpp b/include/daemon/backend/metadata/rocksdb_backend.hpp index a54472da1..824ae6fc9 100644 --- a/include/daemon/backend/metadata/rocksdb_backend.hpp +++ b/include/daemon/backend/metadata/rocksdb_backend.hpp @@ -174,8 +174,11 @@ public: * Code example for iterating all entries in KV store. 
This is for debug * only as it is too expensive */ - void + void* iterate_all_impl() const; + + uint64_t + db_size_impl() const; }; } // namespace gkfs::metadata diff --git a/include/daemon/classes/rpc_data.hpp b/include/daemon/classes/rpc_data.hpp index d81ecfdae..4ccc6e615 100644 --- a/include/daemon/classes/rpc_data.hpp +++ b/include/daemon/classes/rpc_data.hpp @@ -30,6 +30,7 @@ #define LFS_RPC_DATA_HPP #include +#include namespace gkfs { @@ -42,7 +43,8 @@ class Distributor; namespace daemon { struct margo_client_ids { - hg_id_t test_rpc_id; + hg_id_t migrate_metadata_id; + hg_id_t migrate_data_id; }; class RPCData { @@ -54,8 +56,13 @@ private: // contexts that were created at init time margo_instance_id server_rpc_mid_; margo_instance_id proxy_server_rpc_mid_; + // client margo_instance_id client_rpc_mid_; margo_client_ids rpc_client_ids_{}; + std::map rpc_endpoints_; + uint64_t hosts_size_; + uint64_t local_host_id_; + // Argobots I/O pools and execution streams ABT_pool io_pool_; @@ -100,6 +107,24 @@ public: margo_client_ids& rpc_client_ids(); + std::map& + rpc_endpoints(); + + void + rpc_endpoints(const std::map& rpc_endpoints); + + uint64_t + hosts_size() const; + + void + hosts_size(uint64_t hosts_size); + + uint64_t + local_host_id() const; + + void + local_host_id(uint64_t local_host_id); + ABT_pool io_pool() const; diff --git a/include/daemon/handler/rpc_defs.hpp b/include/daemon/handler/rpc_defs.hpp index f234a65fd..bb6c2ce13 100644 --- a/include/daemon/handler/rpc_defs.hpp +++ b/include/daemon/handler/rpc_defs.hpp @@ -92,5 +92,9 @@ DECLARE_MARGO_RPC_HANDLER(rpc_srv_expand_status) DECLARE_MARGO_RPC_HANDLER(rpc_srv_expand_finalize) +DECLARE_MARGO_RPC_HANDLER(rpc_srv_migrate_metadata) + +DECLARE_MARGO_RPC_HANDLER(rpc_srv_migrate_data) + #endif // GKFS_DAEMON_RPC_DEFS_HPP diff --git a/include/daemon/malleability/malleable_manager.hpp b/include/daemon/malleability/malleable_manager.hpp index eff5baf3b..3293f1d20 100644 --- a/include/daemon/malleability/malleable_manager.hpp +++ b/include/daemon/malleability/malleable_manager.hpp @@ -36,10 +36,22 @@ class MalleableManager { private: ABT_thread redist_thread_; + // TODO next 3 functions are mostly copy paste from preload_util. FIX + + std::vector> + load_hostfile(const std::string& path); + + std::vector> + read_hosts_file(); + + void + connect_to_hosts( + const std::vector>& hosts); + static void expand_abt(void* _arg); - void + int redistribute_metadata(); void diff --git a/include/daemon/malleability/rpc/forward_redistribution.hpp b/include/daemon/malleability/rpc/forward_redistribution.hpp new file mode 100644 index 000000000..b42fa968f --- /dev/null +++ b/include/daemon/malleability/rpc/forward_redistribution.hpp @@ -0,0 +1,45 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS. + + GekkoFS is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + GekkoFS is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GekkoFS. If not, see . + + SPDX-License-Identifier: GPL-3.0-or-later +*/ + +#ifndef GEKKOFS_DAEMON_FORWARD_REDISTRIBUTION_HPP +#define GEKKOFS_DAEMON_FORWARD_REDISTRIBUTION_HPP + +#include +#include + +namespace gkfs::malleable::rpc { + +int +forward_metadata(std::string& key, std::string& value, unsigned int dest_id); + +void +forward_data(); + +} // namespace gkfs::malleable::rpc + +#endif // GEKKOFS_DAEMON_FORWARD_REDISTRIBUTION_HPP \ No newline at end of file diff --git a/src/daemon/CMakeLists.txt b/src/daemon/CMakeLists.txt index cbb159a1b..d71a36d54 100644 --- a/src/daemon/CMakeLists.txt +++ b/src/daemon/CMakeLists.txt @@ -50,6 +50,7 @@ target_sources( handler/srv_management.cpp handler/srv_malleability.cpp malleability/malleable_manager.cpp + malleability/rpc/forward_redistribution.cpp PUBLIC ${CMAKE_SOURCE_DIR}/include/config.hpp ${CMAKE_SOURCE_DIR}/include/version.hpp.in ) diff --git a/src/daemon/backend/metadata/db.cpp b/src/daemon/backend/metadata/db.cpp index da015da43..a2e0e59b3 100644 --- a/src/daemon/backend/metadata/db.cpp +++ b/src/daemon/backend/metadata/db.cpp @@ -173,16 +173,20 @@ MetadataDB::get_dirents_extended(const std::string& dir) const { return backend_->get_dirents_extended(root_path); } - /** * @internal * Code example for iterating all entries in KV store. This is for debug only as * it is too expensive. * @endinternal */ -void +void* MetadataDB::iterate_all() const { - backend_->iterate_all(); + return backend_->iterate_all(); +} + +uint64_t +MetadataDB::db_size() const { + return backend_->db_size(); } } // namespace gkfs::metadata diff --git a/src/daemon/backend/metadata/parallax_backend.cpp b/src/daemon/backend/metadata/parallax_backend.cpp index ee9e5e9ec..16da64738 100644 --- a/src/daemon/backend/metadata/parallax_backend.cpp +++ b/src/daemon/backend/metadata/parallax_backend.cpp @@ -529,8 +529,10 @@ ParallaxBackend::get_dirents_extended_impl(const std::string& dir) const { * Code example for iterating all entries in KV store. This is for debug only as * it is too expensive */ -void -ParallaxBackend::iterate_all_impl() const {} +void* +ParallaxBackend::iterate_all_impl() const { + return nullptr; +} } // namespace gkfs::metadata diff --git a/src/daemon/backend/metadata/rocksdb_backend.cpp b/src/daemon/backend/metadata/rocksdb_backend.cpp index 508e7bd82..0c6fd55e6 100644 --- a/src/daemon/backend/metadata/rocksdb_backend.cpp +++ b/src/daemon/backend/metadata/rocksdb_backend.cpp @@ -388,17 +388,26 @@ RocksDBBackend::get_dirents_extended_impl(const std::string& dir) const { * Code example for iterating all entries in KV store. 
This is for debug only as * it is too expensive */ -void +void* RocksDBBackend::iterate_all_impl() const { - std::string key; - std::string val; + // std::string key; + // std::string val; // Do RangeScan on parent inode - auto iter = db_->NewIterator(rdb::ReadOptions()); - for(iter->SeekToFirst(); iter->Valid(); iter->Next()) { - key = iter->key().ToString(); - val = iter->value().ToString(); - std::cout << key << std::endl; - } + // auto iter = db_->NewIterator(rdb::ReadOptions()); + // for(iter->SeekToFirst(); iter->Valid(); iter->Next()) { + // key = iter->key().ToString(); + // val = iter->value().ToString(); + // } + // TODO Fix this hacky solution. Returning void* is not a good idea :> + return static_cast(db_->NewIterator(rdb::ReadOptions())); +} + +uint64_t +RocksDBBackend::db_size_impl() const { + // TODO error handling + uint64_t num_keys = 0; + db_->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return num_keys; } /** diff --git a/src/daemon/classes/rpc_data.cpp b/src/daemon/classes/rpc_data.cpp index 024f409c8..6ec5c0673 100644 --- a/src/daemon/classes/rpc_data.cpp +++ b/src/daemon/classes/rpc_data.cpp @@ -69,6 +69,35 @@ RPCData::rpc_client_ids() { return rpc_client_ids_; } +std::map& +RPCData::rpc_endpoints() { + return rpc_endpoints_; +} + +void +RPCData::rpc_endpoints(const std::map& rpc_endpoints) { + rpc_endpoints_ = rpc_endpoints; +} + +uint64_t +RPCData::hosts_size() const { + return hosts_size_; +} +void +RPCData::hosts_size(uint64_t hosts_size) { + hosts_size_ = hosts_size; +} + +uint64_t +RPCData::local_host_id() const { + return local_host_id_; +} + +void +RPCData::local_host_id(uint64_t local_host_id) { + local_host_id_ = local_host_id; +} + ABT_pool RPCData::io_pool() const { return io_pool_; diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 16dbd8d29..5986c1faa 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -184,6 +184,11 @@ register_server_rpcs(margo_instance_id mid) { rpc_err_out_t, rpc_srv_expand_status); MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::expand_finalize, void, rpc_err_out_t, rpc_srv_expand_status); + MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_metadata, + rpc_migrate_metadata_in_t, rpc_err_out_t, + rpc_srv_migrate_metadata); + MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_data, + rpc_migrate_data_in_t, rpc_err_out_t, rpc_srv_migrate_data); } /** @@ -253,9 +258,12 @@ init_rpc_server() { */ void register_client_rpcs(margo_instance_id mid) { - // TODO - RPC_DATA->rpc_client_ids().test_rpc_id = MARGO_REGISTER( - mid, gkfs::rpc::tag::fs_config, void, rpc_config_out_t, NULL); + RPC_DATA->rpc_client_ids().migrate_metadata_id = + MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_metadata, + rpc_migrate_metadata_in_t, rpc_err_out_t, NULL); + // RPC_DATA->rpc_client_ids().migrate_data_id = MARGO_REGISTER( + // mid, gkfs::malleable::rpc::tag::migrate_metadata, + // rpc_migrate_metadata_in_t, rpc_err_out_t, NULL); } /** diff --git a/src/daemon/handler/srv_malleability.cpp b/src/daemon/handler/srv_malleability.cpp index 18da19818..73ccae06c 100644 --- a/src/daemon/handler/srv_malleability.cpp +++ b/src/daemon/handler/srv_malleability.cpp @@ -62,6 +62,7 @@ rpc_srv_expand_start(hg_handle_t handle) { } catch(const std::exception& e) { GKFS_DATA->spdlogger()->error("{}() Failed to start expansion: '{}' ", __func__, e.what()); + GKFS_DATA->maintenance_mode(false); out.err = -1; } @@ -126,6 +127,16 @@ rpc_srv_expand_finalize(hg_handle_t handle) { return HG_SUCCESS; } +hg_return_t 
+rpc_srv_migrate_metadata(hg_handle_t handle) { + return HG_SUCCESS; +} + +hg_return_t +rpc_srv_migrate_data(hg_handle_t handle) { + return HG_SUCCESS; +} + } // namespace DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_start) @@ -133,3 +144,7 @@ DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_start) DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_status) DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_finalize) + +DEFINE_MARGO_RPC_HANDLER(rpc_srv_migrate_metadata) + +DEFINE_MARGO_RPC_HANDLER(rpc_srv_migrate_data) diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp index 440d56177..e4c1a34e5 100644 --- a/src/daemon/malleability/malleable_manager.cpp +++ b/src/daemon/malleability/malleable_manager.cpp @@ -27,30 +27,188 @@ */ #include +#include + +#include + +#include + +#include +#include +#include using namespace std; namespace gkfs::malleable { +vector> +MalleableManager::load_hostfile(const std::string& path) { + + GKFS_DATA->spdlogger()->info("{}() Loading hosts file '{}'", __func__, + path); + + ifstream lf(path); + if(!lf) { + throw runtime_error(fmt::format("Failed to open hosts file '{}': {}", + path, strerror(errno))); + } + vector> hosts; + const regex line_re("^(\\S+)\\s+(\\S+)\\s*(\\S*)$", + regex::ECMAScript | regex::optimize); + string line; + string host; + string uri; + std::smatch match; + while(getline(lf, line)) { + // if line starts with #, it indicates the end of current FS instance + // It is therefore skipped + if(line[0] == '#') + continue; + if(!regex_match(line, match, line_re)) { + GKFS_DATA->spdlogger()->error( + "{}() Unrecognized line format: [path: '{}', line: '{}']", + path, line); + throw runtime_error( + fmt::format("unrecognized line format: '{}'", line)); + } + host = match[1]; + uri = match[2]; + hosts.emplace_back(host, uri); + } + if(hosts.empty()) { + throw runtime_error( + "Hosts file found but no suitable addresses could be extracted"); + } + // sort hosts so that data always hashes to the same place + std::sort(hosts.begin(), hosts.end()); + // remove rootdir suffix from host after sorting as no longer required + for(auto& h : hosts) { + auto idx = h.first.rfind("#"); + if(idx != string::npos) + h.first.erase(idx, h.first.length()); + } + return hosts; +} + +vector> +MalleableManager::read_hosts_file() { + auto hostfile = GKFS_DATA->hosts_file(); + GKFS_DATA->spdlogger()->info("{}() Reading hosts file...", __func__); + + vector> hosts; + try { + hosts = load_hostfile(hostfile); + } catch(const exception& e) { + auto emsg = fmt::format("Failed to load hosts file: {}", e.what()); + throw runtime_error(emsg); + } + + if(hosts.empty()) { + throw runtime_error(fmt::format("Hostfile empty: '{}'", hostfile)); + } + GKFS_DATA->spdlogger()->info( + "{}() Number of hosts for current instance '{}'", __func__, + hosts.size()); + return hosts; +} + void -MalleableManager::expand_abt(void* _arg) { +MalleableManager::connect_to_hosts( + const vector>& hosts) { + auto local_hostname = gkfs::rpc::get_my_hostname(true); + bool local_host_found = false; - GKFS_DATA->redist_running(true); - GKFS_DATA->spdlogger()->info("{}() Starting metadata redistribution...", + RPC_DATA->hosts_size(hosts.size()); + vector host_ids(hosts.size()); + // populate vector with [0, ..., host_size - 1] + ::iota(::begin(host_ids), ::end(host_ids), 0); + /* + * Shuffle hosts to balance addr lookups to all hosts + * Too many concurrent lookups send to same host + * could overwhelm the server, + * returning error when addr lookup + */ + ::random_device rd; // obtain a 
random number from hardware + ::mt19937 g(rd()); // seed the random generator + ::shuffle(host_ids.begin(), host_ids.end(), g); // Shuffle hosts vector + // lookup addresses and put abstract server addresses into rpc_addresses + for(const auto& id : host_ids) { + const auto& hostname = hosts.at(id).first; + const auto& uri = hosts.at(id).second; + + hg_addr_t svr_addr = HG_ADDR_NULL; + + // try to look up 3 times before erroring out + hg_return_t ret; + for(uint32_t i = 0; i < 4; i++) { + ret = margo_addr_lookup(RPC_DATA->client_rpc_mid(), uri.c_str(), + &svr_addr); + if(ret != HG_SUCCESS) { + // still not working after 5 tries. + if(i == 3) { + auto err_msg = + fmt::format("{}() Unable to lookup address '{}'", + __func__, uri); + throw runtime_error(err_msg); + } + // Wait a random amount of time and try again + ::mt19937 eng(rd()); // seed the random generator + ::uniform_int_distribution<> distr( + 50, 50 * (i + 2)); // define the range + ::this_thread::sleep_for(std::chrono::milliseconds(distr(eng))); + } else { + break; + } + } + if(svr_addr == HG_ADDR_NULL) { + auto err_msg = fmt::format( + "{}() looked up address is NULL for address '{}'", __func__, + uri); + throw runtime_error(err_msg); + } + RPC_DATA->rpc_endpoints().insert(make_pair(id, svr_addr)); + + if(!local_host_found && hostname == local_hostname) { + GKFS_DATA->spdlogger()->debug("{}() Found local host: {}", __func__, + hostname); + RPC_DATA->local_host_id(id); + local_host_found = true; + } + GKFS_DATA->spdlogger()->debug("{}() Found daemon: id '{}' uri '{}'", + __func__, id, uri); + } + if(!local_host_found) { + auto err_msg = fmt::format( + "{}() Local host '{}' not found in hosts file. This should not happen.", + __func__, local_hostname); + throw runtime_error(err_msg); + } +} + +void +MalleableManager::expand_abt(void* _arg) { + GKFS_DATA->spdlogger()->info("{}() Starting expansion process...", __func__); + GKFS_DATA->redist_running(true); GKFS_DATA->malleable_manager()->redistribute_metadata(); - GKFS_DATA->spdlogger()->info("{}() Metadata redistribution completed.", - __func__); - GKFS_DATA->spdlogger()->info("{}() Starting data redistribution...", - __func__); GKFS_DATA->malleable_manager()->redistribute_data(); - GKFS_DATA->spdlogger()->info("{}() Data redistribution completed.", - __func__); GKFS_DATA->redist_running(false); + GKFS_DATA->spdlogger()->info( + "{}() Expansion process successfully finished.", __func__); } void MalleableManager::expand_start(int old_server_conf, int new_server_conf) { + auto hosts = read_hosts_file(); + if(hosts.size() != static_cast(new_server_conf)) { + throw runtime_error( + fmt::format("MalleableManager::{}() Something is wrong. 
" + "Number of hosts in hosts file ({}) " + "does not match new server configuration ({})", + __func__, hosts.size(), new_server_conf)); + } + connect_to_hosts(hosts); + auto abt_err = ABT_thread_create(RPC_DATA->io_pool(), expand_abt, ABT_THREAD_ATTR_NULL, nullptr, &redist_thread_); @@ -61,76 +219,55 @@ MalleableManager::expand_start(int old_server_conf, int new_server_conf) { throw runtime_error(err_str); } } -void -MalleableManager::redistribute_metadata() { - // reload_distribution_configuration(); - // auto mid = RPC_DATA->client_rpc_mid(); - // auto relocate_metadata_id = - // gkfs::rpc::get_rpc_id(mid, gkfs::rpc::tag::relocate_metadata); - // - // auto& distributor = *(GKFS_DATA->distributor()); - // auto hosts = - // dynamic_cast(&distributor) - // ->get_hosts_map(); - // GKFS_DATA->spdlogger()->info("{}() Got host id = {} and parsed {} - // hosts", - // __func__, localhost, hosts.size()); - // - // // Relocate metadata - // for(const auto& [metakey, metavalue] : GKFS_DATA->mdb()->get_all()) { - // if(metakey == "/") { - // continue; - // } - // auto destination = distributor.locate_file_metadata(metakey); - // - // GKFS_DATA->spdlogger()->trace( - // "{}() Metadentry {} : {} {} {}", __func__, metakey, - // metavalue, (destination == localhost ? " Stay on " : " -> - // Goto "), destination); - // - // if(destination == localhost) { - // continue; - // } - // // send RPC - // rpc_relocate_metadata_in_t in{}; - // rpc_err_out_t out{}; - // hg_addr_t host_addr{}; - // - // in.key = metakey.c_str(); - // in.value = metavalue.c_str(); - // - // auto ret = margo_addr_lookup(mid, - // hosts[destination].second.c_str(), - // &host_addr); - // assert(ret == HG_SUCCESS); - // - // // let's do this sequential first - // hg_handle_t handle; - // ret = margo_create(mid, host_addr, relocate_metadata_id, &handle); - // assert(ret == HG_SUCCESS); - // - // ret = margo_forward(handle, &in); // blocking - // assert(ret == HG_SUCCESS); - // - // ret = margo_get_output(handle, &out); - // assert(ret == HG_SUCCESS); - // - // // TODO(dauer) catch DB exceptions - // GKFS_DATA->mdb()->remove(in.key); - // - // if(HG_SUCCESS != - // gkfs::rpc::margo_client_cleanup(&handle, &out, &mid, - // &host_addr)) { - // GKFS_DATA->spdlogger()->error("{}() Error during margo - // cleanup", - // __func__); - // } - // } +int +MalleableManager::redistribute_metadata() { + uint64_t count = 0; + auto estimate_db_size = GKFS_DATA->mdb()->db_size(); + auto percent_interval = estimate_db_size / 1000; + GKFS_DATA->spdlogger()->info( + "{}() Starting metadata redistribution for '{}' estimated number of KV pairs...", + estimate_db_size, __func__); + int migration_err = 0; + string key, value; + auto iter = + static_cast(GKFS_DATA->mdb()->iterate_all()); + for(iter->SeekToFirst(); iter->Valid(); iter->Next()) { + key = iter->key().ToString(); + value = iter->value().ToString(); + if(key == "/") { + continue; + } + auto dest_id = RPC_DATA->distributor()->locate_file_metadata(key, 0); + if(dest_id == RPC_DATA->local_host_id()) { + continue; + } + auto err = gkfs::malleable::rpc::forward_metadata(key, value, dest_id); + if(err != 0) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to migrate metadata for key '{}'", __func__, + key); + migration_err++; + } + GKFS_DATA->mdb()->remove(key); + count++; + if(percent_interval > 0 && count % percent_interval == 0) { + GKFS_DATA->spdlogger()->info( + "{}() Metadata migration {}%/100% completed...", __func__, + count / percent_interval); + } + } + GKFS_DATA->spdlogger()->info("{}() 
Metadata redistribution completed.", + __func__); + return migration_err; } void MalleableManager::redistribute_data() { + GKFS_DATA->spdlogger()->info("{}() Starting data redistribution...", + __func__); + GKFS_DATA->spdlogger()->info("{}() Data redistribution completed.", + __func__); // Relocate data (chunks) // auto relocate_chunk_rpc_id = // gkfs::rpc::get_rpc_id(mid, gkfs::rpc::tag::relocate_chunk); diff --git a/src/daemon/malleability/rpc/forward_redistribution.cpp b/src/daemon/malleability/rpc/forward_redistribution.cpp new file mode 100644 index 000000000..7b1a4e71c --- /dev/null +++ b/src/daemon/malleability/rpc/forward_redistribution.cpp @@ -0,0 +1,82 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS. + + GekkoFS is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GekkoFS is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GekkoFS. If not, see . + + SPDX-License-Identifier: GPL-3.0-or-later +*/ + +#include +#include + + +namespace gkfs::malleable::rpc { + +int +forward_metadata(std::string& key, std::string& value, unsigned int dest_id) { + hg_handle_t rpc_handle = nullptr; + rpc_migrate_metadata_in_t in{}; + rpc_err_out_t out{}; + int err; + // Create handle + GKFS_DATA->spdlogger()->debug("{}() Creating Margo handle ...", __func__); + auto endp = RPC_DATA->rpc_endpoints().at(dest_id); + auto ret = margo_create(RPC_DATA->client_rpc_mid(), endp, + RPC_DATA->rpc_client_ids().migrate_metadata_id, + &rpc_handle); + if(ret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error( + "{}() Critical error. 
Cannot create margo handle", __func__); + return EBUSY; + } + ret = margo_forward(rpc_handle, &in); + if(ret == HG_SUCCESS) { + // Get response + GKFS_DATA->spdlogger()->trace("{}() Waiting for response", __func__); + ret = margo_get_output(rpc_handle, &out); + if(ret == HG_SUCCESS) { + GKFS_DATA->spdlogger()->debug("{}() Got response success: {}", + __func__, out.err); + err = out.err; + margo_free_output(rpc_handle, &out); + } else { + // something is wrong + err = EBUSY; + GKFS_DATA->spdlogger()->error("{}() while getting rpc output", + __func__); + } + } else { + // something is wrong + err = EBUSY; + GKFS_DATA->spdlogger()->error("{}() sending rpc failed", __func__); + } + + /* clean up resources consumed by this rpc */ + margo_destroy(rpc_handle); + return err; +} + +void +forward_data() {} + +} // namespace gkfs::malleable::rpc \ No newline at end of file -- GitLab From c84a5738fc716a63ee9148f3e1922d6543b691a9 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Thu, 23 May 2024 22:57:21 +0200 Subject: [PATCH 07/17] Adding gkfs_malleability executable, fixing RPC bugs --- src/client/malleability.cpp | 7 +++-- src/client/rpc/rpc_types.cpp | 6 ++++ src/daemon/daemon.cpp | 2 +- tools/CMakeLists.txt | 13 ++++++-- tools/malleability.cpp | 60 ++++++++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 tools/malleability.cpp diff --git a/src/client/malleability.cpp b/src/client/malleability.cpp index 156b9a079..637050312 100644 --- a/src/client/malleability.cpp +++ b/src/client/malleability.cpp @@ -35,9 +35,10 @@ namespace gkfs::malleable { int expand_start(int old_server_conf, int new_server_conf) { - LOG(INFO, "{}() Expand operation started", __func__); - return gkfs::malleable::rpc::forward_expand_start(old_server_conf, - new_server_conf); + LOG(INFO, "{}() Expand operation enter", __func__); + gkfs::malleable::rpc::forward_expand_start(old_server_conf, + new_server_conf); + return 0; } int diff --git a/src/client/rpc/rpc_types.cpp b/src/client/rpc/rpc_types.cpp index 8dc03c911..ed447bdd2 100644 --- a/src/client/rpc/rpc_types.cpp +++ b/src/client/rpc/rpc_types.cpp @@ -60,6 +60,12 @@ hermes::detail::register_user_request_types(uint32_t provider_id) { (void) registered_requests().add(provider_id); (void) registered_requests().add( provider_id); + (void) registered_requests().add( + provider_id); + (void) registered_requests().add( + provider_id); + (void) registered_requests().add( + provider_id); } else { (void) registered_requests().add( provider_id); diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 5986c1faa..a56ca28f1 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -183,7 +183,7 @@ register_server_rpcs(margo_instance_id mid) { MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::expand_status, void, rpc_err_out_t, rpc_srv_expand_status); MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::expand_finalize, void, - rpc_err_out_t, rpc_srv_expand_status); + rpc_err_out_t, rpc_srv_expand_finalize); MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_metadata, rpc_migrate_metadata_in_t, rpc_err_out_t, rpc_srv_migrate_metadata); diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 9c3e16519..f9d7d6b8e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,6 +1,6 @@ ################################################################################ -# Copyright 2018-2023, Barcelona Supercomputing Center (BSC), Spain # -# Copyright 2015-2023, Johannes Gutenberg Universitaet Mainz, Germany # +# Copyright 
2018-2024, Barcelona Supercomputing Center (BSC), Spain # +# Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany # # # # This software was partially supported by the # # EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). # @@ -35,4 +35,11 @@ if (GKFS_ENABLE_CLIENT_METRICS) ) target_link_libraries(gkfs_clientmetrics2json PUBLIC msgpack_util nlohmann_json::nlohmann_json) install(TARGETS gkfs_clientmetrics2json RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) -endif () \ No newline at end of file +endif () + +add_executable(gkfs_malleability malleability.cpp) +target_link_libraries(gkfs_malleability + PUBLIC + gkfs_user_lib + CLI11::CLI11) +install(TARGETS gkfs_malleability RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/tools/malleability.cpp b/tools/malleability.cpp new file mode 100644 index 000000000..8f0474605 --- /dev/null +++ b/tools/malleability.cpp @@ -0,0 +1,60 @@ +/* + Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain + Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany + + This software was partially supported by the + EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). + + This software was partially supported by the + ADA-FS project under the SPPEXA project funded by the DFG. + + This file is part of GekkoFS. + + GekkoFS is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GekkoFS is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GekkoFS. If not, see . 
+ + SPDX-License-Identifier: GPL-3.0-or-later +*/ + +#include +// #include +#include + +#include +#include + + +using namespace std; + +struct cli_options { + string hosts_file; +}; + +int +main(int argc, const char* argv[]) { + CLI::App desc{"Allowed options"}; + cli_options opts{}; + + auto res = gkfs_init(); + cout << "Init result " << res << endl; + + res = gkfs::malleable::expand_start(1, 2); + cout << "Expand start " << res << endl; + res = gkfs::malleable::expand_status(); + cout << "Expand status " << res << endl; + res = gkfs::malleable::expand_finalize(); + cout << "Expand finalize " << res << endl; + + res = gkfs_end(); + cout << "End result " << res << endl; +} \ No newline at end of file -- GitLab From 0190d1cf764a9f35b031a17d6e3ff64fb77194b4 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 24 May 2024 00:31:50 +0200 Subject: [PATCH 08/17] Initial functional checks for communication to expansion thread succeeded --- include/common/common_defs.hpp | 22 ++++++++------ scripts/run/gkfs | 3 +- src/client/malleability.cpp | 27 +++++++++++++++-- src/client/preload_util.cpp | 10 +++---- src/daemon/malleability/malleable_manager.cpp | 2 +- tools/malleability.cpp | 29 +++++++++++-------- 6 files changed, 62 insertions(+), 31 deletions(-) diff --git a/include/common/common_defs.hpp b/include/common/common_defs.hpp index 430b3106b..9e5eda72b 100644 --- a/include/common/common_defs.hpp +++ b/include/common/common_defs.hpp @@ -29,10 +29,15 @@ #ifndef GEKKOFS_COMMON_DEFS_HPP #define GEKKOFS_COMMON_DEFS_HPP +namespace gkfs { +namespace client { +// This must be equivalent to the line set in the gkfs script +constexpr auto hostsfile_end_str = "#FS_INSTANCE_END"; + +} // namespace client +namespace rpc { // These constexpr set the RPC's identity and which handler the receiver end // should use -namespace gkfs::rpc { - using chnk_id_t = unsigned long; struct ChunkStat { unsigned long chunk_size; @@ -40,7 +45,6 @@ struct ChunkStat { unsigned long chunk_free; }; - namespace tag { constexpr auto fs_config = "rpc_srv_fs_config"; @@ -103,19 +107,19 @@ constexpr auto all_remote_protocols = {ofi_sockets, ofi_tcp, ofi_verbs, ucx_rc, ucx_ud}; #pragma GCC diagnostic pop } // namespace protocol -} // namespace gkfs::rpc +} // namespace rpc -namespace gkfs::malleable::rpc::tag { +namespace malleable::rpc::tag { constexpr auto expand_start = "rpc_srv_expand_start"; constexpr auto expand_status = "rpc_srv_expand_status"; constexpr auto expand_finalize = "rpc_srv_expand_finalize"; constexpr auto migrate_metadata = "rpc_srv_migrate_metadata"; constexpr auto migrate_data = "rpc_srv_migrate_data"; -} // namespace gkfs::malleable::rpc::tag +} // namespace malleable::rpc::tag -namespace gkfs::config::syscall::stat { +namespace config::syscall::stat { // Number 512-byte blocks allocated as it is in the linux kernel (struct_stat.h) constexpr auto st_nblocksize = 512; -} // namespace gkfs::config::syscall::stat - +} // namespace config::syscall::stat +} // namespace gkfs #endif // GEKKOFS_COMMON_DEFS_HPP diff --git a/scripts/run/gkfs b/scripts/run/gkfs index 07c6a4937..9f1482331 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -43,7 +43,8 @@ wait_for_gkfs_daemons() { exit 1 fi done - echo "# End of current FS instance" >> "${HOSTSFILE}" + # This must be equivalent to the line set in include/common/common_defs.hpp + echo "#FS_INSTANCE_END" >> "${HOSTSFILE}" } ####################################### # Creates a pid file for a given pid. If pid file exists, we check if its pids are still valid. 
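Note on the hosts-file convention these patches rely on: the gkfs run script now appends "#FS_INSTANCE_END" once all daemons of the current instance are up. A minimal sketch of a hosts file while an expansion is pending (hostnames and URIs are illustrative only, not taken from the patches; a third column with a proxy URI may follow when the proxy is enabled):

    node01 ofi+sockets://192.168.0.1:32768
    node02 ofi+sockets://192.168.0.2:32768
    #FS_INSTANCE_END
    node03 ofi+sockets://192.168.0.3:32768

Clients (src/client/preload_util.cpp, changed below) stop parsing at the marker and keep addressing the original servers, while the daemon-side MalleableManager::load_hostfile (patch 06) only skips comment lines and therefore also picks up the hosts registered after the marker for the expansion.
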
diff --git a/src/client/malleability.cpp b/src/client/malleability.cpp index 637050312..f53be3e9d 100644 --- a/src/client/malleability.cpp +++ b/src/client/malleability.cpp @@ -28,17 +28,38 @@ */ #include +#include #include #include +#include + +using namespace std; + namespace gkfs::malleable { int expand_start(int old_server_conf, int new_server_conf) { LOG(INFO, "{}() Expand operation enter", __func__); - gkfs::malleable::rpc::forward_expand_start(old_server_conf, - new_server_conf); - return 0; + // sanity checks + if(old_server_conf == new_server_conf) { + auto err_str = + "ERR: Old server configuration is the same as the new one"; + cerr << err_str << endl; + LOG(ERROR, "{}() {}", __func__, err_str); + return -1; + } + if(CTX->hosts().size() != static_cast(old_server_conf)) { + auto err_str = + "ERR: Old server configuration does not match the number of hosts in hostsfile"; + cerr << err_str << endl; + LOG(ERROR, "{}() {}", __func__, err_str); + return -1; + } + // TODO check that hostsfile contains endmarker + return gkfs::malleable::rpc::forward_expand_start(old_server_conf, + new_server_conf); + // return 0; } int diff --git a/src/client/preload_util.cpp b/src/client/preload_util.cpp index 247e44819..806db6639 100644 --- a/src/client/preload_util.cpp +++ b/src/client/preload_util.cpp @@ -162,14 +162,14 @@ load_hostfile(const std::string& path) { std::smatch match; while(getline(lf, line)) { // if line starts with #, it indicates the end of current FS instance - // It is therefore skipped - if(line[0] == '#') - continue; + // Further hosts are not part of the file system instance yet and are + // therefore skipped The hostfile is ordered, so nothgin below this line + // can contain valid hosts + if(line.find(gkfs::client::hostsfile_end_str) != string::npos) + break; if(!regex_match(line, match, line_re)) { - LOG(ERROR, "Unrecognized line format: [path: '{}', line: '{}']", path, line); - throw runtime_error( fmt::format("unrecognized line format: '{}'", line)); } diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp index e4c1a34e5..00a2aae56 100644 --- a/src/daemon/malleability/malleable_manager.cpp +++ b/src/daemon/malleability/malleable_manager.cpp @@ -227,7 +227,7 @@ MalleableManager::redistribute_metadata() { auto percent_interval = estimate_db_size / 1000; GKFS_DATA->spdlogger()->info( "{}() Starting metadata redistribution for '{}' estimated number of KV pairs...", - estimate_db_size, __func__); + __func__, estimate_db_size); int migration_err = 0; string key, value; auto iter = diff --git a/tools/malleability.cpp b/tools/malleability.cpp index 8f0474605..b3d4bdc90 100644 --- a/tools/malleability.cpp +++ b/tools/malleability.cpp @@ -45,16 +45,21 @@ main(int argc, const char* argv[]) { CLI::App desc{"Allowed options"}; cli_options opts{}; - auto res = gkfs_init(); - cout << "Init result " << res << endl; - - res = gkfs::malleable::expand_start(1, 2); - cout << "Expand start " << res << endl; - res = gkfs::malleable::expand_status(); - cout << "Expand status " << res << endl; - res = gkfs::malleable::expand_finalize(); - cout << "Expand finalize " << res << endl; - - res = gkfs_end(); - cout << "End result " << res << endl; + auto err = gkfs_init(); + cout << "Init result " << err << endl; + + err = gkfs::malleable::expand_start(1, 1); + if(err) { + cout << "Expand start failed. Exiting..." 
<< endl; + gkfs_end(); + return -1; + } + cout << "Expand start " << err << endl; + err = gkfs::malleable::expand_status(); + cout << "Expand status " << err << endl; + err = gkfs::malleable::expand_finalize(); + cout << "Expand finalize " << err << endl; + + err = gkfs_end(); + cout << "End result " << err << endl; } \ No newline at end of file -- GitLab From 91679852d7f42360dc0e9b322426788578c161ed Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 24 May 2024 13:42:04 +0200 Subject: [PATCH 09/17] Adding metadata redistribution receiving function --- src/daemon/handler/srv_malleability.cpp | 58 ++++++++++++++----------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/src/daemon/handler/srv_malleability.cpp b/src/daemon/handler/srv_malleability.cpp index 73ccae06c..4bdfea5cf 100644 --- a/src/daemon/handler/srv_malleability.cpp +++ b/src/daemon/handler/srv_malleability.cpp @@ -28,7 +28,9 @@ */ #include #include +#include #include +#include #include @@ -46,10 +48,11 @@ rpc_srv_expand_start(hg_handle_t handle) { rpc_err_out_t out; auto ret = margo_get_input(handle, &in); - if(ret != HG_SUCCESS) + if(ret != HG_SUCCESS) { GKFS_DATA->spdlogger()->error( "{}() Failed to retrieve input from handle", __func__); - assert(ret == HG_SUCCESS); + return gkfs::rpc::cleanup_respond(&handle, &in, &out); + } GKFS_DATA->spdlogger()->debug( "{}() Got RPC with old conf '{}' new conf '{}'", __func__, in.old_server_conf, in.new_server_conf); @@ -68,14 +71,7 @@ rpc_srv_expand_start(hg_handle_t handle) { GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, out.err); - auto hret = margo_respond(handle, &out); - if(hret != HG_SUCCESS) { - GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); - } - // Destroy handle when finished - margo_free_input(handle, &in); - margo_destroy(handle); - return HG_SUCCESS; + return gkfs::rpc::cleanup_respond(&handle, &in, &out); } hg_return_t @@ -93,13 +89,7 @@ rpc_srv_expand_status(hg_handle_t handle) { } GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, out.err); - auto hret = margo_respond(handle, &out); - if(hret != HG_SUCCESS) { - GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); - } - // Destroy handle when finished - margo_destroy(handle); - return HG_SUCCESS; + return gkfs::rpc::cleanup_respond(&handle, &out); } hg_return_t @@ -117,19 +107,35 @@ rpc_srv_expand_finalize(hg_handle_t handle) { GKFS_DATA->spdlogger()->debug("{}() Sending output err '{}'", __func__, out.err); - auto hret = margo_respond(handle, &out); - if(hret != HG_SUCCESS) { - GKFS_DATA->spdlogger()->error("{}() Failed to respond", __func__); - } - - // Destroy handle when finished - margo_destroy(handle); - return HG_SUCCESS; + return gkfs::rpc::cleanup_respond(&handle, &out); } hg_return_t rpc_srv_migrate_metadata(hg_handle_t handle) { - return HG_SUCCESS; + rpc_migrate_metadata_in_t in{}; + rpc_err_out_t out{}; + + auto ret = margo_get_input(handle, &in); + if(ret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to retrieve input from handle", __func__); + return gkfs::rpc::cleanup_respond(&handle, &in, &out); + } + GKFS_DATA->spdlogger()->debug("{}() Got RPC with key '{}' value '{}'", + __func__, in.key, in.value); + try { + // create metadentry + GKFS_DATA->mdb()->put(in.key, in.value); + out.err = 0; + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error("{}() Failed to create KV entry: '{}'", + __func__, e.what()); + out.err = -1; + } + + GKFS_DATA->spdlogger()->debug("{}() Sending output 
err '{}'", __func__, + out.err); + return gkfs::rpc::cleanup_respond(&handle, &in, &out); } hg_return_t -- GitLab From bbedfa371e8181145a1a8f240412c35ed46abe87 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Wed, 3 Jul 2024 18:00:25 +0200 Subject: [PATCH 10/17] Working metadata redistribution on demand --- include/common/rpc/distributor.hpp | 15 +++ src/common/rpc/distributor.cpp | 20 ++++ src/daemon/daemon.cpp | 38 ++++--- src/daemon/malleability/malleable_manager.cpp | 9 +- .../rpc/forward_redistribution.cpp | 3 + src/daemon/util.cpp | 102 ++++++++++++++++-- tools/malleability.cpp | 2 +- 7 files changed, 159 insertions(+), 30 deletions(-) diff --git a/include/common/rpc/distributor.hpp b/include/common/rpc/distributor.hpp index 53fed0200..eb8bca77f 100644 --- a/include/common/rpc/distributor.hpp +++ b/include/common/rpc/distributor.hpp @@ -56,6 +56,9 @@ public: virtual unsigned int hosts_size() const = 0; + virtual void + hosts_size(unsigned int size) = 0; + virtual host_t locate_data(const std::string& path, const chunkid_t& chnk_id, unsigned int hosts_size, const int num_copy) = 0; @@ -83,6 +86,9 @@ public: unsigned int hosts_size() const override; + void + hosts_size(unsigned int size) override; + host_t localhost() const override; @@ -116,6 +122,9 @@ public: unsigned int hosts_size() const override; + void + hosts_size(unsigned int size) override; + host_t locate_data(const std::string& path, const chunkid_t& chnk_id, const int num_copy) const override; @@ -144,6 +153,9 @@ public: unsigned int hosts_size() const override; + void + hosts_size(unsigned int size) override; + host_t locate_data(const std::string& path, const chunkid_t& chnk_id, const int num_copy) const override final; @@ -197,6 +209,9 @@ public: unsigned int hosts_size() const override; + void + hosts_size(unsigned int size) override; + host_t locate_data(const std::string& path, const chunkid_t& chnk_id, const int num_copy) const override; diff --git a/src/common/rpc/distributor.cpp b/src/common/rpc/distributor.cpp index b17a09862..9fd4d90b5 100644 --- a/src/common/rpc/distributor.cpp +++ b/src/common/rpc/distributor.cpp @@ -52,6 +52,11 @@ SimpleHashDistributor::hosts_size() const { return hosts_size_; } +void +SimpleHashDistributor::hosts_size(unsigned int size) { + hosts_size_ = size; +} + host_t SimpleHashDistributor::locate_data(const string& path, const chunkid_t& chnk_id, const int num_copy) const { @@ -95,6 +100,11 @@ LocalOnlyDistributor::hosts_size() const { return hosts_size_; } +void +LocalOnlyDistributor::hosts_size(unsigned int size) { + hosts_size_ = size; +} + host_t LocalOnlyDistributor::locate_data(const string& path, const chunkid_t& chnk_id, const int num_copy) const { @@ -128,6 +138,11 @@ ForwarderDistributor::hosts_size() const { return hosts_size_; } +void +ForwarderDistributor::hosts_size(unsigned int size) { + hosts_size_ = size; +} + host_t ForwarderDistributor::locate_data(const std::string& path, const chunkid_t& chnk_id, @@ -239,6 +254,11 @@ GuidedDistributor::hosts_size() const { return hosts_size_; } +void +GuidedDistributor::hosts_size(unsigned int size) { + hosts_size_ = size; +} + host_t GuidedDistributor::locate_data(const string& path, const chunkid_t& chnk_id, unsigned int hosts_size, const int num_copy) { diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index a56ca28f1..8f298073d 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -278,9 +278,12 @@ init_rpc_client() { if(gkfs::rpc::protocol::ofi_psm2 == GKFS_DATA->rpc_protocol()) 
hg_options.na_init_info.progress_mode = NA_NO_BLOCK; // Start Margo (this will also initialize Argobots and Mercury internally) - auto margo_config = "{}"; + auto margo_config = fmt::format( + R"({{ "use_progress_thread" : true, "rpc_thread_count" : {} }})", + 0); + // auto margo_config = "{}"; struct margo_init_info args = {nullptr}; - args.json_config = margo_config; + args.json_config = margo_config.c_str(); args.hg_init_info = &hg_options; auto* mid = margo_init_ext(GKFS_DATA->bind_addr().c_str(), MARGO_CLIENT_MODE, &args); @@ -465,20 +468,6 @@ init_environment() { // init margo for proxy RPC if(!GKFS_DATA->bind_proxy_addr().empty()) { - GKFS_DATA->spdlogger()->debug("{}() Initializing Distributor ... ", - __func__); - try { - auto distributor = - std::make_shared(); - RPC_DATA->distributor(distributor); - } catch(const std::exception& e) { - GKFS_DATA->spdlogger()->error( - "{}() Failed to initialize Distributor: {}", __func__, - e.what()); - throw; - } - GKFS_DATA->spdlogger()->debug("{}() Distributed running.", __func__); - GKFS_DATA->spdlogger()->debug( "{}() Initializing proxy RPC server: '{}'", __func__, GKFS_DATA->bind_proxy_addr()); @@ -538,12 +527,28 @@ init_environment() { throw; } GKFS_DATA->spdlogger()->debug("{}() RPC client running.", __func__); + + // Needed for client + GKFS_DATA->spdlogger()->debug("{}() Initializing Distributor ... ", + __func__); + try { + auto distributor = std::make_shared(); + RPC_DATA->distributor(distributor); + } catch(const std::exception& e) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to initialize Distributor: {}", __func__, + e.what()); + throw; + } + GKFS_DATA->spdlogger()->debug("{}() Distributed running.", __func__); + GKFS_DATA->spdlogger()->debug("{}() Initializing MalleableManager...", __func__); try { auto malleable_manager = std::make_shared(); GKFS_DATA->malleable_manager(malleable_manager); + } catch(const std::exception& e) { GKFS_DATA->spdlogger()->error( "{}() Failed to initialize MalleableManager: {}", __func__, @@ -552,7 +557,6 @@ init_environment() { } GKFS_DATA->spdlogger()->debug("{}() MalleableManager running.", __func__); - GKFS_DATA->spdlogger()->info("Startup successful. Daemon is ready."); } diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp index 00a2aae56..7187538a8 100644 --- a/src/daemon/malleability/malleable_manager.cpp +++ b/src/daemon/malleability/malleable_manager.cpp @@ -208,7 +208,10 @@ MalleableManager::expand_start(int old_server_conf, int new_server_conf) { __func__, hosts.size(), new_server_conf)); } connect_to_hosts(hosts); - + RPC_DATA->distributor()->hosts_size(hosts.size()); + GKFS_DATA->spdlogger()->info( + "{}() Total number of hosts after expansion: {}", __func__, + RPC_DATA->distributor()->hosts_size()); auto abt_err = ABT_thread_create(RPC_DATA->io_pool(), expand_abt, ABT_THREAD_ATTR_NULL, nullptr, &redist_thread_); @@ -239,7 +242,11 @@ MalleableManager::redistribute_metadata() { continue; } auto dest_id = RPC_DATA->distributor()->locate_file_metadata(key, 0); + GKFS_DATA->spdlogger()->info( + "{}() Migration: key {} and value {}. 
From host {} to host {}", + __func__, key, value, RPC_DATA->local_host_id(), dest_id); if(dest_id == RPC_DATA->local_host_id()) { + GKFS_DATA->spdlogger()->info("{}() SKIPPERS", __func__); continue; } auto err = gkfs::malleable::rpc::forward_metadata(key, value, dest_id); diff --git a/src/daemon/malleability/rpc/forward_redistribution.cpp b/src/daemon/malleability/rpc/forward_redistribution.cpp index 7b1a4e71c..50921778f 100644 --- a/src/daemon/malleability/rpc/forward_redistribution.cpp +++ b/src/daemon/malleability/rpc/forward_redistribution.cpp @@ -38,6 +38,9 @@ forward_metadata(std::string& key, std::string& value, unsigned int dest_id) { rpc_migrate_metadata_in_t in{}; rpc_err_out_t out{}; int err; + // set input + in.key = key.c_str(); + in.value = value.c_str(); // Create handle GKFS_DATA->spdlogger()->debug("{}() Creating Margo handle ...", __func__); auto endp = RPC_DATA->rpc_endpoints().at(dest_id); diff --git a/src/daemon/util.cpp b/src/daemon/util.cpp index 7dc39cd8e..e49c0aa67 100644 --- a/src/daemon/util.cpp +++ b/src/daemon/util.cpp @@ -31,6 +31,13 @@ #include +#include // Added for file existence check +#include // Added for sleep (if needed) +#include +#include +#include +#include + using namespace std; namespace gkfs::utils { @@ -47,18 +54,47 @@ namespace gkfs::utils { * access is simultaneous. * @endinternal */ +// void +// populate_hosts_file() { +// const auto& hosts_file = GKFS_DATA->hosts_file(); +// const auto& daemon_addr = RPC_DATA->self_addr_str(); +// const auto& proxy_addr = RPC_DATA->self_proxy_addr_str(); +// GKFS_DATA->spdlogger()->debug("{}() Populating hosts file: '{}'", +// __func__, +// hosts_file); +// ofstream lfstream(hosts_file, ios::out | ios::app); +// if(!lfstream) { +// throw runtime_error(fmt::format("Failed to open hosts file '{}': {}", +// hosts_file, strerror(errno))); +// } +// // if rootdir_suffix is used, append it to hostname +// auto hostname = +// GKFS_DATA->rootdir_suffix().empty() +// ? 
gkfs::rpc::get_my_hostname(true) +// : fmt::format("{}#{}", gkfs::rpc::get_my_hostname(true), +// GKFS_DATA->rootdir_suffix()); +// auto line_out = fmt::format("{} {}", hostname, daemon_addr); +// if(!proxy_addr.empty()) +// line_out = fmt::format("{} {}", line_out, proxy_addr); +// lfstream << line_out << std::endl; +// +// if(!lfstream) { +// throw runtime_error( +// fmt::format("Failed to write on hosts file '{}': {}", +// hosts_file, strerror(errno))); +// } +// lfstream.close(); +// } + + void populate_hosts_file() { const auto& hosts_file = GKFS_DATA->hosts_file(); const auto& daemon_addr = RPC_DATA->self_addr_str(); const auto& proxy_addr = RPC_DATA->self_proxy_addr_str(); + GKFS_DATA->spdlogger()->debug("{}() Populating hosts file: '{}'", __func__, hosts_file); - ofstream lfstream(hosts_file, ios::out | ios::app); - if(!lfstream) { - throw runtime_error(fmt::format("Failed to open hosts file '{}': {}", - hosts_file, strerror(errno))); - } // if rootdir_suffix is used, append it to hostname auto hostname = GKFS_DATA->rootdir_suffix().empty() @@ -68,16 +104,60 @@ populate_hosts_file() { auto line_out = fmt::format("{} {}", hostname, daemon_addr); if(!proxy_addr.empty()) line_out = fmt::format("{} {}", line_out, proxy_addr); - lfstream << line_out << std::endl; + // Constants for retry mechanism + const int MAX_RETRIES = 5; // Maximum number of retry attempts + const std::chrono::milliseconds RETRY_DELAY( + 3); // Delay between retries (in milliseconds) - if(!lfstream) { - throw runtime_error( - fmt::format("Failed to write on hosts file '{}': {}", - hosts_file, strerror(errno))); + for(int attempt = 1; attempt <= MAX_RETRIES; attempt++) { + { // New scope to close the file after each write attempt + std::ofstream lfstream(hosts_file, std::ios::out | std::ios::app); + if(!lfstream) { + throw std::runtime_error( + fmt::format("Failed to open hosts file '{}': {}", + hosts_file, strerror(errno))); + } + lfstream << line_out << std::endl; + if(!lfstream) { + throw runtime_error( + fmt::format("Failed to write on hosts file '{}': {}", + hosts_file, strerror(errno))); + } + lfstream.close(); + } // lfstream closed here + + // Check if the line is in the file + std::ifstream checkstream(hosts_file); + std::string line; + bool lineFound = false; + while(std::getline(checkstream, line)) { + if(line == line_out) { + lineFound = true; + break; + } + } + checkstream.close(); + + if(lineFound) { + GKFS_DATA->spdlogger()->debug( + "{}() Host successfully written and to hosts file", + __func__); + return; // Success, exit the function + } else { + GKFS_DATA->spdlogger()->warn( + "{}() Host not found after attempt {}, retrying...", + __func__, attempt); + std::this_thread::sleep_for(RETRY_DELAY); // Wait before retrying + } } - lfstream.close(); + + // Failed after all retries + throw std::runtime_error(fmt::format( + "Failed to write line to hosts file '{}' after {} retries", + hosts_file, MAX_RETRIES)); } + /** * @internal * This function removes the entire hosts file even if just one daemon is diff --git a/tools/malleability.cpp b/tools/malleability.cpp index b3d4bdc90..71b582d41 100644 --- a/tools/malleability.cpp +++ b/tools/malleability.cpp @@ -48,7 +48,7 @@ main(int argc, const char* argv[]) { auto err = gkfs_init(); cout << "Init result " << err << endl; - err = gkfs::malleable::expand_start(1, 1); + err = gkfs::malleable::expand_start(1, 2); if(err) { cout << "Expand start failed. Exiting..." 
<< endl; gkfs_end(); -- GitLab From 37f2d7f382bcd8eeb620554f017fa2e7833aa50b Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Wed, 3 Jul 2024 19:27:37 +0200 Subject: [PATCH 11/17] Improving gkfs_malleability --- src/daemon/malleability/malleable_manager.cpp | 8 +- tools/malleability.cpp | 108 +++++++++++++++--- 2 files changed, 91 insertions(+), 25 deletions(-) diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp index 7187538a8..5ab0ea676 100644 --- a/src/daemon/malleability/malleable_manager.cpp +++ b/src/daemon/malleability/malleable_manager.cpp @@ -106,9 +106,8 @@ MalleableManager::read_hosts_file() { if(hosts.empty()) { throw runtime_error(fmt::format("Hostfile empty: '{}'", hostfile)); } - GKFS_DATA->spdlogger()->info( - "{}() Number of hosts for current instance '{}'", __func__, - hosts.size()); + GKFS_DATA->spdlogger()->info("{}() Number of hosts after expansion '{}'", + __func__, hosts.size()); return hosts; } @@ -209,9 +208,6 @@ MalleableManager::expand_start(int old_server_conf, int new_server_conf) { } connect_to_hosts(hosts); RPC_DATA->distributor()->hosts_size(hosts.size()); - GKFS_DATA->spdlogger()->info( - "{}() Total number of hosts after expansion: {}", __func__, - RPC_DATA->distributor()->hosts_size()); auto abt_err = ABT_thread_create(RPC_DATA->io_pool(), expand_abt, ABT_THREAD_ATTR_NULL, nullptr, &redist_thread_); diff --git a/tools/malleability.cpp b/tools/malleability.cpp index 71b582d41..5380f00ad 100644 --- a/tools/malleability.cpp +++ b/tools/malleability.cpp @@ -26,9 +26,10 @@ SPDX-License-Identifier: GPL-3.0-or-later */ +#include #include -// #include #include +#include #include #include @@ -37,29 +38,98 @@ using namespace std; struct cli_options { - string hosts_file; + bool verbose = false; + string action; + string subcommand; }; +std::pair +get_expansion_host_num() { + // get hosts file and read how much should be expanded + auto hosts_file_path = std::getenv("LIBGKFS_HOSTS_FILE"); + if(!hosts_file_path) { + std::cerr + << "Error: LIBGKFS_HOSTS_FILE environment variable not set.\n"; + return {-1, -1}; + } + std::ifstream file(hosts_file_path); + if(!file) { + std::cerr << "Error: Unable to open file at " << hosts_file_path + << ".\n"; + return {-1, -1}; // Indicate an error + } + auto initialHostCount = 0; + auto finalHostCount = 0; + auto foundSeparator = false; + std::string line; + + while(std::getline(file, line)) { + if(line == "#FS_INSTANCE_END") { + if(foundSeparator) { + cerr << "marker was found twice. this is not allowed.\n"; + return {-1, -1}; + } + foundSeparator = true; + initialHostCount = finalHostCount; + continue; + } + if(!line.empty()) { + finalHostCount++; + } + } + if(!foundSeparator) { + initialHostCount = finalHostCount; + } + return {initialHostCount, finalHostCount}; +} + int main(int argc, const char* argv[]) { CLI::App desc{"Allowed options"}; - cli_options opts{}; - - auto err = gkfs_init(); - cout << "Init result " << err << endl; + cli_options opts; + + // Global verbose flag + desc.add_flag("--verbose,-v", opts.verbose, "Verbose output"); + + auto expand_args = + desc.add_subcommand("expand", "Expansion-related actions"); + expand_args->add_option("action", opts.action, "Action to perform") + ->required() + ->check(CLI::IsMember({"start", "status", "finalize"})); + try { + desc.parse(argc, argv); + } catch(const CLI::ParseError& e) { + return desc.exit(e); + } - err = gkfs::malleable::expand_start(1, 2); - if(err) { - cout << "Expand start failed. Exiting..." 
<< endl; - gkfs_end(); - return -1; + if(opts.verbose) { // Check the verbose flag from the main options + std::cout << "Verbose mode is on." << std::endl; + } + int err; + gkfs_init(); + + if(opts.action == "start") { + auto [current_instance, expanded_instance] = get_expansion_host_num(); + if(current_instance == -1 || expanded_instance == -1) { + return 1; + } + err = gkfs::malleable::expand_start(current_instance, + expanded_instance); + if(err) { + cout << "Expand start failed. Exiting...\n"; + gkfs_end(); + return -1; + } + cout << "Expand start " << err << "\n"; + } else if(opts.action == "status") { + if(gkfs::malleable::expand_status() > 0) { + cout << "Expansion in progress...\n"; + } else { + cout << "No expansion running.\n"; + } + } else if(opts.action == "finalize") { + err = gkfs::malleable::expand_finalize(); + cout << "Expand finalize " << err << endl; } - cout << "Expand start " << err << endl; - err = gkfs::malleable::expand_status(); - cout << "Expand status " << err << endl; - err = gkfs::malleable::expand_finalize(); - cout << "Expand finalize " << err << endl; - - err = gkfs_end(); - cout << "End result " << err << endl; + gkfs_end(); } \ No newline at end of file -- GitLab From d425dc6dbbe89d1a068052b49364450cbcc595f5 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 5 Jul 2024 18:03:13 +0200 Subject: [PATCH 12/17] Added data redistribution --- include/client/rpc/rpc_types.hpp | 57 ++---- include/common/common_defs.hpp | 2 +- include/common/rpc/rpc_types.hpp | 4 - include/daemon/backend/data/chunk_storage.hpp | 7 + .../rpc/forward_redistribution.hpp | 5 +- src/daemon/backend/data/chunk_storage.cpp | 15 ++ src/daemon/daemon.cpp | 9 +- src/daemon/malleability/malleable_manager.cpp | 176 ++++++++---------- .../rpc/forward_redistribution.cpp | 77 +++++++- tools/malleability.cpp | 4 +- 10 files changed, 208 insertions(+), 148 deletions(-) diff --git a/include/client/rpc/rpc_types.hpp b/include/client/rpc/rpc_types.hpp index b84c7c179..3bf7ca3db 100644 --- a/include/client/rpc/rpc_types.hpp +++ b/include/client/rpc/rpc_types.hpp @@ -127,8 +127,7 @@ struct fs_config { explicit input(const hermes::detail::hg_void_t& other) {} - explicit - operator hermes::detail::hg_void_t() { + explicit operator hermes::detail::hg_void_t() { return {}; } }; @@ -314,8 +313,7 @@ struct create { explicit input(const rpc_mk_node_in_t& other) : m_path(other.path), m_mode(other.mode) {} - explicit - operator rpc_mk_node_in_t() { + explicit operator rpc_mk_node_in_t() { return {m_path.c_str(), m_mode}; } @@ -424,8 +422,7 @@ struct stat { explicit input(const rpc_path_only_in_t& other) : m_path(other.path) {} - explicit - operator rpc_path_only_in_t() { + explicit operator rpc_path_only_in_t() { return {m_path.c_str()}; } @@ -551,8 +548,7 @@ struct remove_metadata { explicit input(const rpc_rm_node_in_t& other) : m_path(other.path), m_rm_dir(other.rm_dir) {} - explicit - operator rpc_rm_node_in_t() { + explicit operator rpc_rm_node_in_t() { return {m_path.c_str(), m_rm_dir}; } @@ -684,8 +680,7 @@ struct decr_size { explicit input(const rpc_trunc_in_t& other) : m_path(other.path), m_length(other.length) {} - explicit - operator rpc_trunc_in_t() { + explicit operator rpc_trunc_in_t() { return {m_path.c_str(), m_length}; } @@ -892,8 +887,7 @@ struct update_metadentry { m_atime_flag(other.atime_flag), m_mtime_flag(other.mtime_flag), m_ctime_flag(other.ctime_flag) {} - explicit - operator rpc_update_metadentry_in_t() { + explicit operator rpc_update_metadentry_in_t() { return {m_path.c_str(), m_nlink, 
m_mode, m_uid, m_gid, m_size, m_blocks, m_atime, m_mtime, m_ctime, m_nlink_flag, m_mode_flag, @@ -1021,8 +1015,7 @@ struct get_metadentry_size { explicit input(const rpc_path_only_in_t& other) : m_path(other.path) {} - explicit - operator rpc_path_only_in_t() { + explicit operator rpc_path_only_in_t() { return {m_path.c_str()}; } @@ -1157,8 +1150,7 @@ struct update_metadentry_size { : m_path(other.path), m_size(other.size), m_offset(other.offset), m_append(other.append) {} - explicit - operator rpc_update_metadentry_size_in_t() { + explicit operator rpc_update_metadentry_size_in_t() { return {m_path.c_str(), m_size, m_offset, m_append}; } @@ -1286,8 +1278,7 @@ struct mk_symlink { explicit input(const rpc_mk_symlink_in_t& other) : m_path(other.path), m_target_path(other.target_path) {} - explicit - operator rpc_mk_symlink_in_t() { + explicit operator rpc_mk_symlink_in_t() { return {m_path.c_str(), m_target_path.c_str()}; } @@ -1398,8 +1389,7 @@ struct remove_data { explicit input(const rpc_rm_node_in_t& other) : m_path(other.path) {} - explicit - operator rpc_rm_node_in_t() { + explicit operator rpc_rm_node_in_t() { return {m_path.c_str()}; } @@ -1565,8 +1555,7 @@ struct write_data { m_total_chunk_size(other.total_chunk_size), m_buffers(other.bulk_handle) {} - explicit - operator rpc_write_data_in_t() { + explicit operator rpc_write_data_in_t() { return {m_path.c_str(), m_offset, m_host_id, m_host_size, m_wbitset.c_str(), m_chunk_n, m_chunk_start, m_chunk_end, m_total_chunk_size, @@ -1751,8 +1740,7 @@ struct read_data { m_total_chunk_size(other.total_chunk_size), m_buffers(other.bulk_handle) {} - explicit - operator rpc_read_data_in_t() { + explicit operator rpc_read_data_in_t() { return {m_path.c_str(), m_offset, m_host_id, m_host_size, m_wbitset.c_str(), m_chunk_n, m_chunk_start, m_chunk_end, m_total_chunk_size, @@ -1886,8 +1874,7 @@ struct trunc_data { explicit input(const rpc_trunc_in_t& other) : m_path(other.path), m_length(other.length) {} - explicit - operator rpc_trunc_in_t() { + explicit operator rpc_trunc_in_t() { return { m_path.c_str(), m_length, @@ -2006,8 +1993,7 @@ struct get_dirents { explicit input(const rpc_get_dirents_in_t& other) : m_path(other.path), m_buffers(other.bulk_handle) {} - explicit - operator rpc_get_dirents_in_t() { + explicit operator rpc_get_dirents_in_t() { return {m_path.c_str(), hg_bulk_t(m_buffers)}; } @@ -2132,8 +2118,7 @@ struct get_dirents_extended { explicit input(const rpc_get_dirents_in_t& other) : m_path(other.path), m_buffers(other.bulk_handle) {} - explicit - operator rpc_get_dirents_in_t() { + explicit operator rpc_get_dirents_in_t() { return {m_path.c_str(), hg_bulk_t(m_buffers)}; } @@ -2252,8 +2237,7 @@ struct chunk_stat { explicit input(const rpc_chunk_stat_in_t& other) : m_dummy(other.dummy) {} - explicit - operator rpc_chunk_stat_in_t() { + explicit operator rpc_chunk_stat_in_t() { return {m_dummy}; } @@ -3788,8 +3772,7 @@ struct expand_start { : m_old_server_conf(other.old_server_conf), m_new_server_conf(other.new_server_conf) {} - explicit - operator rpc_expand_start_in_t() { + explicit operator rpc_expand_start_in_t() { return {m_old_server_conf, m_new_server_conf}; } @@ -3893,8 +3876,7 @@ struct expand_status { explicit input(const hermes::detail::hg_void_t& other) {} - explicit - operator hermes::detail::hg_void_t() { + explicit operator hermes::detail::hg_void_t() { return {}; } }; @@ -3995,8 +3977,7 @@ struct expand_finalize { explicit input(const hermes::detail::hg_void_t& other) {} - explicit - operator 
hermes::detail::hg_void_t() { + explicit operator hermes::detail::hg_void_t() { return {}; } }; diff --git a/include/common/common_defs.hpp b/include/common/common_defs.hpp index 9e5eda72b..58a67f17c 100644 --- a/include/common/common_defs.hpp +++ b/include/common/common_defs.hpp @@ -113,8 +113,8 @@ namespace malleable::rpc::tag { constexpr auto expand_start = "rpc_srv_expand_start"; constexpr auto expand_status = "rpc_srv_expand_status"; constexpr auto expand_finalize = "rpc_srv_expand_finalize"; +// migrate data uses the write rpc constexpr auto migrate_metadata = "rpc_srv_migrate_metadata"; -constexpr auto migrate_data = "rpc_srv_migrate_data"; } // namespace malleable::rpc::tag namespace config::syscall::stat { diff --git a/include/common/rpc/rpc_types.hpp b/include/common/rpc/rpc_types.hpp index a7c937a43..d86015be5 100644 --- a/include/common/rpc/rpc_types.hpp +++ b/include/common/rpc/rpc_types.hpp @@ -174,8 +174,4 @@ MERCURY_GEN_PROC(rpc_expand_start_in_t, MERCURY_GEN_PROC(rpc_migrate_metadata_in_t, ((hg_const_string_t) (key))((hg_const_string_t) (value))) -MERCURY_GEN_PROC(rpc_migrate_data_in_t, - ((hg_const_string_t) (path))((hg_uint64_t) (chunk_id))( - (hg_bulk_t) (bulk_handle))) - #endif // LFS_RPC_TYPES_HPP diff --git a/include/daemon/backend/data/chunk_storage.hpp b/include/daemon/backend/data/chunk_storage.hpp index 14a4cf31e..ce97c3a42 100644 --- a/include/daemon/backend/data/chunk_storage.hpp +++ b/include/daemon/backend/data/chunk_storage.hpp @@ -39,6 +39,7 @@ #include #include #include +#include /* Forward declarations */ namespace spdlog { @@ -186,6 +187,12 @@ public: */ [[nodiscard]] ChunkStat chunk_stat() const; + + std::filesystem::recursive_directory_iterator + get_all_chunk_files(); + + std::string + get_chunk_directory(); }; } // namespace gkfs::data diff --git a/include/daemon/malleability/rpc/forward_redistribution.hpp b/include/daemon/malleability/rpc/forward_redistribution.hpp index b42fa968f..b450a6525 100644 --- a/include/daemon/malleability/rpc/forward_redistribution.hpp +++ b/include/daemon/malleability/rpc/forward_redistribution.hpp @@ -37,8 +37,9 @@ namespace gkfs::malleable::rpc { int forward_metadata(std::string& key, std::string& value, unsigned int dest_id); -void -forward_data(); +int +forward_data(const std::string& path, void* buf, const size_t count, + const uint64_t chnk_id, const uint64_t dest_id); } // namespace gkfs::malleable::rpc diff --git a/src/daemon/backend/data/chunk_storage.cpp b/src/daemon/backend/data/chunk_storage.cpp index 467bde8fd..dac7830c0 100644 --- a/src/daemon/backend/data/chunk_storage.cpp +++ b/src/daemon/backend/data/chunk_storage.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -340,4 +341,18 @@ ChunkStorage::chunk_stat() const { return {chunksize_, bytes_total / chunksize_, bytes_free / chunksize_}; } +fs::recursive_directory_iterator +ChunkStorage::get_all_chunk_files() { + auto chunk_dir = fs::path(root_path_); + if(!fs::exists(chunk_dir)) { + throw ChunkStorageException(ENOENT, "Chunk directory does not exist"); + } + return fs::recursive_directory_iterator(chunk_dir); +} + +std::string +ChunkStorage::get_chunk_directory() { + return root_path_; +} + } // namespace gkfs::data \ No newline at end of file diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 8f298073d..9e7075e49 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -187,8 +187,6 @@ register_server_rpcs(margo_instance_id mid) { MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_metadata, 
rpc_migrate_metadata_in_t, rpc_err_out_t, rpc_srv_migrate_metadata); - MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_data, - rpc_migrate_data_in_t, rpc_err_out_t, rpc_srv_migrate_data); } /** @@ -261,9 +259,10 @@ register_client_rpcs(margo_instance_id mid) { RPC_DATA->rpc_client_ids().migrate_metadata_id = MARGO_REGISTER(mid, gkfs::malleable::rpc::tag::migrate_metadata, rpc_migrate_metadata_in_t, rpc_err_out_t, NULL); - // RPC_DATA->rpc_client_ids().migrate_data_id = MARGO_REGISTER( - // mid, gkfs::malleable::rpc::tag::migrate_metadata, - // rpc_migrate_metadata_in_t, rpc_err_out_t, NULL); + // this is just a write + RPC_DATA->rpc_client_ids().migrate_data_id = + MARGO_REGISTER(mid, gkfs::rpc::tag::write, rpc_write_data_in_t, + rpc_data_out_t, NULL); } /** diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp index 5ab0ea676..3018d3816 100644 --- a/src/daemon/malleability/malleable_manager.cpp +++ b/src/daemon/malleability/malleable_manager.cpp @@ -28,16 +28,26 @@ #include #include - #include +#include #include +#include +#include #include #include #include +extern "C" { +#include +#include +#include +#include +} + using namespace std; +namespace fs = std::filesystem; namespace gkfs::malleable { @@ -190,7 +200,12 @@ MalleableManager::expand_abt(void* _arg) { __func__); GKFS_DATA->redist_running(true); GKFS_DATA->malleable_manager()->redistribute_metadata(); - GKFS_DATA->malleable_manager()->redistribute_data(); + try { + GKFS_DATA->malleable_manager()->redistribute_data(); + } catch(const gkfs::data::ChunkStorageException& e) { + GKFS_DATA->spdlogger()->error("{}() Failed to redistribute data: '{}'", + __func__, e.what()); + } GKFS_DATA->redist_running(false); GKFS_DATA->spdlogger()->info( "{}() Expansion process successfully finished.", __func__); @@ -269,101 +284,72 @@ void MalleableManager::redistribute_data() { GKFS_DATA->spdlogger()->info("{}() Starting data redistribution...", __func__); + + auto chunk_dir = fs::path(GKFS_DATA->storage()->get_chunk_directory()); + auto dir_iterator = GKFS_DATA->storage()->get_all_chunk_files(); + + for(const auto& entry : dir_iterator) { + if(!entry.is_regular_file()) { + continue; + } + // path under chunkdir as placed in the rootdir + auto rel_chunk_dir = fs::relative(entry, chunk_dir); + // chunk id from this entry used for determining destination + uint64_t chunk_id = stoul(rel_chunk_dir.filename().string()); + // mountdir gekkofs path used for determining destination + auto gkfs_path = rel_chunk_dir.parent_path().string(); + ::replace(gkfs_path.begin(), gkfs_path.end(), ':', '/'); + gkfs_path = "/" + gkfs_path; + auto dest_id = + RPC_DATA->distributor()->locate_data(gkfs_path, chunk_id, 0); + GKFS_DATA->spdlogger()->trace( + "{}() Migrating chunkfile: {} for gkfs file {} chnkid {} destid {}", + __func__, rel_chunk_dir.string(), gkfs_path, chunk_id, dest_id); + if(dest_id == RPC_DATA->local_host_id()) { + GKFS_DATA->spdlogger()->trace("{}() SKIPPERS", __func__); + continue; + } + auto fd = open(entry.path().c_str(), O_RDONLY); + if(fd < 0) { + GKFS_DATA->spdlogger()->error("{}() Failed to open chunkfile: {}", + __func__, entry.path().c_str()); + continue; + } + auto buf = new char[entry.file_size()]; + auto bytes_read = read(fd, buf, entry.file_size()); + if(bytes_read < 0) { + GKFS_DATA->spdlogger()->error("{}() Failed to read chunkfile: {}", + __func__, entry.path().c_str()); + continue; + } + auto err = gkfs::malleable::rpc::forward_data( + gkfs_path, buf, bytes_read, chunk_id, 
dest_id); + if(err != 0) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to migrate data for chunkfile: {}", __func__, + entry.path().c_str()); + } + close(fd); + GKFS_DATA->spdlogger()->trace( + "{}() Data migration completed for chunkfile: {}. Removing ...", + __func__, entry.path().c_str()); + // remove file after migration + auto entry_dir = entry.path().parent_path(); + try { + fs::remove(entry); + if(fs::is_empty(entry_dir)) { + fs::remove(entry_dir); + } + } catch(const fs::filesystem_error& e) { + GKFS_DATA->spdlogger()->error("{}() Failed to remove chunkfile: {}", + __func__, entry.path().c_str()); + } + GKFS_DATA->spdlogger()->trace("{}() Done for chunkfile: {}", __func__, + entry.path().c_str()); + } + GKFS_DATA->spdlogger()->info("{}() Data redistribution completed.", __func__); - // Relocate data (chunks) - // auto relocate_chunk_rpc_id = - // gkfs::rpc::get_rpc_id(mid, gkfs::rpc::tag::relocate_chunk); - // for(auto& chunks_dir : - // GKFS_DATA->storage()->chunks_directory_iterator()) { - // if(!chunks_dir.is_directory()) { - // GKFS_DATA->spdlogger()->warn( - // "{}() Expected directory but got something else: {}", - // __func__, chunks_dir.path().string()); - // continue; - // } - // string file_path = GKFS_DATA->storage()->get_file_path( - // chunks_dir.path().filename().string()); - // - // for(auto& chunk_file : fs::directory_iterator(chunks_dir)) { - // if(!chunk_file.is_regular_file()) { - // GKFS_DATA->spdlogger()->warn( - // "{}() Expected regular file but got something - // else: {}", - // __func__, chunk_file.path().string()); - // continue; - // } - // gkfs::rpc::chnk_id_t chunk_id = - // std::stoul(chunk_file.path().filename().string()); - // auto destination = distributor.locate_data(file_path, - // chunk_id); size_t size = chunk_file.file_size(); - // - // GKFS_DATA->spdlogger()->trace( - // "{}() Checking {} chunk: {} size: {} {} {}", __func__, - // file_path, chunk_id, size, - // (destination == localhost ? " Stay on" : " -> Goto "), - // destination); - // - // if(destination == localhost) { - // continue; - // } - // - // // prepare bulk - // unique_ptr buf(new char[size]()); - // // read data (blocking) - // hg_size_t bytes_read = GKFS_DATA->storage()->read_chunk( - // file_path, chunk_id, buf.get(), size, 0); - // hg_bulk_t bulk{}; - // char* bufptr = buf.get(); - // auto ret = margo_bulk_create(mid, 1, (void**) &bufptr, - // &bytes_read, - // HG_BULK_READ_ONLY, &bulk); - // assert(ret == HG_SUCCESS); - // - // // send RPC - // rpc_relocate_chunk_in_t in{}; - // rpc_err_out_t out{}; - // hg_addr_t host_addr{}; - // - // in.path = file_path.c_str(); - // in.chunk_id = chunk_id; - // in.bulk_handle = bulk; - // - // ret = margo_addr_lookup(mid, - // hosts[destination].second.c_str(), - // &host_addr); - // assert(ret == HG_SUCCESS); - // - // // let's do this sequential first - // hg_handle_t handle; - // ret = margo_create(mid, host_addr, relocate_chunk_rpc_id, - // &handle); assert(ret == HG_SUCCESS); - // - // ret = margo_forward(handle, &in); // blocking - // assert(ret == HG_SUCCESS); - // - // ret = margo_get_output(handle, &out); - // assert(ret == HG_SUCCESS); - // - // // TODO(dauer) process output - // GKFS_DATA->storage()->remove_chunk(file_path, chunk_id); - // - // // FIXME This can leave behind empty directories, even when - // the - // // whole file is delete later. Three possibilities: - // // 1) Clean them up, but make sure this doesn't break another - // thread - // // creating a new chunk in this directory at the same time. 
- // // 2) Switch to a flat namespace without directories - // // 3) Ignore and waste some inodes - // - // if(HG_SUCCESS != gkfs::rpc::margo_client_cleanup( - // &handle, &out, &mid, &host_addr, - // &bulk)) { - // cout << "Error during margo cleanup.\n"; - // } - // } - // } } } // namespace gkfs::malleable diff --git a/src/daemon/malleability/rpc/forward_redistribution.cpp b/src/daemon/malleability/rpc/forward_redistribution.cpp index 50921778f..9bc7d05dc 100644 --- a/src/daemon/malleability/rpc/forward_redistribution.cpp +++ b/src/daemon/malleability/rpc/forward_redistribution.cpp @@ -28,6 +28,7 @@ #include #include +#include "common/rpc/rpc_util.hpp" namespace gkfs::malleable::rpc { @@ -79,7 +80,79 @@ forward_metadata(std::string& key, std::string& value, unsigned int dest_id) { return err; } -void -forward_data() {} +int +forward_data(const std::string& path, void* buf, const size_t count, + const uint64_t chnk_id, const uint64_t dest_id) { + hg_handle_t rpc_handle = nullptr; + rpc_write_data_in_t in{}; + rpc_data_out_t out{}; + int err = 0; + in.path = path.c_str(); + in.offset = 0; // relative to chunkfile not gkfs file + in.host_id = dest_id; + in.host_size = RPC_DATA->distributor()->hosts_size(); + in.chunk_n = 1; + in.chunk_start = chnk_id; + in.chunk_end = chnk_id; + in.total_chunk_size = count; + std::vector write_ops_vect = {1}; + in.wbitset = gkfs::rpc::compress_bitset(write_ops_vect).c_str(); + + hg_bulk_t bulk_handle = nullptr; + // register local target buffer for bulk access + auto bulk_buf = buf; + auto size = std::make_shared(count); // XXX Why shared ptr? + auto ret = margo_bulk_create(RPC_DATA->client_rpc_mid(), 1, &bulk_buf, + size.get(), HG_BULK_READ_ONLY, &bulk_handle); + if(ret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error("{}() Failed to create rpc bulk handle", + __func__); + return EBUSY; + } + in.bulk_handle = bulk_handle; + GKFS_DATA->spdlogger()->trace( + "{}() Sending non-blocking RPC to '{}': path '{}' offset '{}' chunk_n '{}' chunk_start '{}' chunk_end '{}' total_chunk_size '{}'", + __func__, dest_id, in.path, in.offset, in.chunk_n, in.chunk_start, + in.chunk_end, in.total_chunk_size); + ret = margo_create(RPC_DATA->client_rpc_mid(), + RPC_DATA->rpc_endpoints().at(dest_id), + RPC_DATA->rpc_client_ids().migrate_data_id, &rpc_handle); + if(ret != HG_SUCCESS) { + margo_destroy(rpc_handle); + margo_bulk_free(bulk_handle); + return EBUSY; + } + // Send RPC + ret = margo_forward(rpc_handle, &in); + if(ret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error( + "{}() Unable to send blocking rpc for path {} and recipient {}", + __func__, path, dest_id); + margo_destroy(rpc_handle); + margo_bulk_free(bulk_handle); + return EBUSY; + } + GKFS_DATA->spdlogger()->debug("{}() '1' RPCs sent, waiting for reply ...", + __func__); + ssize_t out_size = 0; + ret = margo_get_output(rpc_handle, &out); + if(ret != HG_SUCCESS) { + GKFS_DATA->spdlogger()->error( + "{}() Failed to get rpc output for path {} recipient {}", + __func__, path, dest_id); + err = EBUSY; + } + GKFS_DATA->spdlogger()->debug( + "{}() Got response from target '{}': err '{}' with io_size '{}'", + __func__, dest_id, out.err, out.io_size); + if(out.err != 0) + err = out.err; + else + out_size += static_cast(out.io_size); + margo_free_output(rpc_handle, &out); + margo_destroy(rpc_handle); + margo_bulk_free(bulk_handle); + return err; +} } // namespace gkfs::malleable::rpc \ No newline at end of file diff --git a/tools/malleability.cpp b/tools/malleability.cpp index 5380f00ad..ea3be3969 100644 --- 
a/tools/malleability.cpp +++ b/tools/malleability.cpp @@ -119,8 +119,10 @@ main(int argc, const char* argv[]) { cout << "Expand start failed. Exiting...\n"; gkfs_end(); return -1; + } else { + cout << "Expansion process from " << current_instance + << " nodes to " << expanded_instance << " nodes launched...\n"; } - cout << "Expand start " << err << "\n"; } else if(opts.action == "status") { if(gkfs::malleable::expand_status() > 0) { cout << "Expansion in progress...\n"; -- GitLab From b6184dfe20674d308e58c08b31bd37227d312010 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Mon, 8 Jul 2024 12:28:59 +0200 Subject: [PATCH 13/17] Better log and cleanup --- include/daemon/handler/rpc_defs.hpp | 2 - .../daemon/malleability/malleable_manager.hpp | 6 +- src/client/malleability.cpp | 13 +-- src/client/rpc/forward_malleability.cpp | 22 ++--- src/daemon/handler/srv_malleability.cpp | 7 -- src/daemon/malleability/malleable_manager.cpp | 82 ++++++++++--------- tools/malleability.cpp | 18 ++-- 7 files changed, 74 insertions(+), 76 deletions(-) diff --git a/include/daemon/handler/rpc_defs.hpp b/include/daemon/handler/rpc_defs.hpp index bb6c2ce13..8f3f47bad 100644 --- a/include/daemon/handler/rpc_defs.hpp +++ b/include/daemon/handler/rpc_defs.hpp @@ -94,7 +94,5 @@ DECLARE_MARGO_RPC_HANDLER(rpc_srv_expand_finalize) DECLARE_MARGO_RPC_HANDLER(rpc_srv_migrate_metadata) -DECLARE_MARGO_RPC_HANDLER(rpc_srv_migrate_data) - #endif // GKFS_DAEMON_RPC_DEFS_HPP diff --git a/include/daemon/malleability/malleable_manager.hpp b/include/daemon/malleability/malleable_manager.hpp index 3293f1d20..12ac587f4 100644 --- a/include/daemon/malleability/malleable_manager.hpp +++ b/include/daemon/malleability/malleable_manager.hpp @@ -48,15 +48,15 @@ private: connect_to_hosts( const std::vector>& hosts); - static void - expand_abt(void* _arg); - int redistribute_metadata(); void redistribute_data(); + static void + expand_abt(void* _arg); + public: void expand_start(int old_server_conf, int new_server_conf); diff --git a/src/client/malleability.cpp b/src/client/malleability.cpp index f53be3e9d..0e2a2e8af 100644 --- a/src/client/malleability.cpp +++ b/src/client/malleability.cpp @@ -59,19 +59,22 @@ expand_start(int old_server_conf, int new_server_conf) { // TODO check that hostsfile contains endmarker return gkfs::malleable::rpc::forward_expand_start(old_server_conf, new_server_conf); - // return 0; } int expand_status() { - LOG(INFO, "{}() Expand operation status", __func__); - return gkfs::malleable::rpc::forward_expand_status(); + LOG(INFO, "{}() enter", __func__); + auto res = gkfs::malleable::rpc::forward_expand_status(); + LOG(INFO, "{}() '{}' nodes working on extend operation.", __func__, res); + return res; } int expand_finalize() { - LOG(INFO, "{}() Expand operation finalize", __func__); - return gkfs::malleable::rpc::forward_expand_finalize(); + LOG(INFO, "{}() enter", __func__); + auto res = gkfs::malleable::rpc::forward_expand_finalize(); + LOG(INFO, "{}() extend operation finalized. 
", __func__); + return res; } } // namespace gkfs::malleable \ No newline at end of file diff --git a/src/client/rpc/forward_malleability.cpp b/src/client/rpc/forward_malleability.cpp index ec02a8170..bf59dad5d 100644 --- a/src/client/rpc/forward_malleability.cpp +++ b/src/client/rpc/forward_malleability.cpp @@ -132,19 +132,20 @@ forward_expand_status() { // wait for RPC responses for(std::size_t i = 0; i < handles.size(); ++i) { - gkfs::malleable::rpc::expand_status::output out; - try { out = handles[i].get().at(0); - - if(out.err() != 0) { + if(out.err() > 0) { + LOG(DEBUG, + "{}() Host '{}' not done yet with malleable operation.", + __func__, targets[i]); + err += out.err(); + } + if(out.err() < 0) { + // ignore. shouldn't happen for now LOG(ERROR, - "{}() Failed to retrieve dir entries from host '{}'. Error '{}'", - __func__, targets[i], strerror(out.err())); - err = out.err(); - // We need to gather all responses before exiting - continue; + "{}() Host '{}' is unable to check for expansion progress. (shouldn't happen)", + __func__, targets[i]); } } catch(const std::exception& ex) { LOG(ERROR, @@ -200,8 +201,7 @@ forward_expand_finalize() { out = handles[i].get().at(0); if(out.err() != 0) { - LOG(ERROR, - "{}() Failed to retrieve dir entries from host '{}'. Error '{}'", + LOG(ERROR, "{}() Failed finalize on host '{}'. Error '{}'", __func__, targets[i], strerror(out.err())); err = out.err(); // We need to gather all responses before exiting diff --git a/src/daemon/handler/srv_malleability.cpp b/src/daemon/handler/srv_malleability.cpp index 4bdfea5cf..6a732bee7 100644 --- a/src/daemon/handler/srv_malleability.cpp +++ b/src/daemon/handler/srv_malleability.cpp @@ -138,11 +138,6 @@ rpc_srv_migrate_metadata(hg_handle_t handle) { return gkfs::rpc::cleanup_respond(&handle, &in, &out); } -hg_return_t -rpc_srv_migrate_data(hg_handle_t handle) { - return HG_SUCCESS; -} - } // namespace DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_start) @@ -152,5 +147,3 @@ DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_status) DEFINE_MARGO_RPC_HANDLER(rpc_srv_expand_finalize) DEFINE_MARGO_RPC_HANDLER(rpc_srv_migrate_metadata) - -DEFINE_MARGO_RPC_HANDLER(rpc_srv_migrate_data) diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp index 3018d3816..c525956af 100644 --- a/src/daemon/malleability/malleable_manager.cpp +++ b/src/daemon/malleability/malleable_manager.cpp @@ -194,46 +194,6 @@ MalleableManager::connect_to_hosts( } } -void -MalleableManager::expand_abt(void* _arg) { - GKFS_DATA->spdlogger()->info("{}() Starting expansion process...", - __func__); - GKFS_DATA->redist_running(true); - GKFS_DATA->malleable_manager()->redistribute_metadata(); - try { - GKFS_DATA->malleable_manager()->redistribute_data(); - } catch(const gkfs::data::ChunkStorageException& e) { - GKFS_DATA->spdlogger()->error("{}() Failed to redistribute data: '{}'", - __func__, e.what()); - } - GKFS_DATA->redist_running(false); - GKFS_DATA->spdlogger()->info( - "{}() Expansion process successfully finished.", __func__); -} - -void -MalleableManager::expand_start(int old_server_conf, int new_server_conf) { - auto hosts = read_hosts_file(); - if(hosts.size() != static_cast(new_server_conf)) { - throw runtime_error( - fmt::format("MalleableManager::{}() Something is wrong. 
" - "Number of hosts in hosts file ({}) " - "does not match new server configuration ({})", - __func__, hosts.size(), new_server_conf)); - } - connect_to_hosts(hosts); - RPC_DATA->distributor()->hosts_size(hosts.size()); - auto abt_err = - ABT_thread_create(RPC_DATA->io_pool(), expand_abt, - ABT_THREAD_ATTR_NULL, nullptr, &redist_thread_); - if(abt_err != ABT_SUCCESS) { - auto err_str = fmt::format( - "MalleableManager::{}() Failed to create ABT thread with abt_err '{}'", - __func__, abt_err); - throw runtime_error(err_str); - } -} - int MalleableManager::redistribute_metadata() { uint64_t count = 0; @@ -352,4 +312,46 @@ MalleableManager::redistribute_data() { __func__); } +void +MalleableManager::expand_abt(void* _arg) { + GKFS_DATA->spdlogger()->info("{}() Starting expansion process...", + __func__); + GKFS_DATA->redist_running(true); + GKFS_DATA->malleable_manager()->redistribute_metadata(); + try { + GKFS_DATA->malleable_manager()->redistribute_data(); + } catch(const gkfs::data::ChunkStorageException& e) { + GKFS_DATA->spdlogger()->error("{}() Failed to redistribute data: '{}'", + __func__, e.what()); + } + GKFS_DATA->redist_running(false); + GKFS_DATA->spdlogger()->info( + "{}() Expansion process successfully finished.", __func__); +} + +// PUBLIC + +void +MalleableManager::expand_start(int old_server_conf, int new_server_conf) { + auto hosts = read_hosts_file(); + if(hosts.size() != static_cast(new_server_conf)) { + throw runtime_error( + fmt::format("MalleableManager::{}() Something is wrong. " + "Number of hosts in hosts file ({}) " + "does not match new server configuration ({})", + __func__, hosts.size(), new_server_conf)); + } + connect_to_hosts(hosts); + RPC_DATA->distributor()->hosts_size(hosts.size()); + auto abt_err = + ABT_thread_create(RPC_DATA->io_pool(), expand_abt, + ABT_THREAD_ATTR_NULL, nullptr, &redist_thread_); + if(abt_err != ABT_SUCCESS) { + auto err_str = fmt::format( + "MalleableManager::{}() Failed to create ABT thread with abt_err '{}'", + __func__, abt_err); + throw runtime_error(err_str); + } +} + } // namespace gkfs::malleable diff --git a/tools/malleability.cpp b/tools/malleability.cpp index ea3be3969..14022ede7 100644 --- a/tools/malleability.cpp +++ b/tools/malleability.cpp @@ -105,7 +105,7 @@ main(int argc, const char* argv[]) { if(opts.verbose) { // Check the verbose flag from the main options std::cout << "Verbose mode is on." << std::endl; } - int err; + int res; gkfs_init(); if(opts.action == "start") { @@ -113,9 +113,9 @@ main(int argc, const char* argv[]) { if(current_instance == -1 || expanded_instance == -1) { return 1; } - err = gkfs::malleable::expand_start(current_instance, + res = gkfs::malleable::expand_start(current_instance, expanded_instance); - if(err) { + if(res) { cout << "Expand start failed. 
Exiting...\n"; gkfs_end(); return -1; @@ -124,14 +124,16 @@ main(int argc, const char* argv[]) { << " nodes to " << expanded_instance << " nodes launched...\n"; } } else if(opts.action == "status") { - if(gkfs::malleable::expand_status() > 0) { - cout << "Expansion in progress...\n"; + res = gkfs::malleable::expand_status(); + if(res > 0) { + cout << "Expansion in progress: " << res + << " nodes not finished.\n"; } else { - cout << "No expansion running.\n"; + cout << "No expansion running/finished.\n"; } } else if(opts.action == "finalize") { - err = gkfs::malleable::expand_finalize(); - cout << "Expand finalize " << err << endl; + res = gkfs::malleable::expand_finalize(); + cout << "Expand finalize " << res << endl; } gkfs_end(); } \ No newline at end of file -- GitLab From bac0881ae2ee85293f081c23fb81701702dfff6b Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Tue, 9 Jul 2024 17:47:04 +0200 Subject: [PATCH 14/17] Finish GekkoFS expand first version via gkfs script. --- scripts/run/gkfs | 230 ++++++++++++++++++++++++++++++------ scripts/run/gkfs.conf | 19 +-- scripts/run/gkfs_io500.conf | 7 +- tools/malleability.cpp | 24 +++- 4 files changed, 231 insertions(+), 49 deletions(-) diff --git a/scripts/run/gkfs b/scripts/run/gkfs index 9f1482331..5b6f542bc 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -15,25 +15,34 @@ fi C_AST_GREEN="${C_GREEN}*${C_NONE} [gkfs] " C_AST_YELLOW="${C_BYELLOW}*${C_NONE} [gkfs] " C_AST_RED="${C_BRED}*${C_NONE} [gkfs] " + +# Important const globals +FS_INSTANCE_MARKER_CONST="#FS_INSTANCE_END" ####################################### # Poll GekkoFS hostsfile until all daemons are started. # Exits with 1 if daemons cannot be started. # Globals: # HOSTSFILE # NODE_NUM +# NODE_CNT_EXPAND +# COMMAND # Arguments: # None # Outputs: # Writes error to stdout ####################################### wait_for_gkfs_daemons() { - sleep 2 + sleep 1 local server_wait_cnt=0 local nodes=1 if [[ -n ${NODE_NUM} ]]; then nodes=${NODE_NUM} fi - until [ $(($(wc -l "${HOSTSFILE}" 2> /dev/null | awk '{print $1}') + 0)) -eq "${nodes}" ] + # when expanding the total number of nodes is: initial nodelist + expand nodelist + if [[ ${COMMAND} == *"expand"* ]]; then + nodes=${NODE_CNT_EXPAND} + fi + until [ $(($(grep -cv '^#' "${HOSTSFILE}" 2> /dev/null | awk '{print $1}') + 0)) -eq "${nodes}" ] do #echo "Waiting for all servers to report connection. Try $server_wait_cnt" sleep 2 @@ -43,15 +52,13 @@ wait_for_gkfs_daemons() { exit 1 fi done - # This must be equivalent to the line set in include/common/common_defs.hpp - echo "#FS_INSTANCE_END" >> "${HOSTSFILE}" } ####################################### # Creates a pid file for a given pid. If pid file exists, we check if its pids are still valid. # If valid, an additional line is added. Otherwise, the pid in the file is deleted. # Globals: -# SRUN_DAEMON_PID_FILE -# SRUN_PROXY_PID_FILE +# DAEMON_PID_FILE +# PROXY_PID_FILE # VERBOSE # Arguments: # path to pid file @@ -59,15 +66,15 @@ wait_for_gkfs_daemons() { # Outputs: # Writes status to stdout if VERBOSE is true ####################################### -create_pid_file() { +write_pid_file() { local pid_file=${1} local pid=${2} if [[ ${VERBOSE} == true ]]; then echo -e "${C_AST_GREEN}Creating pid file at ${pid_file} with pid ${pid} ..." 
fi - # if PID file exists another daemon could run + # if PID file exists another daemon (or srun) could run if [[ -e ${pid_file} ]]; then - local pid_file_tmp=${SRUN_DAEMON_PID_FILE}.swp + local pid_file_tmp=${DAEMON_PID_FILE}.swp # create empty tmp file truncate -s 0 "${pid_file_tmp}" while IFS= read -r line @@ -101,10 +108,13 @@ create_pid_file() { # GKFS_DAEMON_LOG_PATH # GKFS_DAEMON_LOG_LEVEL # RUN_FOREGROUND +# DAEMON_BIN +# PROXY_BIN +# COMMAND # Outputs: # Writes status to stdout ####################################### -start_daemon() { +start_daemons() { local node_list local srun_daemon_cmd local srun_proxy_cmd @@ -162,10 +172,14 @@ start_daemon() { echo -e "${C_AST_GREEN}cpus_per_task: ${CPUS_PER_TASK}" [[ ${USE_PROXY} == true ]] && echo -e "${C_AST_GREEN}Proxy enabled" fi - if [[ ${VERBOSE} == true ]]; then - echo -e "${C_AST_GREEN}Cleaning host file ..." + # sanity checks before starting + if [[ ${COMMAND} == *"start"* ]]; then + # only clear hostfile when starting for the first time + if [[ ${VERBOSE} == true ]]; then + echo -e "${C_AST_GREEN}Cleaning host file ..." + fi + rm "${HOSTSFILE}" 2> /dev/null fi - rm "${HOSTSFILE}" 2> /dev/null # Setting up base daemon cmd local daemon_cmd="${DAEMON_BIN} -r ${ROOTDIR} -m ${MOUNTDIR} -H ${HOSTSFILE} ${DAEMON_ARGS_}" if [[ ${USE_PROXY} == true ]]; then @@ -175,24 +189,24 @@ start_daemon() { if [[ -n ${DAEMON_AFFINITY_} ]]; then daemon_cmd="${DAEMON_AFFINITY_} ${daemon_cmd}" fi - # final daemon execute command + # final daemon execute COMMAND daemon_execute="${srun_daemon_cmd} ${SRUN_DAEMON_ARGS} ${daemon_cmd}" - # Setting up base proxy command + # Setting up base proxy COMMAND if [[ ${USE_PROXY} == true ]]; then local proxy_cmd="${PROXY_BIN} -H ${HOSTSFILE} --pid-path ${PROXY_LOCAL_PID_FILE} ${PROXY_ARGS_}" # Set cpu affinity for proxy if [[ -n ${PROXY_AFFINITY_} ]]; then proxy_cmd="${PROXY_AFFINITY_} ${proxy_cmd}" fi - # final proxy execute command + # final proxy execute COMMAND proxy_execute="${srun_proxy_cmd} ${SRUN_PROXY_ARGS} ${proxy_cmd}" fi if [[ ${VERBOSE} == true ]]; then - echo -e "${C_AST_GREEN}Full execute DAEMON command:" + echo -e "${C_AST_GREEN}Full execute DAEMON COMMAND:" echo -e "${C_AST_GREEN}# $daemon_execute" - [[ ${USE_PROXY} == true ]] && echo -e "${C_AST_GREEN}Full execute PROXY command:" + [[ ${USE_PROXY} == true ]] && echo -e "${C_AST_GREEN}Full execute PROXY COMMAND:" [[ ${USE_PROXY} == true ]] && echo -e "${C_AST_GREEN}# $proxy_execute" fi # setup environment variables @@ -256,24 +270,24 @@ start_daemon() { fi done else - create_pid_file ${SRUN_DAEMON_PID_FILE} ${daemon_pid} + write_pid_file ${DAEMON_PID_FILE} ${daemon_pid} if [[ ${USE_PROXY} == true ]]; then - create_pid_file ${SRUN_PROXY_PID_FILE} ${proxy_pid} + write_pid_file ${PROXY_PID_FILE} ${proxy_pid} fi fi } ####################################### # Stops GekkoFS daemons for the configured pid file # Globals: -# SRUN_DAEMON_PID_FILE -# SRUN_PROXY_PID_FILE +# DAEMON_PID_FILE +# PROXY_PID_FILE # VERBOSE # Outputs: # Writes status to stdout ####################################### stop_daemons() { - local pid_file=${SRUN_DAEMON_PID_FILE} - local proxy_pid_file=${SRUN_PROXY_PID_FILE} + local pid_file=${DAEMON_PID_FILE} + local proxy_pid_file=${PROXY_PID_FILE} # if no daemon or proxy pid file exists, exit if [[ ! -e ${pid_file} ]] && [[ ! -e ${proxy_pid_file} ]]; then echo -e "${C_AST_RED}No pid files found -> no daemon or proxy running. Exiting ..." 
@@ -303,6 +317,8 @@ stop_daemons() { if [[ -e ${pid_file} ]]; then while IFS= read -r line do + # if line starts with # continue + [[ ${line} =~ ^#.*$ ]] && continue if ps -p "${line}" > /dev/null; then echo -e "${C_AST_GREEN}Stopping daemon with pid ${line}" start_time="$(date -u +%s.%3N)" @@ -320,6 +336,142 @@ stop_daemons() { echo -e "${C_AST_GREEN}Shutdown time: ${elapsed} seconds" fi } + +####################################### +# Sets up expand progress for later operation +# Globals: +# RUN_FOREGROUND +# EXPAND_NODELIST +# HOSTSFILE +# DAEMON_NODELIST +# USE_PROXY +# GKFS_MALLEABILITY_BIN_ +# VERBOSE +# Outputs: +# sets GKFS_MALLEABILITY_BIN_ if not already given by config +####################################### +expand_setup() { + # sanity checks + if [[ ${RUN_FOREGROUND} == true ]]; then + echo -e "${C_AST_RED}ERROR: Cannot run in foreground for expansion. Exiting ..." + exit 1 + fi + if [[ -z ${EXPAND_NODELIST} ]]; then + echo -e "${C_AST_RED}ERROR: No expand host file given. We need to know which nodes should be used. Exiting ..." + exit 1 + fi + # if proxy is enabled error out + if [[ ${USE_PROXY} == true ]]; then + echo -e "${C_AST_RED}ERROR: Proxy not supported for expansion. Exiting ..." + exit 1 + fi + # check that gkfs host file exists + if [[ ! -f ${HOSTSFILE} ]]; then + echo -e "${C_AST_RED}ERROR: No GekkoFS hostfile for expansion found at ${HOSTSFILE}. Exiting ..." + exit 1 + fi + # check that daemon pid file exists + if [[ ! -f ${DAEMON_PID_FILE} ]]; then + echo -e "${C_AST_RED}ERROR: No daemon pid file found at ${DAEMON_PID_FILE}." + echo -e "${C_AST_RED} Existing daemon must run in background for extension. Exiting ..." + exit 1 + fi + # modify all necessary environment variables from the config file to fit expand + DAEMON_NODELIST_=${DAEMON_NODELIST} + # Set daemon node list based on given expand hostfile + DAEMON_NODELIST_=$(readlink -f ${EXPAND_NODELIST}) + # setup + # This must be equivalent to the line set in include/common/common_defs.hpp + echo "$FS_INSTANCE_MARKER_CONST" >> "${HOSTSFILE}" + # check that the gkfs_malleability binary exists in $PATH if not already set via config + if [[ -z ${GKFS_MALLEABILITY_BIN_} ]]; then + GKFS_MALLEABILITY_BIN_=$(COMMAND -v gkfs_malleability) + fi + # if not found check if it exists in the parent directory of the daemon bin + if [[ -z ${GKFS_MALLEABILITY_BIN_} ]]; then + # check that the gkfs_malleability binary exists somewhere in the parent directory where daemon bin is located + if [[ -f $(dirname ${DAEMON_BIN})/gkfs_malleability ]]; then + GKFS_MALLEABILITY_BIN_=$(readlink -f $(dirname ${DAEMON_BIN})/gkfs_malleability) + else + echo -e "${C_AST_RED}ERROR: gkfs_malleability binary not found. Exiting ..." 
+ exit 1 + fi + fi +} + +####################################### +# Prints expansion progress +# Input: +# $1 current +# $2 total +# VERBOSE +# Outputs: +# Writes status to stdout +####################################### +show_expand_progress() { + local current="$1" + local total="$2" + local remaining=$((total - current)) + local progress=$(( (remaining * 100) / total )) + local bar_length=20 + local filled_length=$(( (progress * bar_length) / 100 )) + local empty_length=$(( bar_length - filled_length )) + + # Clear the entire line and move cursor to the beginning + tput el1; tput cr + + printf "[" + for ((i=0; i] [-m/--mountdir ] [-a/--args ] [--proxy ] [-f/--foreground ] [--srun ] [-n/--numnodes ] [--cpuspertask <64>] [-v/--verbose ] - {start,stop} + {start,expand,stop} " } ####################################### @@ -345,7 +497,7 @@ help_msg() { additional permanent configurations can be set. positional arguments: - command Command to execute: 'start' and 'stop' + COMMAND Command to execute: 'start', 'stop', 'expand' optional arguments: -h, --help Shows this help message and exits @@ -361,6 +513,7 @@ help_msg() { Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable. --cpuspertask <#cores> Set the number of cores the daemons can use. Must use '--srun'. -c, --config Path to configuration file. By defaults looks for a 'gkfs.conf' in this directory. + -e, --expand_hostfile Path to the hostfile with new nodes where GekkoFS should be extended to (hostfile contains one line per node). -v, --verbose Increase verbosity " } @@ -410,8 +563,10 @@ PROXY_BIN=$(readlink -f ${PROXY_BIN}) PRELOAD_LIB=$(readlink -f ${PRELOAD_LIB}) HOSTSFILE=$(readlink -f ${HOSTSFILE}) PROXY_LOCAL_PID_FILE=$(readlink -f ${PROXY_LOCAL_PID_FILE}) -SRUN_DAEMON_PID_FILE=$(readlink -f ${SRUN_DAEMON_PID_FILE}) -SRUN_PROXY_PID_FILE=$(readlink -f ${SRUN_PROXY_PID_FILE}) +DAEMON_PID_FILE=$(readlink -f ${DAEMON_PID_FILE}) +PROXY_PID_FILE=$(readlink -f ${PROXY_PID_FILE}) +EXPAND_NODELIST="" +GKFS_MALLEABILITY_BIN_=${GKFS_MALLEABILITY_BIN} # parse input POSITIONAL=() @@ -476,6 +631,11 @@ while [[ $# -gt 0 ]]; do shift # past argument shift # past value ;; + -e | --expand_hostfile) + EXPAND_NODELIST=$2 + shift # past argument + shift # past value + ;; -h | --help) help_msg exit @@ -498,18 +658,20 @@ if [[ -z ${1+x} ]]; then usage_short exit 1 fi -command="${1}" +COMMAND="${1}" # checking input -if [[ ${command} != *"start"* ]] && [[ ${command} != *"stop"* ]]; then - echo -e "${C_AST_RED}ERROR: command ${command} not supported" +if [[ ${COMMAND} != *"start"* ]] && [[ ${COMMAND} != *"stop"* ]] && [[ ${COMMAND} != *"expand"* ]]; then + echo -e "${C_AST_RED}ERROR: COMMAND ${COMMAND} not supported" usage_short exit 1 fi # Run script -if [[ ${command} == "start" ]]; then - start_daemon -elif [[ ${command} == "stop" ]]; then +if [[ ${COMMAND} == "start" ]]; then + start_daemons +elif [[ ${COMMAND} == "stop" ]]; then stop_daemons +elif [[ ${COMMAND} == "expand" ]]; then + add_daemons fi if [[ ${VERBOSE} == true ]]; then echo -e "${C_AST_GREEN}Nothing left to do. 
Exiting :)" diff --git a/scripts/run/gkfs.conf b/scripts/run/gkfs.conf index fc7922ddb..c3277229b 100644 --- a/scripts/run/gkfs.conf +++ b/scripts/run/gkfs.conf @@ -6,16 +6,20 @@ DAEMON_BIN=../../build/src/daemon/gkfs_daemon PROXY_BIN=../../build/src/proxy/gkfs_proxy # client configuration (needs to be set for all clients) -LIBGKFS_HOSTS_FILE=./gkfs_hostfile +LIBGKFS_HOSTS_FILE=/home/evie/workdir/gkfs_hosts.txt ## daemon configuration -DAEMON_ROOTDIR=/dev/shm/vef_gkfs_rootdir -DAEMON_MOUNTDIR=/dev/shm/vef_gkfs_mountdir +#DAEMON_ROOTDIR=/dev/shm/vef_gkfs_rootdir +DAEMON_ROOTDIR=/dev/shm/gkfs_rootdir +#DAEMON_MOUNTDIR=/dev/shm/vef_gkfs_mountdir +DAEMON_MOUNTDIR=/tmp/gkfs_mountdir # additional daemon arguments (see `gkfs_daemon -h`) # use numactl to pin daemon to socket DAEMON_ARGS="-l lo -c" # use cpu affinity. Set this eg to `taskset -c ...` DAEMON_AFFINITY="" +# used when run in background +DAEMON_PID_FILE=./gkfs_daemon.pid ## proxy configuration USE_PROXY=false @@ -24,6 +28,8 @@ PROXY_LOCAL_PID_FILE=/dev/shm/vef_gkfs_proxy.pid PROXY_ARGS="-p ofi+sockets" # use cpu affinity. Set this eg to `taskset -c ...` PROXY_AFFINITY="" +# used when run in background +PROXY_PID_FILE=./gkfs_proxy.pid ## slurm configuration # Use Slurm's srun to start the daemons on multiple nodes and set specific srun args @@ -35,13 +41,10 @@ SRUN_ARGS="--overlap --ntasks-per-node=1 --overcommit --overlap --oversubscribe SRUN_DAEMON_ARGS="" # Specific srun args for proxy SRUN_PROXY_ARGS="" -# path to daemon pid file; created where the script is run -SRUN_DAEMON_PID_FILE=./gkfs_daemon.pid -SRUN_PROXY_PID_FILE=./gkfs_proxy.pid # logging -GKFS_DAEMON_LOG_LEVEL=info -GKFS_DAEMON_LOG_PATH=/dev/shm/gkfs_daemon.log +GKFS_DAEMON_LOG_LEVEL=trace +GKFS_DAEMON_LOG_PATH=/tmp/gkfs_daemon.log GKFS_PROXY_LOG_LEVEL=info GKFS_PROXY_LOG_PATH=/dev/shm/gkfs_proxy.log # Modify the following for the client diff --git a/scripts/run/gkfs_io500.conf b/scripts/run/gkfs_io500.conf index eaca98fde..5a3cb688e 100644 --- a/scripts/run/gkfs_io500.conf +++ b/scripts/run/gkfs_io500.conf @@ -18,6 +18,8 @@ DAEMON_MOUNTDIR=/dev/shm/vef_gkfs_mountdir DAEMON_ARGS="-P ofi+verbs -l ib0 -c" # use cpu affinity. Set this eg to `taskset -c ...` DAEMON_AFFINITY="taskset -c 0-63" +# used when run in background +DAEMON_PID_FILE=/lustre/project/nhr-admire/vef/run/io500/gkfs_daemon.pid ## proxy configuration USE_PROXY=false @@ -26,6 +28,8 @@ PROXY_LOCAL_PID_FILE=/dev/shm/vef_gkfs_proxy.pid PROXY_ARGS="-p ofi+verbs" # use cpu affinity. 
Set this eg to `taskset -c ...` PROXY_AFFINITY="taskset -c 0-63" +# used when run in background +PROXY_PID_FILE=/lustre/project/nhr-admire/vef/run/io500/gkfs_proxy.pid ## slurm configuration # Use Slurm's srun to start the daemons on multiple nodes and set specific srun args @@ -37,9 +41,6 @@ SRUN_ARGS="--overlap --ntasks-per-node=1 --overcommit --overlap --oversubscribe SRUN_DAEMON_ARGS="" # Specific srun args for proxy SRUN_PROXY_ARGS="" -# path to daemon pid file; created where the script is run -SRUN_DAEMON_PID_FILE=/lustre/project/nhr-admire/vef/run/io500/gkfs_daemon.pid -SRUN_PROXY_PID_FILE=/lustre/project/nhr-admire/vef/run/io500/gkfs_proxy.pid # logging configuration GKFS_DAEMON_LOG_LEVEL=info diff --git a/tools/malleability.cpp b/tools/malleability.cpp index 14022ede7..c39ab7395 100644 --- a/tools/malleability.cpp +++ b/tools/malleability.cpp @@ -39,6 +39,7 @@ using namespace std; struct cli_options { bool verbose = false; + bool machine_readable = false; string action; string subcommand; }; @@ -90,6 +91,9 @@ main(int argc, const char* argv[]) { // Global verbose flag desc.add_flag("--verbose,-v", opts.verbose, "Verbose output"); + desc.add_flag("--machine-readable,-m", opts.machine_readable, + "machine-readable output"); + auto expand_args = desc.add_subcommand("expand", "Expansion-related actions"); @@ -126,14 +130,26 @@ main(int argc, const char* argv[]) { } else if(opts.action == "status") { res = gkfs::malleable::expand_status(); if(res > 0) { - cout << "Expansion in progress: " << res - << " nodes not finished.\n"; + if(opts.machine_readable) { + cout << res; + } else { + cout << "Expansion in progress: " << res + << " nodes not finished.\n"; + } } else { - cout << "No expansion running/finished.\n"; + if(opts.machine_readable) { + cout << res; + } else { + cout << "No expansion running/finished.\n"; + } } } else if(opts.action == "finalize") { res = gkfs::malleable::expand_finalize(); - cout << "Expand finalize " << res << endl; + if(opts.machine_readable) { + cout << res; + } else { + cout << "Expand finalize " << res << endl; + } } gkfs_end(); } \ No newline at end of file -- GitLab From 963b9e4fd33a7ae85e77fa4899df0a11535650c6 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 12 Jul 2024 11:23:03 +0200 Subject: [PATCH 15/17] Rudimentary Proxy support for extended file systems. Proxy must be restarted to know about the file system extension. --- scripts/run/gkfs | 2 ++ src/proxy/util.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scripts/run/gkfs b/scripts/run/gkfs index 5b6f542bc..8c11bbdb2 100755 --- a/scripts/run/gkfs +++ b/scripts/run/gkfs @@ -361,6 +361,8 @@ expand_setup() { exit 1 fi # if proxy is enabled error out + # to support proxy, all proxies need to be shutdown during expansion and started up after again + # to get the new configuration. if [[ ${USE_PROXY} == true ]]; then echo -e "${C_AST_RED}ERROR: Proxy not supported for expansion. Exiting ..." 
exit 1 diff --git a/src/proxy/util.cpp b/src/proxy/util.cpp index c3f5c6045..ec0e762ce 100644 --- a/src/proxy/util.cpp +++ b/src/proxy/util.cpp @@ -53,6 +53,8 @@ load_hostfile(const std::string& lfpath) { string uri; std::smatch match; while(getline(lf, line)) { + if(line[0] == '#') + continue; if(!regex_match(line, match, line_re)) { PROXY_DATA->log()->debug( "{}() Unrecognized line format: [path: '{}', line: '{}']", -- GitLab From 0f42da539bf449d30f5154de773f6c44c5ea293a Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 12 Jul 2024 14:40:16 +0200 Subject: [PATCH 16/17] Fix protocol for daemon RPC client --- src/daemon/daemon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 9e7075e49..a00aa352c 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp @@ -284,7 +284,7 @@ init_rpc_client() { struct margo_init_info args = {nullptr}; args.json_config = margo_config.c_str(); args.hg_init_info = &hg_options; - auto* mid = margo_init_ext(GKFS_DATA->bind_addr().c_str(), + auto* mid = margo_init_ext(GKFS_DATA->rpc_protocol().c_str(), MARGO_CLIENT_MODE, &args); if(mid == MARGO_INSTANCE_NULL) { @@ -293,7 +293,7 @@ init_rpc_client() { GKFS_DATA->spdlogger()->info( "{}() RPC client initialization successful for protocol {}", - __func__, GKFS_DATA->bind_addr()); + __func__, GKFS_DATA->rpc_protocol()); RPC_DATA->client_rpc_mid(mid); register_client_rpcs(mid); @@ -517,7 +517,7 @@ init_environment() { // Init margo client GKFS_DATA->spdlogger()->debug("{}() Initializing RPC client: '{}'", - __func__, GKFS_DATA->bind_addr()); + __func__, GKFS_DATA->rpc_protocol()); try { init_rpc_client(); } catch(const std::exception& e) { -- GitLab From 49263be809964c8bb0e352485aed4c3cc9cec3c0 Mon Sep 17 00:00:00 2001 From: Marc Vef Date: Fri, 12 Jul 2024 16:33:27 +0200 Subject: [PATCH 17/17] Cleanup, Readme, changelog. --- CHANGELOG.md | 6 ++ README.md | 72 ++++++++++++++++--- scripts/run/gkfs | 2 - scripts/run/gkfs.conf | 5 +- src/daemon/malleability/malleable_manager.cpp | 10 ++- 5 files changed, 80 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e669d9123..df0b9bc3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,12 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ### New +- Added file system expansion support ([!196](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/196)). + - Added the tool `gkfs_malleability` to steer start, status, and finalize requests for expansion operations. + - `-DGKFS_BUILD_TOOLS=ON` must be set for CMake to build the tool. + - Overhauled the `gkfs` run script to accommodate the new tool. + - During expansion, redistribution of data is performed by the daemons. Therefore, an RPC client for daemons was added. + - See Readme for usage details. - Propagate PKG_CONFIG_PATH to dependency scripts ([!185](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/185)). - Added syscall support for listxattr family ([!186](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_request/186)). - Remove optimization, removing one RPC per operation ([!195](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_request/195)). diff --git a/README.md b/README.md index e57f23483..3c6004aef 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ to be empty. For MPI application, the `LD_PRELOAD` variable can be passed with the `-x` argument for `mpirun/mpiexec`. -## Run GekkoFS daemons on multiple nodes (beta version!) 
+## Run GekkoFS daemons on multiple nodes
 
 The `scripts/run/gkfs` script can be used to simplify starting the GekkoFS daemon on one or multiple nodes. To start
 GekkoFS on multiple nodes, a Slurm environment that can execute `srun` is required. Users can further
@@ -168,9 +168,9 @@ modify `scripts/run/gkfs.conf` to mold default configurations to their environme
 The following options are available for `scripts/run/gkfs`:
 
 ```bash
-usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args ] [-f/--foreground ]
-            [--srun ] [-n/--numnodes ] [--cpuspertask <64>] [--numactl ] [-v/--verbose ]
-            {start,stop}
+usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args ] [--proxy ] [-f/--foreground ]
+            [--srun ] [-n/--numnodes ] [--cpuspertask <64>] [-v/--verbose ]
+            {start,expand,stop}
 
 This script simplifies starting and stopping GekkoFS daemons. If daemons are started on multiple nodes,
@@ -178,21 +178,23 @@ usage: gkfs [-h/--help] [-r/--rootdir ] [-m/--mountdir ] [-a/--args
 additional permanent configurations can be set.
 
 positional arguments:
-    command                Command to execute: 'start' and 'stop'
+    COMMAND                Command to execute: 'start', 'stop', 'expand'
 
 optional arguments:
    -h, --help              Shows this help message and exits
-   -r, --rootdir           Providing the rootdir path for GekkoFS daemons.
-   -m, --mountdir          Providing the mountdir path for GekkoFS daemons.
-   -a, --args
+   -r, --rootdir           The rootdir path for GekkoFS daemons.
+   -m, --mountdir          The mountdir path for GekkoFS daemons.
+   -d, --daemon_args
+   --proxy                 Start proxy after the daemons are running.
                            Add various additional daemon arguments, e.g., "-l ib0 -P ofi+psm2".
+   -p, --proxy_args
    -f, --foreground        Starts the script in the foreground. Daemons are stopped by pressing 'q'.
    --srun                  Use srun to start daemons on multiple nodes.
    -n, --numnodes          GekkoFS daemons are started on n nodes. Nodelist is extracted from Slurm via the SLURM_JOB_ID env variable.
    --cpuspertask <#cores>  Set the number of cores the daemons can use. Must use '--srun'.
-   --numactl               Use numactl for the daemon. Modify gkfs.conf for further numactl configurations.
    -c, --config            Path to configuration file. By default looks for a 'gkfs.conf' in this directory.
+   -e, --expand_hostfile   Path to the hostfile with the new nodes to which GekkoFS should be extended (one line per node).
    -v, --verbose           Increase verbosity
 ```
 
@@ -415,6 +417,58 @@ Press 'q' to exit
 
 Please consult `include/config.hpp` for additional configuration options. Note, GekkoFS proxy does not support replication.
 
+### File system expansion
+
+GekkoFS supports extending the current daemon configuration to additional compute nodes. This includes redistributing
+the existing data and metadata and therefore scales both the performance and the capacity of the file system. Note
+that it is the user's responsibility not to access the GekkoFS file system during redistribution. A corresponding
+feature that is transparent to the user is planned. Note also that if the GekkoFS proxy is used, the proxies need to
+be restarted manually after expansion.
+
+To enable this feature, the CMake flag `-DGKFS_BUILD_TOOLS=ON` is required to build the `gkfs_malleability` tool.
+The tool is then available in the `build/tools` directory. Please consult `-h` for its arguments. While the tool can
+be used manually to expand the file system, the `scripts/run/gkfs` script, which invokes `gkfs_malleability`
+internally, should be used instead.
+
+The only requirement for extending the file system is a hostfile containing the hostnames/IPs of the new nodes (one line
+per host). Example: starting the file system. The `DAEMON_NODELIST` variable in `gkfs.conf` is set to a hostfile
+containing the initial set of file system nodes:
+
+```bash
+~/gekkofs/scripts/run/gkfs -c ~/run/gkfs_verbs_expandtest.conf start
+* [gkfs] Starting GekkoFS daemons (4 nodes) ...
+* [gkfs] GekkoFS daemons running
+* [gkfs] Startup time: 10.853 seconds
+```
+
+... Some computation ...
+
+Expanding the file system: `-e ` specifies the hostfile with the new nodes. Redistribution is done automatically with a
+progress bar. When finished, the file system is ready to use in the new configuration:
+
+```bash
+~/gekkofs/scripts/run/gkfs -c ~/run/gkfs_verbs_expandtest.conf -e ~/hostfile_expand expand
+* [gkfs] Starting GekkoFS daemons (8 nodes) ...
+* [gkfs] GekkoFS daemons running
+* [gkfs] Startup time: 1.058 seconds
+Expansion process from 4 nodes to 12 nodes launched...
+* [gkfs] Expansion progress:
+[####################] 0/4 left
+* [gkfs] Redistribution process done. Finalizing ...
+* [gkfs] Expansion done.
+```
+
+Stop the file system:
+
+```bash
+~/gekkofs/scripts/run/gkfs -c ~/run/gkfs_verbs_expandtest.conf stop
+* [gkfs] Stopping daemon with pid 16462
+srun: sending Ctrl-C to StepId=282378.1
+* [gkfs] Stopping daemon with pid 16761
+srun: sending Ctrl-C to StepId=282378.2
+* [gkfs] Shutdown time: 1.032 seconds
+```
+
 ## Acknowledgment
 
 This software was partially supported by the EC H2020 funded NEXTGenIO project (Project ID: 671951, www.nextgenio.eu).
diff --git a/scripts/run/gkfs b/scripts/run/gkfs
index 8c11bbdb2..8582c025e 100755
--- a/scripts/run/gkfs
+++ b/scripts/run/gkfs
@@ -449,8 +449,6 @@ add_daemons() {
     NODE_CNT_EXPAND=$((${node_cnt_initial}+$(cat ${EXPAND_NODELIST} | wc -l)))
     # start new set of daemons
     start_daemons
-    # TODO REMOVE
-#    sed -i '0,/evie/! s/evie/evie2/' ${HOSTSFILE}
     export LIBGKFS_HOSTS_FILE=${HOSTSFILE}
     # start expansion which redistributes metadata and data
     ${GKFS_MALLEABILITY_BIN_} expand start
diff --git a/scripts/run/gkfs.conf b/scripts/run/gkfs.conf
index c3277229b..b71e85fd3 100644
--- a/scripts/run/gkfs.conf
+++ b/scripts/run/gkfs.conf
@@ -6,7 +6,10 @@ DAEMON_BIN=../../build/src/daemon/gkfs_daemon
 PROXY_BIN=../../build/src/proxy/gkfs_proxy
 
 # client configuration (needs to be set for all clients)
-LIBGKFS_HOSTS_FILE=/home/evie/workdir/gkfs_hosts.txt
+LIBGKFS_HOSTS_FILE=/home/XXX/workdir/gkfs_hosts.txt
+
+# tools (if built)
+GKFS_MALLEABILITY_BIN=../../build/tools/gkfs_malleability
 
 ## daemon configuration
 #DAEMON_ROOTDIR=/dev/shm/vef_gkfs_rootdir
diff --git a/src/daemon/malleability/malleable_manager.cpp b/src/daemon/malleability/malleable_manager.cpp
index c525956af..01290463a 100644
--- a/src/daemon/malleability/malleable_manager.cpp
+++ b/src/daemon/malleability/malleable_manager.cpp
@@ -51,6 +51,8 @@ namespace fs = std::filesystem;
 
 namespace gkfs::malleable {
 
+// TODO The following three functions are almost identical to the proxy code.
+// They should be moved to a common location and shared between the proxy and the daemon.
 vector>
 MalleableManager::load_hostfile(const std::string& path) {
 
@@ -198,7 +200,7 @@ int
 MalleableManager::redistribute_metadata() {
     uint64_t count = 0;
     auto estimate_db_size = GKFS_DATA->mdb()->db_size();
-    auto percent_interval = estimate_db_size / 1000;
+    auto percent_interval = estimate_db_size / 100;
     GKFS_DATA->spdlogger()->info(
             "{}() Starting metadata redistribution for '{}' estimated number of KV pairs...",
             __func__, estimate_db_size);
 
     string key, value;
     auto iter = static_cast(GKFS_DATA->mdb()->iterate_all());
+    // TODO parallelize
     for(iter->SeekToFirst(); iter->Valid(); iter->Next()) {
         key = iter->key().ToString();
         value = iter->value().ToString();
             continue;
         }
         auto dest_id = RPC_DATA->distributor()->locate_file_metadata(key, 0);
-        GKFS_DATA->spdlogger()->info(
+        GKFS_DATA->spdlogger()->trace(
                 "{}() Migration: key {} and value {}. From host {} to host {}",
                 __func__, key, value, RPC_DATA->local_host_id(), dest_id);
         if(dest_id == RPC_DATA->local_host_id()) {
-            GKFS_DATA->spdlogger()->info("{}() SKIPPERS", __func__);
+            GKFS_DATA->spdlogger()->trace("{}() SKIP", __func__);
             continue;
         }
         auto err = gkfs::malleable::rpc::forward_metadata(key, value, dest_id);
@@ -248,6 +251,7 @@ MalleableManager::redistribute_data() {
     auto chunk_dir = fs::path(GKFS_DATA->storage()->get_chunk_directory());
     auto dir_iterator = GKFS_DATA->storage()->get_all_chunk_files();
 
+    // TODO this can be parallelized, e.g., async chunk I/O
     for(const auto& entry : dir_iterator) {
         if(!entry.is_regular_file()) {
             continue;
-- GitLab
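Editor's note on the expansion workflow documented above: the `scripts/run/gkfs ... expand` command automates the calls to the `gkfs_malleability` tool shown in the `add_daemons()` hunk. The sketch below illustrates that sequence when driving the tool by hand, using the new `--machine-readable` flag for polling. The hosts-file path and the one-second poll interval are placeholders/assumptions; the subcommands (`expand start`, `expand status`, `expand finalize`), the `-m` flag, and the `LIBGKFS_HOSTS_FILE`/`GKFS_MALLEABILITY_BIN` variables are taken from the patches above. It is a minimal sketch, not the script's exact logic.

```bash
#!/usr/bin/env bash
# Sketch: manual expansion flow, roughly what `scripts/run/gkfs ... expand` automates.
# Assumes the daemons on the new nodes have already been started and have
# registered themselves in the shared hosts file.
export LIBGKFS_HOSTS_FILE=/path/to/gkfs_hosts.txt          # placeholder path
GKFS_MALLEABILITY_BIN=../../build/tools/gkfs_malleability  # as set in gkfs.conf

# Kick off redistribution of metadata and data on all daemons.
"${GKFS_MALLEABILITY_BIN}" expand start

# With -m/--machine-readable, `expand status` prints only the number of daemons
# that have not finished redistribution yet.
while true; do
    left=$("${GKFS_MALLEABILITY_BIN}" -m expand status)
    [[ "${left:-0}" -le 0 ]] && break
    sleep 1
done

# Finalize once no daemon reports outstanding redistribution work.
"${GKFS_MALLEABILITY_BIN}" expand finalize
```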
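One detail of the README example that is easy to misread: "Starting GekkoFS daemons (8 nodes)" refers only to the daemons launched on the new nodes listed in the `-e` hostfile, while the expansion message counts the resulting total (4 initial + 8 new = 12). The run script derives that total as in the following sketch; the variable names follow the `add_daemons()` hunk, and the concrete values are just the ones from the README example.

```bash
# Hypothetical values matching the README example above.
EXPAND_NODELIST=~/hostfile_expand   # one new node per line (8 lines here)
node_cnt_initial=4                  # daemons already running in the instance

# Same arithmetic as add_daemons(): initial daemons plus lines in the expand hostfile.
NODE_CNT_EXPAND=$((node_cnt_initial + $(wc -l < "${EXPAND_NODELIST}")))
echo "Expanding from ${node_cnt_initial} to ${NODE_CNT_EXPAND} daemons"
```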