From 64e3255828dbced086807b8fa9f3549f4da0db8b Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 06:56:31 +0200 Subject: [PATCH 01/21] small refactor --- src/request_manager.cpp | 67 ++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/src/request_manager.cpp b/src/request_manager.cpp index 7f27846..93760fc 100644 --- a/src/request_manager.cpp +++ b/src/request_manager.cpp @@ -8,6 +8,7 @@ * * Cargo is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * @@ -99,26 +100,70 @@ request_manager::lookup(std::uint64_t tid) { abt::shared_lock lock(m_mutex); - if(const auto it = m_requests.find(tid); it != m_requests.end()) { + auto it = m_requests.find(tid); + if (it == m_requests.end()) { + LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); + return tl::make_unexpected(error_code::no_such_transfer); + } - const auto& file_statuses = it->second; + const auto& all_file_statuses = it->second; + if (all_file_statuses.empty()) { + return request_status{"", transfer_state::completed, 0.0f, error_code::success}; + } - for(const auto& fs : file_statuses) { - for(const auto& ps : fs) { + bool any_running = false; + bool all_completed = true; + float total_bw = 0.0f; + int active_parts = 0; - if(ps.state() == transfer_state::completed) { - continue; - } + for (const auto& file_status_vec : all_file_statuses) { + if (file_status_vec.empty()) { + continue; // Should not happen with proper creation, but good to be safe. + } - return request_status{ps}; + // The logic for a "file" (a collection of parts handled by workers) is tricky. + // A file is only considered 'completed' if ALL its parts are completed. + // However, in the small-file case, some parts will be pending forever. 
+ // Let's refine: A file is 'completed' if at least one part is 'completed' and no parts are 'running' or 'failed'. + + bool this_file_has_completed_part = false; + bool this_file_is_active = false; // running or pending active work + + for (const auto& part_status : file_status_vec) { + if (part_status.state() == transfer_state::failed) { + // If any part of any file fails, the whole transfer fails immediately. + return request_status{part_status.name(), transfer_state::failed, part_status.bw(), part_status.error()}; + } + if (part_status.state() == transfer_state::running) { + any_running = true; + this_file_is_active = true; + } + if (part_status.state() == transfer_state::completed) { + this_file_has_completed_part = true; + } + if (part_status.bw() > 0) { // Consider only parts that are reporting bandwidth + total_bw += part_status.bw(); + active_parts++; } } - // TODO : completed should have the name of the file if its not found + + // A file is not yet complete if it's active (running) OR if no parts have completed yet. + if (this_file_is_active || !this_file_has_completed_part) { + all_completed = false; + } + } + + if (all_completed) { return request_status{"", transfer_state::completed, 0.0f, error_code::success}; } - LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); - return tl::make_unexpected(error_code::no_such_transfer); + if (any_running) { + float avg_bw = (active_parts > 0) ? (total_bw / active_parts) : 0.0f; + return request_status{"", transfer_state::running, avg_bw, std::nullopt}; + } + + // If nothing failed, not everything is complete, and nothing is running, it must be pending. 
+ return request_status{"", transfer_state::pending, 0.0f, std::nullopt}; } tl::expected, error_code> -- GitLab From 7842679ff4e5bc38888e86d7313dfa0948f74e63 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 07:02:31 +0200 Subject: [PATCH 02/21] do async directory rpc --- src/master.cpp | 21 +++++++++++++++++---- src/master.hpp | 8 +++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index 2b88b7e..b1748f8 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -61,11 +61,12 @@ make_message(std::uint64_t tid, std::uint32_t seqno, std::vector v_input; std::vector v_output; - // convert input to v_input - for(auto i : input) { + v_input.reserve(input.size()); + for(auto const& i : input) { v_input.push_back(i.path()); } - for(auto o : output) { + v_output.reserve(output.size()); + for(auto const& o : output) { v_output.push_back(o.path()); } if(iparallel) { @@ -511,6 +512,18 @@ void master_server::transfer_datasets(const network::request& req, const std::vector& sources, const std::vector& targets) { + // Offload the potentially blocking file operations to a handler thread pool + m_network_engine.get_handler_pool().make_thread( + [this, req, sources, targets]() mutable { + do_transfer_datasets(req, std::move(sources), std::move(targets)); + } + ); +} + +void +master_server::do_transfer_datasets(const network::request req, + std::vector sources, + std::vector targets) { using network::get_address; using network::rpc_info; using proto::generic_response; @@ -792,4 +805,4 @@ master_server::ftio_int(const network::request& req, float conf, float prob, req.respond(resp); } -} // namespace cargo +} // namespace cargo \ No newline at end of file diff --git a/src/master.hpp b/src/master.hpp index 6943121..862e4c1 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -93,6 +93,12 @@ private: ftio_int(const network::request& req, float confidence, float probability, float period, bool run, bool pause, bool resume); + // 
This function will contain the blocking logic, to be run in a separate thread. + void + do_transfer_datasets(const network::request req, + std::vector sources, + std::vector targets); + private: // Dedicated execution stream for the MPI listener ULT thallium::managed m_mpi_listener_ess; @@ -125,4 +131,4 @@ private: } // namespace cargo -#endif // CARGO_MASTER_HPP +#endif // CARGO_MASTER_HPP \ No newline at end of file -- GitLab From 84a72a2cab4d1a4c58372357325affcc2d2694c9 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 07:08:05 +0200 Subject: [PATCH 03/21] locking ftio --- src/master.cpp | 207 +++++++++++++++++-------------------------------- src/master.hpp | 31 ++++---- 2 files changed, 90 insertions(+), 148 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index b1748f8..60ff763 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -39,6 +39,7 @@ #include "parallel_request.hpp" #include #include +#include using namespace std::literals; namespace mpi = boost::mpi; @@ -135,6 +136,8 @@ master_server::master_server(std::string name, std::string address, // serve this purpose. The former is called before Mercury is finalized, // while the latter is called in between that and Argobots finalization. m_network_engine.push_prefinalize_callback([this]() { + m_shutting_down = true; + m_ftio_cv.notify_all(); // Wake up FTIO scheduler to exit m_mpi_listener_ult->join(); m_mpi_listener_ult = thallium::managed{}; m_mpi_listener_ess->join(); @@ -200,99 +203,69 @@ master_server::mpi_listener_ult() { void master_server::ftio_scheduling_ult() { + while (!m_shutting_down) { + abt::unique_lock lock(m_ftio_mutex); - while(!m_shutting_down) { - - if(!m_pending_transfer.m_work or !m_ftio_run) { - std::this_thread::sleep_for(1000ms); + if (m_period > 0) { + LOGGER_INFO("FTIO scheduler waiting for period of {} seconds.", m_period); + // Wait for the period to elapse OR to be shut down. 
+ m_ftio_cv.wait_for(lock, std::chrono::duration(m_period), [this] { return m_shutting_down.load(); }); + } else { + LOGGER_INFO("FTIO scheduler waiting for run trigger."); + // Wait for an explicit run trigger OR to be shut down. + m_ftio_cv.wait(lock, [this] { return m_ftio_run.load() || m_shutting_down.load(); }); } - // if(!m_pending_transfer.m_work or m_period < 0.0f) { - // std::this_thread::sleep_for(1000ms); - // } + if (m_shutting_down) break; - // Do something with the confidence and probability - - // if(m_ftio_run) { - // m_ftio_run = false; - // LOGGER_INFO("Confidence is {}, probability is {} and - // period is {}", - // m_confidence, m_probability, m_period); - // } - - if(!m_pending_transfer.m_work) - continue; - if(m_period > 0) { - LOGGER_INFO("Waiting period : {}", m_period); - } else { - LOGGER_DEBUG("Waiting for run trigger ..."); - } - // Wait in small periods, just in case we change it, This should be - // mutexed... - auto elapsed = m_period; - while(elapsed > 0) { - std::this_thread::sleep_for(std::chrono::seconds((int) (1))); - elapsed -= 1; - // reset elapsed value when new RPC comes in - if(m_ftio_run) { - elapsed = m_period; - m_ftio_run = false; - } - } - if(!m_ftio_run) { + if (!m_pending_transfer.m_work) { + m_ftio_run = false; // Consume trigger if there's no work continue; } - LOGGER_INFO("Checking if there is work to do in {}", - m_pending_transfer.m_sources); - m_pending_transfer.m_expanded_sources = {}; - m_pending_transfer.m_expanded_targets = {}; + LOGGER_INFO("FTIO triggered. Checking for work in {}", m_pending_transfer.m_sources); + m_pending_transfer.m_expanded_sources.clear(); + m_pending_transfer.m_expanded_targets.clear(); + // This logic is blocking, but it's in its own thread so it's fine. transfer_dataset_internal(m_pending_transfer); - // This launches the workers to do the work... 
- // We wait until this transfer is finished - LOGGER_INFO("Transferring {} files", - m_pending_transfer.m_expanded_sources.size()); - bool finished = false; - while(!finished) { - std::this_thread::sleep_for(1s); - m_request_manager.lookup(m_pending_transfer.m_p.tid()) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to lookup request: {}", ec); - }) + + if (!m_pending_transfer.m_expanded_sources.empty()) { + LOGGER_INFO("Transferring {} files.", m_pending_transfer.m_expanded_sources.size()); + bool finished = false; + while(!finished && !m_shutting_down) { + thallium::thread::self().sleep(m_network_engine, 1000); // Poll status every second + m_request_manager.lookup(m_pending_transfer.m_p.tid()) .map([&](auto&& rs) { - if(rs.state() == transfer_state::completed) { + if (rs.state() == transfer_state::completed || rs.state() == transfer_state::failed) { finished = true; + if (rs.state() == transfer_state::failed) { + LOGGER_ERROR("FTIO transfer {} failed with error: {}", m_pending_transfer.m_p.tid(), rs.error().value_or(error_code::other)); + } } }); - } + } - if(finished) { - // Delete all source files - LOGGER_INFO("Transfer finished {} files", - m_pending_transfer.m_expanded_sources); - if(m_pending_transfer.m_expanded_sources.size() > 0) { - - auto fs = FSPlugin::make_fs(static_cast( - m_pending_transfer.m_expanded_sources[0].get_type())); - for(auto& file : m_pending_transfer.m_expanded_sources) { - LOGGER_INFO("Deleting {}", file.path()); - // We need to use gekkofs to delete - fs->unlink(file.path()); + if(finished) { + LOGGER_INFO("Transfer finished for {} files.", m_pending_transfer.m_expanded_sources.size()); + if(!m_pending_transfer.m_expanded_sources.empty()) { + auto fs = FSPlugin::make_fs(static_cast(m_pending_transfer.m_expanded_sources[0].get_type())); + for(const auto& file : m_pending_transfer.m_expanded_sources) { + LOGGER_INFO("Deleting {}", file.path()); + fs->unlink(file.path()); + } } } } - if(m_period > 0) { - // always run whenever 
period is set - m_ftio_run = true; - } else { - m_ftio_run = false; + + if (m_period <= 0) { + m_ftio_run = false; // Consume the trigger if not periodic. } } - - LOGGER_INFO("Shutting down."); + LOGGER_INFO("FTIO scheduler ULT finished."); } + #define RPC_NAME() (__FUNCTION__) void @@ -514,8 +487,8 @@ master_server::transfer_datasets(const network::request& req, const std::vector& targets) { // Offload the potentially blocking file operations to a handler thread pool m_network_engine.get_handler_pool().make_thread( - [this, req, sources, targets]() mutable { - do_transfer_datasets(req, std::move(sources), std::move(targets)); + [this, req, s = sources, t = targets]() mutable { + do_transfer_datasets(req, std::move(s), std::move(t)); } ); } @@ -627,52 +600,29 @@ master_server::do_transfer_datasets(const network::request req, .map([&](auto&& r) { assert(v_s_new.size() == v_d_new.size()); if(m_ftio) { - // if(sources[0].get_type() == - // cargo::dataset::type::gekkofs) { - - // We have only one pendingTransfer for FTIO - // that can be updated, the issue is that we - // need the tid. 
+ abt::unique_lock lock(m_ftio_mutex); m_pending_transfer.m_p = r; m_pending_transfer.m_sources = sources; m_pending_transfer.m_targets = targets; m_pending_transfer.m_work = true; - LOGGER_INFO("Stored stage-out information"); - // } - } - // For all the transfers - for(std::size_t i = 0; i < v_d_new.size(); ++i) { - // const auto& s = v_s_new[i]; - const auto& d = v_d_new[i]; - - // Create the directory if it does not exist (only in - // parallel transfer) - if(!std::filesystem::path(d.path()) - .parent_path() - .empty() and - d.supports_parallel_transfer()) { - std::filesystem::create_directories( - std::filesystem::path(d.path()).parent_path()); - } - } - - // If we are not using ftio start transfer if we are on - // stage-out - if(!m_ftio) { - // If we are on stage-out - if(v_s_new.size() != 0) { - for(std::size_t rank = 1; rank <= r.nworkers(); - ++rank) { + m_ftio_tid.store(r.tid()); + LOGGER_INFO("Stored stage-out information for transfer {}", r.tid()); + } else { + if(!v_s_new.empty()) { + for(const auto& d_item : v_d_new) { + if(d_item.supports_parallel_transfer() && !std::filesystem::path(d_item.path()).parent_path().empty()) { + std::filesystem::create_directories(std::filesystem::path(d_item.path()).parent_path()); + } + } + for(std::size_t rank = 1; rank <= r.nworkers(); ++rank) { const auto [t, m] = make_message( r.tid(), 0, v_s_new, v_d_new, v_size_new); LOGGER_INFO("msg <= to: {} body: {}", rank, m); world.send(static_cast(rank), t, m); } } - } else { - m_ftio_tid = r.tid(); } - + LOGGER_INFO("rpc {:<} body: {{retval: {}, tid: {}}}", rpc, error_code::success, r.tid()); req.respond(response_with_id{rpc.id(), error_code::success, @@ -766,43 +716,30 @@ master_server::ftio_int(const network::request& req, float conf, float prob, using network::get_address; using network::rpc_info; using proto::generic_response; - mpi::communicator world; + const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - if(pause) { - // send shaping info - for(int 
rank = 1; rank < world.size(); ++rank) { - // Slowdown 1 second per block - const auto m = cargo::shaper_message{m_ftio_tid, +10}; - LOGGER_INFO("msg <= to: {} body: {}", rank, m); - world.send(static_cast(rank), - static_cast(tag::bw_shaping), m); - } - } else if(resume) { + LOGGER_INFO("rpc {:>} body: {{confidence: {}, probability: {}, period: {}, run: {}, pause: {}, resume: {}}}", rpc, conf, prob, period, run, pause, resume); + + if(pause || resume) { + mpi::communicator world; for(int rank = 1; rank < world.size(); ++rank) { - // Restart operation - const auto m = cargo::shaper_message{m_ftio_tid, -1}; + const auto m = cargo::shaper_message{m_ftio_tid.load(), static_cast(pause ? 10 : -1)}; LOGGER_INFO("msg <= to: {} body: {}", rank, m); - world.send(static_cast(rank), - static_cast(tag::bw_shaping), m); + world.send(static_cast(rank), static_cast(tag::bw_shaping), m); } } else { + abt::unique_lock lock(m_ftio_mutex); m_confidence = conf; m_probability = prob; m_period = period; - m_ftio_run = run; - if(m_period > 0) - m_ftio_run = true; m_ftio = true; + if (run || period > 0) { + m_ftio_run = true; + m_ftio_cv.notify_one(); + } } - LOGGER_INFO( - "rpc {:>} body: {{confidence: {}, probability: {}, period: {}, run: {}, pause: {}, resume: {}}}", - rpc, conf, prob, period, run, pause, resume); - const auto resp = generic_response{rpc.id(), error_code::success}; - - LOGGER_INFO("rpc {:<} body: {{retval: {}}}", rpc, resp.error_code()); - - req.respond(resp); + req.respond(generic_response{rpc.id(), error_code::success}); } } // namespace cargo \ No newline at end of file diff --git a/src/master.hpp b/src/master.hpp index 862e4c1..465017f 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -30,6 +30,9 @@ #include "request_manager.hpp" #include "parallel_request.hpp" #include "env.hpp" +#include "shared_mutex.hpp" +#include +#include namespace cargo { @@ -53,7 +56,8 @@ class master_server : public network::server, public network::provider { public: 
master_server(std::string name, std::string address, bool daemonize, - std::filesystem::path rundir, std::uint64_t block_size, std::string regex_file, + std::filesystem::path rundir, std::uint64_t block_size, + std::string regex_file, std::optional pidfile = {}); ~master_server(); @@ -76,6 +80,11 @@ private: const std::vector& sources, const std::vector& targets); + void + do_transfer_datasets(const network::request req, + std::vector sources, + std::vector targets); + void transfer_status(const network::request& req, std::uint64_t tid); @@ -93,12 +102,6 @@ private: ftio_int(const network::request& req, float confidence, float probability, float period, bool run, bool pause, bool resume); - // This function will contain the blocking logic, to be run in a separate thread. - void - do_transfer_datasets(const network::request req, - std::vector sources, - std::vector targets); - private: // Dedicated execution stream for the MPI listener ULT thallium::managed m_mpi_listener_ess; @@ -108,15 +111,17 @@ private: thallium::managed m_ftio_listener_ess; // ULT for the ftio scheduler thallium::managed m_ftio_listener_ult; - // FTIO decision values (below 0, implies not used) + + // FTIO decision values and state, protected by a mutex + mutable abt::shared_mutex m_ftio_mutex; + std::condition_variable_any m_ftio_cv; float m_confidence = -1.0f; float m_probability = -1.0f; float m_period = -1.0f; - bool m_ftio_run = true; - // We store the tid of the ftio transfer to proper slow it down. - std::uint64_t m_ftio_tid = 0; - // FTIO enabled flag, we need to call ftio once. 
- bool m_ftio = false; + std::atomic m_ftio_run = {false}; + std::atomic m_ftio_tid = {0}; + std::atomic m_ftio = {false}; + ssize_t m_block_size = 0; pending_transfer m_pending_transfer; -- GitLab From 0aaf5edcb9868b5198a89f5062afe6dbc24d7679 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 07:19:19 +0200 Subject: [PATCH 04/21] removed sequential, refactor worker --- src/CMakeLists.txt | 2 - src/worker/mpio_read.cpp | 232 +++++++++++----------------------- src/worker/mpio_read.hpp | 26 ++-- src/worker/mpio_write.cpp | 257 ++++++++++++-------------------------- src/worker/mpio_write.hpp | 42 ++----- src/worker/ops.cpp | 23 ++-- src/worker/ops.hpp | 81 ++++-------- src/worker/seq_mixed.cpp | 197 +++++++++-------------------- src/worker/seq_mixed.hpp | 43 ++----- src/worker/sequential.cpp | 252 ------------------------------------- src/worker/sequential.hpp | 93 -------------- src/worker/worker.cpp | 213 ++++++++++--------------------- src/worker/worker.hpp | 7 +- 13 files changed, 353 insertions(+), 1115 deletions(-) delete mode 100644 src/worker/sequential.cpp delete mode 100644 src/worker/sequential.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9bc9ce8..7021a84 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,8 +41,6 @@ target_sources( worker/mpio_write.cpp worker/ops.cpp worker/ops.hpp - worker/sequential.cpp - worker/sequential.hpp worker/seq_mixed.cpp worker/seq_mixed.hpp worker/worker.cpp diff --git a/src/worker/mpio_read.cpp b/src/worker/mpio_read.cpp index ee9c572..bd8e8f6 100644 --- a/src/worker/mpio_read.cpp +++ b/src/worker/mpio_read.cpp @@ -27,6 +27,9 @@ #include "mpioxx.hpp" #include "memory.hpp" #include +#include + +using namespace std::chrono_literals; namespace cargo { @@ -36,217 +39,134 @@ mpio_read::mpio_read(mpi::communicator workers, std::uint64_t block_size, FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) : m_workers(std::move(workers)), 
m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), m_kb_size(std::move(block_size)), + m_output_path(std::move(output_path)), m_kb_size(block_size), m_fs_i_type(fs_i_type), m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} cargo::error_code -mpio_read::operator()() { - +mpio_read::setup() { using posix_file::views::all_of; using posix_file::views::as_blocks; using posix_file::views::strided; m_status = error_code::transfer_in_progress; try { - + // MPI communicators are cheap to copy const auto input_file = mpioxx::file::open( m_workers, m_input_path, mpioxx::file_open_mode::rdonly); - mpioxx::offset file_size = m_file_size; std::size_t block_size = m_kb_size * 1024u; - // create block type + MPI_Datatype block_type; - MPI_Type_contiguous(static_cast(block_size), MPI_BYTE, - &block_type); + MPI_Type_contiguous(static_cast(block_size), MPI_BYTE, &block_type); MPI_Type_commit(&block_type); - // compute the number of blocks in the file - int total_blocks = static_cast(file_size / block_size); - - if(file_size % block_size != 0) { + int total_blocks = static_cast(m_file_size / block_size); + if(m_file_size % block_size != 0) { ++total_blocks; } - auto workers_size = m_workers.size(); - auto workers_rank = m_workers.rank(); - - if(m_single) { - workers_size = 1; - workers_rank = 0; - } - // create file type + m_workers_size = m_single ? 1 : m_workers.size(); + m_workers_rank = m_single ? 
0 : m_workers.rank(); + MPI_Datatype file_type; - /* - * count: number of blocks in the type - * blocklen: number of elements in each block - * stride: number of elements between start of each block - */ - MPI_Type_vector(/* count: */ total_blocks, /* blocklength: */ 1, - /* stride: */ workers_size, /* oldtype: */ block_type, - &file_type); + MPI_Type_vector(total_blocks, 1, m_workers_size, block_type, &file_type); MPI_Type_commit(&file_type); - MPI_Offset disp = workers_rank * block_size; - MPI_Datatype etype = block_type; - MPI_Datatype filetype = file_type; - - if(const auto ec = MPI_File_set_view(input_file, disp, etype, filetype, - "native", MPI_INFO_NULL); - ec != MPI_SUCCESS) { - LOGGER_ERROR("MPI_File_set_view() failed: {}", - mpi::error_string(ec)); - return make_mpi_error(ec); + MPI_Offset disp = m_workers_rank * block_size; + if(const auto ec = MPI_File_set_view(input_file, disp, block_type, file_type, "native", MPI_INFO_NULL); ec != MPI_SUCCESS) { + LOGGER_ERROR("MPI_File_set_view() failed: {}", mpi::error_string(ec)); + MPI_Type_free(&block_type); + MPI_Type_free(&file_type); + return (m_status = make_mpi_error(ec)); } - // find how many blocks this rank is responsible for - std::size_t blocks_per_rank = total_blocks / workers_size; - - if(int64_t n = total_blocks % workers_size; - n != 0 && workers_rank < n) { + std::size_t blocks_per_rank = total_blocks / m_workers_size; + if(int64_t n = total_blocks % m_workers_size; n != 0 && m_workers_rank < n) { ++blocks_per_rank; } - // step 1. acquire buffers - m_buffer.resize(blocks_per_rank * block_size); - m_buffer_regions.reserve(blocks_per_rank); - for(std::size_t i = 0; i < blocks_per_rank; ++i) { - m_buffer_regions.emplace_back(m_buffer.data() + i * block_size, - block_size); + m_buffer_regions.emplace_back(m_buffer.data() + i * block_size, block_size); } - MPI_Datatype datatype = block_type; - - // step2. 
parallel read data into buffers - if(const auto ec = MPI_File_read_all(input_file, m_buffer.data(), - static_cast(blocks_per_rank), - datatype, MPI_STATUS_IGNORE); - ec != MPI_SUCCESS) { - LOGGER_ERROR("MPI_File_read_all() failed: {}", - mpi::error_string(ec)); - return make_mpi_error(ec); + if(const auto ec = MPI_File_read_all(input_file, m_buffer.data(), static_cast(blocks_per_rank), block_type, MPI_STATUS_IGNORE); ec != MPI_SUCCESS) { + LOGGER_ERROR("MPI_File_read_all() failed: {}", mpi::error_string(ec)); + MPI_Type_free(&block_type); + MPI_Type_free(&file_type); + return (m_status = make_mpi_error(ec)); } + + MPI_Type_free(&block_type); + MPI_Type_free(&file_type); - // step3. POSIX write data - // We need to create the directory if it does not exists (using - // FSPlugin) m_output_file = std::make_unique(posix_file::create( m_output_path, O_WRONLY, S_IRUSR | S_IWUSR, m_fs_o_type)); + m_output_file->fallocate(0, 0, m_file_size); - m_output_file->fallocate(0, 0, file_size); - - - m_workers_size = workers_size; - m_workers_rank = workers_rank; m_block_size = block_size; + + auto file_view = posix_file::file{m_input_path, m_fs_i_type}; + for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { + m_file_ranges.push_back(range); + } } catch(const mpioxx::io_error& e) { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_mpi_error(e.error_code()); - return make_mpi_error(e.error_code()); + return (m_status = make_mpi_error(e.error_code())); } catch(const posix_file::io_error& e) { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return make_system_error(e.error_code()); - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return make_system_error(e.code().value()); + return (m_status = make_system_error(e.error_code())); } catch(const 
std::exception& e) { - LOGGER_ERROR("operator ()() Unexpected exception: {}", e.what()); - m_status = error_code::other; - return error_code::other; + LOGGER_ERROR("setup() Unexpected exception: {}", e.what()); + return (m_status = error_code::other); } - m_status = error_code::transfer_in_progress; - return error_code::transfer_in_progress; + return (m_status = error_code::success); } -int -mpio_read::progress(int ongoing_index) { +operation::progress_status +mpio_read::progress() { + if (m_current_block_index >= m_file_ranges.size()) { + m_output_file->close(); + m_status = error_code::success; + return progress_status::Done; + } - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; try { - int index = 0; - // TODO : FS not defined... - m_status = error_code::transfer_in_progress; - for(const auto& file_range : - all_of(posix_file::file{m_input_path, m_fs_i_type}) | - as_blocks(m_block_size) | - strided(m_workers_size, m_workers_rank)) { - if(index < ongoing_index) { - ++index; - continue; - } else { - if(index > ongoing_index) { - return index; - } - } - // LOG indexes and sizes - - assert(m_buffer_regions[index].size() >= file_range.size()); - - auto start = std::chrono::steady_clock::now(); - m_output_file->pwrite(m_buffer_regions[index], file_range.offset(), - file_range.size()); - // Do sleep - auto total_sleep = sleep_value(); - auto small_sleep = total_sleep / 100; - if(small_sleep == std::chrono::milliseconds(0)) - small_sleep = std::chrono::milliseconds(1); - while(total_sleep > std::chrono::milliseconds(0)) { - std::this_thread::sleep_for(small_sleep); - total_sleep -= small_sleep; - if(total_sleep > sleep_value()) { - break; - } - } - auto end = std::chrono::steady_clock::now(); - // Send transfer bw - double elapsed_seconds = - std::chrono::duration_cast>( - end - start) - .count(); - if((elapsed_seconds) > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / (elapsed_seconds)); - LOGGER_DEBUG( - "BW 
(write) Update: {} / {} = {} mb/s [ Sleep {} ]", - m_block_size / 1024.0, elapsed_seconds, bw(), - sleep_value()); - } - - ++index; + const auto& file_range = m_file_ranges[m_current_block_index]; + const auto& buffer_region = m_buffer_regions[m_current_block_index]; + + assert(buffer_region.size() >= file_range.size()); + + auto start = std::chrono::steady_clock::now(); + m_output_file->pwrite(buffer_region, file_range.offset(), file_range.size()); + + auto sleep_duration = sleep_value(); + if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); + + auto end = std::chrono::steady_clock::now(); + + double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); + if (elapsed_seconds > 0) { + bw((m_block_size / (1024.0 * 1024.0)) / elapsed_seconds); + LOGGER_DEBUG("BW (write) Update: {} / {} = {} MB/s [ Sleep {}ms ]", + m_block_size / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); } - } catch(const mpioxx::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_mpi_error(e.error_code()); - return -1; + + m_current_block_index++; + } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); + LOGGER_ERROR("{}() failed in progress: {}", e.where(), e.what()); m_status = make_system_error(e.error_code()); - return -1; - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return -1; + return progress_status::Failed; } catch(const std::exception& e) { - LOGGER_ERROR("Progress: Unexpected exception: {}", e.what()); + LOGGER_ERROR("progress() Unexpected exception: {}", e.what()); m_status = error_code::other; - return -1; + return progress_status::Failed; } - - m_status = error_code::success; - m_output_file->close(); - return -1; -} - -// This needs to be go through different phases... 
-cargo::error_code -mpio_read::progress() const { - return m_status; + + return progress_status::InProgress; } } // namespace cargo \ No newline at end of file diff --git a/src/worker/mpio_read.hpp b/src/worker/mpio_read.hpp index b52d6cd..4ade702 100644 --- a/src/worker/mpio_read.hpp +++ b/src/worker/mpio_read.hpp @@ -41,28 +41,14 @@ public: std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, FSPlugin::type m_fs_o_type, std::size_t size, bool single); - cargo::error_code - operator()() final; + cargo::error_code setup() final; + progress_status progress() final; - cargo::error_code - progress() const final; - - int - progress(int ongoing_index) final; - - std::string - output_path() const { - return m_output_path; - } - - std::string - input_path() const { - return m_input_path; - } + std::string output_path() const { return m_output_path.string(); } + std::string input_path() const { return m_input_path.string(); } private: mpi::communicator m_workers; - cargo::error_code m_status; std::filesystem::path m_input_path{}; std::filesystem::path m_output_path{}; @@ -72,6 +58,8 @@ private: std::size_t m_block_size; memory_buffer m_buffer; std::vector m_buffer_regions; + std::vector m_file_ranges; + size_t m_current_block_index = 0; std::uint64_t m_kb_size; FSPlugin::type m_fs_i_type; FSPlugin::type m_fs_o_type; @@ -81,4 +69,4 @@ private: } // namespace cargo -#endif // CARGO_WORKER_MPIO_READ_HPP +#endif // CARGO_WORKER_MPIO_READ_HPP \ No newline at end of file diff --git a/src/worker/mpio_write.cpp b/src/worker/mpio_write.cpp index 7e2c051..756fb76 100644 --- a/src/worker/mpio_write.cpp +++ b/src/worker/mpio_write.cpp @@ -27,228 +27,133 @@ #include "mpioxx.hpp" #include +#include + +using namespace std::chrono_literals; + namespace cargo { -cargo::error_code -mpio_write::operator()() { +mpio_write::mpio_write(mpi::communicator workers, std::filesystem::path input_path, + std::filesystem::path output_path, std::uint64_t block_size, + 
FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) + : m_workers(std::move(workers)), m_input_path(std::move(input_path)), + m_output_path(std::move(output_path)), + m_kb_size(block_size), m_fs_i_type(fs_i_type), + m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} +cargo::error_code +mpio_write::setup() { using posix_file::views::all_of; using posix_file::views::as_blocks; using posix_file::views::strided; m_status = error_code::transfer_in_progress; try { - - auto workers_size = m_workers.size(); - auto workers_rank = m_workers.rank(); - - if (m_single) { - workers_size = 1; - workers_rank = 0; - } + m_workers_size = m_single ? 1 : m_workers.size(); + m_workers_rank = m_single ? 0 : m_workers.rank(); - std::size_t block_size = m_kb_size * 1024u; - // We need to open the file and ask size (using fs_plugin) - m_input_file = std::make_unique( - posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type)); - - std::size_t file_size = m_file_size; - - // compute the number of blocks in the file - int total_blocks = static_cast(file_size / block_size); + m_block_size = m_kb_size * 1024u; + + auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); - if(file_size % block_size != 0) { - ++total_blocks; + m_total_blocks = static_cast(m_file_size / m_block_size); + if(m_file_size % m_block_size != 0) { + ++m_total_blocks; } - // find how many blocks this rank is responsible for - std::size_t blocks_per_rank = total_blocks / workers_size; - - if(int64_t n = total_blocks % workers_size; - n != 0 && workers_rank < n) { + std::size_t blocks_per_rank = m_total_blocks / m_workers_size; + if(int64_t n = m_total_blocks % m_workers_size; n != 0 && m_workers_rank < n) { ++blocks_per_rank; } - // step 1. 
acquire buffers - - m_buffer.resize(blocks_per_rank * block_size); - m_buffer_regions.reserve(blocks_per_rank); - + m_buffer.resize(blocks_per_rank * m_block_size); + std::vector buffer_regions; + buffer_regions.reserve(blocks_per_rank); for(std::size_t i = 0; i < blocks_per_rank; ++i) { - m_buffer_regions.emplace_back(m_buffer.data() + i * block_size, - block_size); + buffer_regions.emplace_back(m_buffer.data() + i * m_block_size, m_block_size); } - - m_workers_size = workers_size; - m_workers_rank = workers_rank; - m_block_size = block_size; - m_file_size = file_size; - m_total_blocks = total_blocks; - - } catch(const mpioxx::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_mpi_error(e.error_code()); - return make_mpi_error(e.error_code()); - } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return make_system_error(e.error_code()); - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return make_system_error(e.code().value()); - } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); - m_status = error_code::other; - return error_code::other; - } - - return error_code::transfer_in_progress; -} -cargo::error_code -mpio_write::progress() const { - return m_status; -} - -int -mpio_write::progress(int ongoing_index) { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - - // compute the number of blocks in the file - - int index = 0; - if(ongoing_index == 0) { m_bytes_per_rank = 0; - } - try { + int index = 0; for(const auto& file_range : - all_of(*m_input_file) | as_blocks(m_block_size) | - strided(m_workers_size, m_workers_rank)) { - - if(index < ongoing_index) { - ++index; - continue; - } else { - if(index > ongoing_index) { - return index; - } 
- } - m_status = error_code::transfer_in_progress; - assert(m_buffer_regions[index].size() >= file_range.size()); + all_of(input_file) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { + + assert((unsigned)index < buffer_regions.size()); + auto& buffer_region = buffer_regions[index]; + assert(buffer_region.size() >= file_range.size()); + auto start = std::chrono::steady_clock::now(); - const std::size_t n = - m_input_file->pread(m_buffer_regions[index], - file_range.offset(), file_range.size()); - - LOGGER_DEBUG("Buffer contents: [\"{}\" ... \"{}\"]", - fmt::join(buffer_regions[index].begin(), - buffer_regions[index].begin() + 10, ""), - fmt::join(buffer_regions[index].end() - 10, - buffer_regions[index].end(), "")); - - + const std::size_t n = input_file.pread(buffer_region, file_range.offset(), file_range.size()); m_bytes_per_rank += n; - // Do sleep (But be a bit reactive...) - auto total_sleep = sleep_value(); - auto small_sleep = total_sleep / 100; - if (small_sleep == std::chrono::milliseconds(0)) small_sleep = std::chrono::milliseconds(1); - while( total_sleep > std::chrono::milliseconds(0)) { - std::this_thread::sleep_for(small_sleep); - total_sleep -= small_sleep; - if (total_sleep > sleep_value()) { - break; - } - } + + auto sleep_duration = sleep_value(); + if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); auto end = std::chrono::steady_clock::now(); - // Send transfer bw - double elapsed_seconds = - std::chrono::duration_cast>( - end - start) - .count(); - if((elapsed_seconds) > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / (elapsed_seconds)); - LOGGER_DEBUG("BW (read) Update: {} / {} = {} mb/s [ Sleep {} ]", - m_block_size / 1024.0, elapsed_seconds, bw(), - sleep_value()); + double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); + if (elapsed_seconds > 0) { + bw((m_block_size / (1024.0 * 1024.0)) / elapsed_seconds); + LOGGER_DEBUG("BW (read) Update: {} / {} = {} MB/s [ Sleep {}ms ]", + 
m_block_size / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); } - - ++index; + index++; } - // step 2. write buffer data in parallel to the PFS - //LOGGER_INFO("START WRITING file {}", m_output_path); - const auto output_file = - mpioxx::file::open(m_workers, m_output_path, - mpioxx::file_open_mode::create | - mpioxx::file_open_mode::wronly); + } catch(const posix_file::io_error& e) { + LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); + return (m_status = make_system_error(e.error_code())); + } catch(const std::exception& e) { + LOGGER_ERROR("setup() Unexpected exception: {}", e.what()); + return (m_status = error_code::other); + } + + return (m_status = error_code::success); +} + +operation::progress_status +mpio_write::progress() { + try { + const auto output_file = mpioxx::file::open( + m_workers, m_output_path, mpioxx::file_open_mode::create | mpioxx::file_open_mode::wronly); - // create block type MPI_Datatype block_type; - MPI_Type_contiguous(static_cast(m_block_size), MPI_BYTE, - &block_type); + MPI_Type_contiguous(static_cast(m_block_size), MPI_BYTE, &block_type); MPI_Type_commit(&block_type); - // create file type MPI_Datatype file_type; - - /* - * count: number of blocks in the type - * blocklen: number of `oldtype` elements in each block - * stride: number of `oldtype` elements between start of each block - */ - MPI_Type_vector(/* count: */ m_total_blocks, /* blocklength: */ 1, - /* stride: */ m_workers_size, /* oldtype: */ block_type, - &file_type); + MPI_Type_vector(m_total_blocks, 1, m_workers_size, block_type, &file_type); MPI_Type_commit(&file_type); - if(const auto ec = - MPI_File_set_view(output_file, - /* disp: */ m_workers_rank * m_block_size, - /* elementary_type: */ block_type, - file_type, "native", MPI_INFO_NULL); - ec != MPI_SUCCESS) { - LOGGER_ERROR("MPI_File_set_view() failed: {}", - mpi::error_string(ec)); + if(const auto ec = MPI_File_set_view(output_file, m_workers_rank * m_block_size, block_type, file_type, "native", 
MPI_INFO_NULL); ec != MPI_SUCCESS) { + LOGGER_ERROR("MPI_File_set_view() failed: {}", mpi::error_string(ec)); m_status = make_mpi_error(ec); - return -1; + MPI_Type_free(&block_type); + MPI_Type_free(&file_type); + return progress_status::Failed; } - // step 3. parallel write data from buffers - if(const auto ec = - MPI_File_write_all(output_file, m_buffer.data(), - static_cast(m_bytes_per_rank), - MPI_BYTE, MPI_STATUS_IGNORE); - ec != MPI_SUCCESS) { - LOGGER_ERROR("MPI_File_write_all() failed: {}", - mpi::error_string(ec)); + if(const auto ec = MPI_File_write_all(output_file, m_buffer.data(), static_cast(m_bytes_per_rank), MPI_BYTE, MPI_STATUS_IGNORE); ec != MPI_SUCCESS) { + LOGGER_ERROR("MPI_File_write_all() failed: {}", mpi::error_string(ec)); m_status = make_mpi_error(ec); - return -1; + MPI_Type_free(&block_type); + MPI_Type_free(&file_type); + return progress_status::Failed; } + + MPI_Type_free(&block_type); + MPI_Type_free(&file_type); + } catch(const mpioxx::io_error& e) { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); m_status = make_mpi_error(e.error_code()); - return -1; - } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return -1; - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return -1; + return progress_status::Failed; } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); + LOGGER_ERROR("progress() Unexpected exception: {}", e.what()); m_status = error_code::other; - return -1; + return progress_status::Failed; } - //LOGGER_INFO("END WRITING file {}", m_output_path); m_status = error_code::success; - - return -1; + return progress_status::Done; } -} // namespace cargo +} // namespace cargo \ No newline at end of file diff --git a/src/worker/mpio_write.hpp b/src/worker/mpio_write.hpp index 514afdd..52ace79 100644 
--- a/src/worker/mpio_write.hpp +++ b/src/worker/mpio_write.hpp @@ -25,62 +25,38 @@ #ifndef CARGO_WORKER_MPIO_WRITE_HPP #define CARGO_WORKER_MPIO_WRITE_HPP -#include -#include #include "ops.hpp" #include "memory.hpp" +#include +#include namespace mpi = boost::mpi; namespace cargo { class mpio_write : public operation { - public: mpio_write(mpi::communicator workers, std::filesystem::path input_path, std::filesystem::path output_path, std::uint64_t block_size, - FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) - : m_workers(std::move(workers)), m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), - m_kb_size(std::move(block_size)), m_fs_i_type(fs_i_type), - m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} - - cargo::error_code - operator()() final; + FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single); - cargo::error_code - progress() const final; - - int - progress(int ongoing_index) final; - - std::string - output_path() const { - return m_output_path; - } - - std::string - input_path() const { - return m_input_path; - } + cargo::error_code setup() final; + progress_status progress() final; + std::string output_path() const { return m_output_path.string(); } + std::string input_path() const { return m_input_path.string(); } private: mpi::communicator m_workers; - cargo::error_code m_status; std::filesystem::path m_input_path{}; std::filesystem::path m_output_path{}; - std::unique_ptr m_input_file; int m_workers_size; int m_workers_rank; std::size_t m_block_size; - int m_total_blocks; - memory_buffer m_buffer; - std::vector m_buffer_regions; - std::size_t m_bytes_per_rank; + std::size_t m_bytes_per_rank = 0; std::uint64_t m_kb_size; FSPlugin::type m_fs_i_type; FSPlugin::type m_fs_o_type; @@ -90,4 +66,4 @@ private: } // namespace cargo -#endif // CARGO_WORKER_MPIO_WRITE_HPP +#endif // CARGO_WORKER_MPIO_WRITE_HPP \ No newline at end of file diff --git 
a/src/worker/ops.cpp b/src/worker/ops.cpp index 124ab96..17ad2e2 100644 --- a/src/worker/ops.cpp +++ b/src/worker/ops.cpp @@ -25,7 +25,6 @@ #include "ops.hpp" #include "mpio_read.hpp" #include "mpio_write.hpp" -#include "sequential.hpp" #include "seq_mixed.hpp" namespace mpi = boost::mpi; @@ -49,10 +48,7 @@ operation::make_operation(cargo::tag t, mpi::communicator workers, return std::make_unique( std::move(workers), std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single); - case tag::sequential: - return std::make_unique( - std::move(workers), std::move(input_path), - std::move(output_path), block_size, fs_i_type, fs_o_type, size, single); + case tag::sequential: // Fallthrough to seq_mixed case tag::seq_mixed: return std::make_unique( std::move(workers), std::move(input_path), @@ -73,28 +69,29 @@ operation::sleep_value() const { void operation::set_bw_shaping(std::int16_t incr) { m_sleep_value += incr; + if (m_sleep_value < 0) m_sleep_value = 0; } int -operation::source() { +operation::source() const { return m_rank; } std::uint64_t -operation::tid() { +operation::tid() const { return m_tid; } std::uint32_t -operation::seqno() { +operation::seqno() const { return m_seqno; } cargo::tag -operation::t() { +operation::t() const { return m_t; } float_t -operation::bw() { +operation::bw() const { return m_bw; } @@ -112,8 +109,8 @@ operation::set_comm(int rank, std::uint64_t tid, std::uint32_t seqno, } cargo::error_code -operation::progress() const { - return error_code::success; +operation::status() const { + return m_status; } -} // namespace cargo +} // namespace cargo \ No newline at end of file diff --git a/src/worker/ops.hpp b/src/worker/ops.hpp index 79e0e35..cfe5d06 100644 --- a/src/worker/ops.hpp +++ b/src/worker/ops.hpp @@ -33,26 +33,10 @@ #include "posix_file/file.hpp" namespace cargo { -/** - * Interface for transfer operations - */ class operation { - public: -/** - * @brief - * - * @param t - * @param workers - * 
@param input_path - * @param output_path - * @param block_size - * @param fs_i_type - * @param fs_o_type - * @param size size of the file gathered in the master to reduce operations - * @param single The file is only processed in this rank - * @return std::unique_ptr - */ + enum class progress_status { InProgress, Done, Failed }; + static std::unique_ptr make_operation(cargo::tag t, boost::mpi::communicator workers, std::filesystem::path input_path, @@ -61,53 +45,36 @@ public: virtual ~operation() = default; - virtual cargo::error_code - operator()() = 0; - - - std::chrono::milliseconds - sleep_value() const; - // We pass a - or + value to decrease or increase the bw shaping. - void - set_bw_shaping(std::int16_t incr); - virtual cargo::error_code - progress() const = 0; - virtual int - progress(int index) = 0; - + virtual cargo::error_code setup() = 0; + virtual progress_status progress() = 0; - int - source(); - std::uint64_t - tid(); - std::uint32_t - seqno(); - void - set_comm(int rank, std::uint64_t tid, std::uint32_t seqno, cargo::tag t); - cargo::tag - t(); + std::chrono::milliseconds sleep_value() const; + void set_bw_shaping(std::int16_t incr); - float_t - bw(); - void - bw(float_t bw); + int source() const; + std::uint64_t tid() const; + std::uint32_t seqno() const; + void set_comm(int rank, std::uint64_t tid, std::uint32_t seqno, cargo::tag t); + cargo::tag t() const; - virtual std::string - output_path() const = 0; + float_t bw() const; + void bw(float_t bw); - virtual std::string - input_path() const = 0; + cargo::error_code status() const; + virtual std::string output_path() const = 0; + virtual std::string input_path() const = 0; -private: +protected: std::int16_t m_sleep_value = 0; - int m_rank; - std::uint64_t m_tid; - std::uint32_t m_seqno; - cargo::tag m_t; - float m_bw; + int m_rank = 0; + std::uint64_t m_tid = 0; + std::uint32_t m_seqno = 0; + cargo::tag m_t = cargo::tag::sequential; + float m_bw = 0.0f; + cargo::error_code m_status = 
{error_code::success}; }; } // namespace cargo -#endif // CARGO_WORKER_OPS_HPP +#endif // CARGO_WORKER_OPS_HPP \ No newline at end of file diff --git a/src/worker/seq_mixed.cpp b/src/worker/seq_mixed.cpp index 7663759..5998195 100644 --- a/src/worker/seq_mixed.cpp +++ b/src/worker/seq_mixed.cpp @@ -25,168 +25,95 @@ #include #include "seq_mixed.hpp" #include +#include + +using namespace std::chrono_literals; namespace cargo { +seq_mixed_operation::seq_mixed_operation(mpi::communicator workers, + std::filesystem::path input_path, + std::filesystem::path output_path, + std::uint64_t block_size, FSPlugin::type fs_i_type, + FSPlugin::type fs_o_type, std::size_t size, bool single) + : m_workers(std::move(workers)), m_input_path(std::move(input_path)), + m_output_path(std::move(output_path)), + m_kb_size(block_size), m_fs_i_type(fs_i_type), + m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} + cargo::error_code -seq_mixed_operation::operator()() { +seq_mixed_operation::setup() { using posix_file::views::all_of; using posix_file::views::as_blocks; using posix_file::views::strided; m_status = error_code::transfer_in_progress; try { - - auto workers_size = m_workers.size(); - auto workers_rank = m_workers.rank(); - - if (m_single) { - workers_size = 1; - workers_rank = 0; - } - std::size_t block_size = m_kb_size * 1024u; + m_workers_size = m_single ? 1 : m_workers.size(); + m_workers_rank = m_single ? 
0 : m_workers.rank(); + + m_block_size = m_kb_size * 1024u; + m_buffer.resize(m_block_size); - m_input_file = std::make_unique( - posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type)); - std::size_t file_size = m_file_size; + m_input_file = std::make_unique(posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type)); + m_output_file = std::make_unique(posix_file::create(m_output_path, O_WRONLY, S_IRUSR | S_IWUSR, m_fs_o_type)); + m_output_file->fallocate(0, 0, m_file_size); - // compute the number of blocks in the file - int total_blocks = static_cast(file_size / block_size); - - if(file_size % block_size != 0) { - ++total_blocks; + auto file_view = posix_file::file{m_input_path, m_fs_i_type}; + for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { + m_file_ranges.push_back(range); } - // find how many blocks this rank is responsible for - std::size_t blocks_per_rank = total_blocks / workers_size; - - if(int64_t n = total_blocks % workers_size; - n != 0 && workers_rank < n) { - ++blocks_per_rank; - } - - // step 1. 
acquire buffers - - m_buffer.resize(blocks_per_rank * block_size); - m_buffer_regions.reserve(blocks_per_rank); - - for(std::size_t i = 0; i < blocks_per_rank; ++i) { - m_buffer_regions.emplace_back(m_buffer.data() + i * block_size, - block_size); - } - - m_output_file = std::make_unique(posix_file::create( - m_output_path, O_WRONLY, S_IRUSR | S_IWUSR, m_fs_o_type)); - - m_output_file->fallocate(0, 0, file_size); - - m_workers_size = workers_size; - m_workers_rank = workers_rank; - m_block_size = block_size; - m_file_size = file_size; - m_total_blocks = total_blocks; - } catch(const posix_file::io_error& e) { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return make_system_error(e.error_code()); - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return make_system_error(e.code().value()); + return (m_status = make_system_error(e.error_code())); } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); - m_status = error_code::other; - return error_code::other; + LOGGER_ERROR("setup() Unexpected exception: {}", e.what()); + return (m_status = error_code::other); } - m_status = error_code::transfer_in_progress; - return error_code::transfer_in_progress; + return (m_status = error_code::success); } -cargo::error_code -seq_mixed_operation::progress() const { - return m_status; -} - -int -seq_mixed_operation::progress(int ongoing_index) { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - - // compute the number of blocks in the file - - int index = 0; - - if(ongoing_index == 0) { - m_bytes_per_rank = 0; +operation::progress_status +seq_mixed_operation::progress() { + if (m_current_block_index >= m_file_ranges.size()) { + m_input_file->close(); + m_output_file->close(); + m_status = error_code::success; + return 
progress_status::Done; } - try { - for(const auto& file_range : - all_of(*m_input_file) | as_blocks(m_block_size) | - strided(m_workers_size, m_workers_rank)) { - if(index < ongoing_index) { - ++index; - continue; - } else { - if(index > ongoing_index) { - return index; - } - } - m_status = error_code::transfer_in_progress; - assert(m_buffer_regions[index].size() >= file_range.size()); - auto start = std::chrono::steady_clock::now(); - const std::size_t n = - m_input_file->pread(m_buffer_regions[index], - file_range.offset(), file_range.size()); - - LOGGER_DEBUG("Buffer contents: [\"{}\" ... \"{}\"]", - fmt::join(buffer_regions[index].begin(), - buffer_regions[index].begin() + 10, ""), - fmt::join(buffer_regions[index].end() - 10, - buffer_regions[index].end(), "")); - - /* Do write */ - m_output_file->pwrite(m_buffer_regions[index], file_range.offset(), - file_range.size()); - - - m_bytes_per_rank += n; - // Do sleep - std::this_thread::sleep_for(sleep_value()); - auto end = std::chrono::steady_clock::now(); - // Send transfer bw - double elapsed_seconds = - std::chrono::duration_cast>( - end - start) - .count(); - if((elapsed_seconds) > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / (elapsed_seconds)); - LOGGER_DEBUG("BW (read) Update: {} / {} = {} mb/s [ Sleep {} ]", - m_block_size / 1024.0, elapsed_seconds, bw(), - sleep_value()); - } - - ++index; + try { + const auto& file_range = m_file_ranges[m_current_block_index]; + + auto start = std::chrono::steady_clock::now(); + + const std::size_t n_read = m_input_file->pread(m_buffer, file_range.offset(), file_range.size()); + m_output_file->pwrite(m_buffer, file_range.offset(), n_read); + + auto sleep_duration = sleep_value(); + if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); + + auto end = std::chrono::steady_clock::now(); + + double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); + if (elapsed_seconds > 0) { + bw((n_read / (1024.0 * 1024.0)) / elapsed_seconds); + 
LOGGER_DEBUG("BW (seq_mixed) Update: {} / {} = {} MB/s [ Sleep {}ms ]", + n_read / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); } + + m_current_block_index++; } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); + LOGGER_ERROR("{}() failed in progress: {}", e.where(), e.what()); m_status = make_system_error(e.error_code()); - return -1; - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return -1; + return progress_status::Failed; } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); + LOGGER_ERROR("progress() Unexpected exception: {}", e.what()); m_status = error_code::other; - return -1; + return progress_status::Failed; } - - m_status = error_code::success; - m_output_file->close(); - return -1; + return progress_status::InProgress; } -} // namespace cargo +} // namespace cargo \ No newline at end of file diff --git a/src/worker/seq_mixed.hpp b/src/worker/seq_mixed.hpp index 7d6af0f..565b216 100644 --- a/src/worker/seq_mixed.hpp +++ b/src/worker/seq_mixed.hpp @@ -28,7 +28,6 @@ #include "ops.hpp" #include #include -#include "ops.hpp" #include "memory.hpp" namespace mpi = boost::mpi; @@ -42,34 +41,13 @@ public: std::filesystem::path input_path, std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, - FSPlugin::type fs_o_type, ssize_t size, bool single) - : m_workers(std::move(workers)), m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), - m_kb_size(std::move(block_size)), m_fs_i_type(fs_i_type), - m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} - - cargo::error_code - operator()() final; - cargo::error_code - progress() const; - - int - progress(int ongoing_index) final; + FSPlugin::type fs_o_type, std::size_t size, bool single); - std::string - output_path() const { - return m_output_path; - } + 
cargo::error_code setup() final; + progress_status progress() final; - std::string - input_path() const { - return m_input_path; - } - - ssize_t - size() const { - return m_file_size; - } + std::string output_path() const { return m_output_path.string(); } + std::string input_path() const { return m_input_path.string(); } private: mpi::communicator m_workers; @@ -81,20 +59,19 @@ private: int m_workers_size; int m_workers_rank; std::size_t m_block_size; - int m_total_blocks; + + std::vector m_file_ranges; + size_t m_current_block_index = 0; memory_buffer m_buffer; - std::vector m_buffer_regions; - std::size_t m_bytes_per_rank; + std::uint64_t m_kb_size; FSPlugin::type m_fs_i_type; FSPlugin::type m_fs_o_type; std::size_t m_file_size; - cargo::error_code m_status; bool m_single; - bool write{}; }; } // namespace cargo -#endif // CARGO_WORKER_SEQUENTIAL_HPP +#endif // CARGO_WORKER_SEQ_MIXED_HPP \ No newline at end of file diff --git a/src/worker/sequential.cpp b/src/worker/sequential.cpp deleted file mode 100644 index 42e3b9a..0000000 --- a/src/worker/sequential.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/****************************************************************************** - * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain - * - * This software was partially supported by the EuroHPC-funded project ADMIRE - * (Project ID: 956748, https://www.admire-eurohpc.eu). - * - * This file is part of Cargo. - * - * Cargo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Cargo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Cargo. If not, see . - * - * SPDX-License-Identifier: GPL-3.0-or-later - *****************************************************************************/ - -#include -#include "sequential.hpp" -#include - -namespace cargo { - -cargo::error_code -seq_operation::operator()() { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - m_status = error_code::transfer_in_progress; - try { - - auto workers_size = m_workers.size(); - auto workers_rank = m_workers.rank(); - - if (m_single) { - workers_size = 1; - workers_rank = 0; - } - std::size_t block_size = m_kb_size * 1024u; - m_input_file = std::make_unique( - posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type)); - std::size_t file_size = m_file_size; - - - // compute the number of blocks in the file - int total_blocks = static_cast(file_size / block_size); - - if(file_size % block_size != 0) { - ++total_blocks; - } - - // find how many blocks this rank is responsible for - std::size_t blocks_per_rank = total_blocks / workers_size; - - if(int64_t n = total_blocks % workers_size; - n != 0 && workers_rank < n) { - ++blocks_per_rank; - } - - // step 1. 
acquire buffers - - m_buffer.resize(blocks_per_rank * block_size); - m_buffer_regions.reserve(blocks_per_rank); - - for(std::size_t i = 0; i < blocks_per_rank; ++i) { - m_buffer_regions.emplace_back(m_buffer.data() + i * block_size, - block_size); - } - - m_input_file = std::make_unique( - posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type)); - - m_workers_size = workers_size; - m_workers_rank = workers_rank; - m_block_size = block_size; - m_file_size = file_size; - m_total_blocks = total_blocks; - - } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return make_system_error(e.error_code()); - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return make_system_error(e.code().value()); - } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); - m_status = error_code::other; - return error_code::other; - } - - return error_code::transfer_in_progress; -} - -cargo::error_code -seq_operation::progress() const { - return m_status; -} - -int -seq_operation::progress(int ongoing_index) { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - - // compute the number of blocks in the file - - int index = 0; - if(write == false) { - if(ongoing_index == 0) { - m_bytes_per_rank = 0; - } - try { - for(const auto& file_range : - all_of(*m_input_file) | as_blocks(m_block_size) | - strided(m_workers_size, m_workers_rank)) { - - if(index < ongoing_index) { - ++index; - continue; - } else { - if(index > ongoing_index) { - return index; - } - } - m_status = error_code::transfer_in_progress; - assert(m_buffer_regions[index].size() >= file_range.size()); - auto start = std::chrono::steady_clock::now(); - const std::size_t n = m_input_file->pread( - m_buffer_regions[index], file_range.offset(), - 
file_range.size()); - - LOGGER_DEBUG("Buffer contents: [\"{}\" ... \"{}\"]", - fmt::join(buffer_regions[index].begin(), - buffer_regions[index].begin() + 10, ""), - fmt::join(buffer_regions[index].end() - 10, - buffer_regions[index].end(), "")); - - - m_bytes_per_rank += n; - // Do sleep - std::this_thread::sleep_for(sleep_value()); - auto end = std::chrono::steady_clock::now(); - // Send transfer bw - double elapsed_seconds = - std::chrono::duration_cast< - std::chrono::duration>(end - start) - .count(); - if((elapsed_seconds) > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / (elapsed_seconds)); - LOGGER_DEBUG( - "BW (read) Update: {} / {} = {} mb/s [ Sleep {} ]", - m_block_size / 1024.0, elapsed_seconds, bw(), - sleep_value()); - } - - ++index; - } - } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return -1; - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return -1; - } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); - m_status = error_code::other; - return -1; - } - write = true; - ongoing_index = 0; - } - - // We finished reading - // step3. 
POSIX write data - // We need to create the directory if it does not exists (using - // FSPlugin) - - if(write and ongoing_index == 0) { - m_output_file = std::make_unique(posix_file::create( - m_output_path, O_WRONLY, S_IRUSR | S_IWUSR, m_fs_o_type)); - - m_output_file->fallocate(0, 0, m_file_size); - } - - try { - int index = 0; - m_status = error_code::transfer_in_progress; - for(const auto& file_range : - all_of(posix_file::file{m_input_path, m_fs_i_type}) | as_blocks(m_block_size) | - strided(m_workers_size, m_workers_rank)) { - if(index < ongoing_index) { - ++index; - continue; - } else { - if(index > ongoing_index) { - return index; - } - } - - assert(m_buffer_regions[index].size() >= file_range.size()); - auto start = std::chrono::steady_clock::now(); - m_output_file->pwrite(m_buffer_regions[index], file_range.offset(), - file_range.size()); - // Do sleep - std::this_thread::sleep_for(sleep_value()); - auto end = std::chrono::steady_clock::now(); - // Send transfer bw - double elapsed_seconds = - std::chrono::duration_cast>( - end - start) - .count(); - if((elapsed_seconds) > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / (elapsed_seconds)); - LOGGER_DEBUG( - "BW (write) Update: {} / {} = {} mb/s [ Sleep {} ]", - m_block_size / 1024.0, elapsed_seconds, bw(), - sleep_value()); - } - - ++index; - } - - } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - m_status = make_system_error(e.error_code()); - return -1; - } catch(const std::system_error& e) { - LOGGER_ERROR("Unexpected system error: {}", e.what()); - m_status = make_system_error(e.code().value()); - return -1; - } catch(const std::exception& e) { - LOGGER_ERROR("Unexpected exception: {}", e.what()); - m_status = error_code::other; - return -1; - } - - m_status = error_code::success; - return -1; -} - -} // namespace cargo diff --git a/src/worker/sequential.hpp b/src/worker/sequential.hpp deleted file mode 100644 index 1273cf6..0000000 --- 
a/src/worker/sequential.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/****************************************************************************** - * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain - * - * This software was partially supported by the EuroHPC-funded project ADMIRE - * (Project ID: 956748, https://www.admire-eurohpc.eu). - * - * This file is part of Cargo. - * - * Cargo is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Cargo is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Cargo. If not, see . - * - * SPDX-License-Identifier: GPL-3.0-or-later - *****************************************************************************/ - -#ifndef CARGO_WORKER_SEQUENTIAL_HPP -#define CARGO_WORKER_SEQUENTIAL_HPP - -#include "ops.hpp" -#include -#include -#include "ops.hpp" -#include "memory.hpp" - -namespace mpi = boost::mpi; - -namespace cargo { - -class seq_operation : public operation { - -public: - seq_operation(mpi::communicator workers, std::filesystem::path input_path, - std::filesystem::path output_path, std::uint64_t block_size, - FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) - : m_workers(std::move(workers)), m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), - m_kb_size(std::move(block_size)), m_fs_i_type(fs_i_type), - m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} - - cargo::error_code - operator()() final; - cargo::error_code - progress() const; - - int - progress(int ongoing_index) final; - - 
std::string - output_path() const { - return m_output_path; - } - - std::string - input_path() const { - return m_input_path; - } - -private: - mpi::communicator m_workers; - std::unique_ptr m_input_file; - std::unique_ptr m_output_file; - std::filesystem::path m_input_path{}; - std::filesystem::path m_output_path{}; - int m_workers_size; - int m_workers_rank; - std::size_t m_block_size; - - int m_total_blocks; - - memory_buffer m_buffer; - std::vector m_buffer_regions; - std::size_t m_bytes_per_rank; - std::uint64_t m_kb_size; - FSPlugin::type m_fs_i_type; - FSPlugin::type m_fs_o_type; - std::size_t m_file_size; - cargo::error_code m_status; - bool m_single; - bool write{}; -}; - -} // namespace cargo - -#endif // CARGO_WORKER_SEQUENTIAL_HPP diff --git a/src/worker/worker.cpp b/src/worker/worker.cpp index 4b38779..6967e6c 100644 --- a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -47,11 +47,11 @@ make_communicator(const mpi::communicator& comm, const mpi::group& group, boost::mpi::error_string(ec)); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } - return mpi::communicator{newcomm, boost::mpi::comm_take_ownership}; + return {newcomm, boost::mpi::comm_take_ownership}; } void -update_state(int rank, std::uint64_t tid, std::uint32_t seqno, std::string name, +update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, cargo::transfer_state st, float bw, std::optional ec = std::nullopt) { @@ -81,7 +81,6 @@ worker::set_block_size(std::uint64_t block_size) { int worker::run() { - // Create a separate communicator only for worker processes const mpi::communicator world; const auto ranks_to_exclude = std::array{0}; const auto workers = @@ -106,185 +105,111 @@ worker::run() { bool done = false; while(!done) { - // Always loop pending operations - // TODO: This seems that it is not a good idea, we have a lot of () - // ongoing - - auto I = m_ops.begin(); - auto IE = m_ops.end(); - // LOGGER_INFO ("[Status] Pending: {}", m_ops.size()); - if(I != 
IE) { - auto op = I->second.first.get(); - int index = I->second.second; - if(op) { - if(index == -1) { - // operation not started - // Print error message - /* We avoid this update, that may not come into order...*/ - // update_state(op->source(), op->tid(), op->seqno(), - // op->output_path(), transfer_state::running, - // -1.0f); - cargo::error_code ec = (*op)(); - if(ec != cargo::error_code::transfer_in_progress) { - update_state(op->source(), op->tid(), op->seqno(), - op->output_path(), transfer_state::failed, - -1.0f, ec); - I = m_ops.erase(I); - break; - } - - index = 0; - } - // Operation in progress - index = op->progress(index); - if(index == -1) { - // operation finishe - cargo::error_code ec = op->progress(); - update_state(op->source(), op->tid(), op->seqno(), - op->output_path(), - ec ? transfer_state::failed - : transfer_state::completed, - 0.0f, ec); - - // Transfer finished - I = m_ops.erase(I); - } else { - // update only if BW is set - if(op->bw() > 0.0f) { - update_state(op->source(), op->tid(), op->seqno(), - op->output_path(), transfer_state::running, - op->bw()); - } - I->second.second = index; - // If we have ++I we go trhu another file - //++I; + // First, progress all ongoing operations. + for (auto it = m_ops.begin(); it != m_ops.end(); ) { + auto& op = it->second; + + auto status = op->progress(); + + if (status != operation::progress_status::InProgress) { + // Operation is Done or Failed, send final status and remove. + cargo::error_code final_ec = op->status(); + update_state(op->source(), op->tid(), op->seqno(), op->output_path(), + (status == operation::progress_status::Done) ? transfer_state::completed : transfer_state::failed, + 0.0f, final_ec); + it = m_ops.erase(it); + } else { + // Operation is still in progress, send intermediate status if there's new info. 
+ if (op->bw() > 0.0f) { + update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw()); } + ++it; } } + // Second, check for new MPI messages. + auto maybe_msg = m_ops.empty() ? world.probe() : world.iprobe(); - auto msg = world.iprobe(); - - if(!msg) { - // Only wait if there are no pending operations and no messages - if(m_ops.size() == 0) { + if(!maybe_msg) { + if (m_ops.empty()) { std::this_thread::sleep_for(10ms); } continue; } - switch(const auto t = static_cast(msg->tag())) { + auto msg = *maybe_msg; + switch(const auto t = static_cast(msg.tag())) { case tag::pread: - [[fallthrough]]; case tag::pwrite: - [[fallthrough]]; case tag::seq_mixed: - [[fallthrough]]; case tag::sequential: { transfer_message m; - world.recv(msg->source(), msg->tag(), m); - LOGGER_INFO("msg => from: {} body: {}", msg->source(), m); - // Iterate over all the vector (input and output) and create a - // new op per file + world.recv(msg.source(), msg.tag(), m); + LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); + for(std::size_t i = 0; i < m.input_path().size(); i++) { std::string input_path = m.input_path()[i]; std::string output_path = m.output_path()[i]; std::size_t size = m.sizes()[i]; + + bool is_small_file = (size <= m_block_size * 1024); + bool am_i_responsible = !is_small_file || ((i % workers.size()) == static_cast(workers.rank())); + + if (am_i_responsible) { + auto op_workers = workers; + if(is_small_file && workers.size() > 1) { + std::vector self_rank = { workers.rank() }; + auto group = workers.group().include(self_rank.begin(), self_rank.end()); + op_workers = ::make_communicator(workers, group, 0); + } - if(size <= m_block_size * 1024) { - // Optimize and process only to one worker - // the one that is processing is i%worker == 0 - if(((i % workers.size()) == - (unsigned int) workers.rank())) { - update_state(msg->source(), m.tid(), i, output_path, - transfer_state::pending, -1.0f); - std::vector ranks_to_exclude; - // 
Exclude all - for(auto id = 0; id < workers.size(); id++) { - if(id != workers.rank()) { - ranks_to_exclude.push_back(id); - } - } - - auto tempworkers = ::make_communicator( - workers, - workers.group().exclude( - ranks_to_exclude.begin(), - ranks_to_exclude.end()), - 0); - // The communicator is not correct if ranks are empty - if (workers.size()==1) tempworkers = workers; - - m_ops.emplace(std::make_pair( - make_pair(input_path, output_path), - make_pair(operation::make_operation( - t, tempworkers, - input_path, output_path, - m_block_size, m.i_type(), - m.o_type(), size, true), - -1))); - // TODO : Issue 1, seqno is not different from each - // file - // -(we use i) - const auto op = - m_ops[make_pair(input_path, output_path)] - .first.get(); - - op->set_comm(msg->source(), m.tid(), i, t); - + auto op = operation::make_operation(t, op_workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); + op->set_comm(msg.source(), m.tid(), i, t); + + update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); + + // Setup the operation. If it fails, report failure immediately. 
+ if (op->setup() == error_code::success) { + update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f); + m_ops.emplace(std::make_pair(m.tid(), i), std::move(op)); } else { - update_state(msg->source(), m.tid(), i, output_path, - transfer_state::completed, -1.0f); + LOGGER_ERROR("Operation setup failed for transfer {} file {}", op->tid(), op->input_path()); + update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, op->status()); } - } else { - update_state(msg->source(), m.tid(), i, output_path, - transfer_state::pending, -1.0f); - - m_ops.emplace(std::make_pair( - make_pair(input_path, output_path), - make_pair(operation::make_operation( - t, workers, input_path, - output_path, m_block_size, - m.i_type(), m.o_type(), size, - false), - -1))); - // TODO : Issue 1, seqno is not different from each file - // -(we use i) - const auto op = - m_ops[make_pair(input_path, output_path)] - .first.get(); - op->set_comm(msg->source(), m.tid(), i, t); + } else { + // This part is critical for the "many small files" case to unblock the master. 
+ update_state(msg.source(), m.tid(), i, output_path, transfer_state::completed, 0.0f, error_code::success); } } break; } case tag::bw_shaping: { shaper_message m; - world.recv(msg->source(), msg->tag(), m); - LOGGER_INFO("msg => from: {} body: {}", msg->source(), m); - for(auto I = m_ops.begin(); I != m_ops.end(); I++) { - const auto op = I->second.first.get(); - if(op) { + world.recv(msg.source(), msg.tag(), m); + LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); + for(auto& [key, op] : m_ops) { + if(op && op->tid() == m.tid()) { op->set_bw_shaping(m.shaping()); - } else { - LOGGER_INFO("Operation non existent", msg->source(), m); } } break; } - - case tag::shutdown: - LOGGER_INFO("msg => from: {} body: {{shutdown}}", - msg->source()); - world.recv(msg->source(), msg->tag()); + LOGGER_INFO("msg => from: {} body: {{shutdown}}", msg.source()); + world.recv(msg.source(), msg.tag()); done = true; break; default: - LOGGER_WARN("[{}] Unexpected message tag: {}", msg->source(), - msg->tag()); + LOGGER_WARN("[{}] Unexpected message tag: {}", msg.source(), msg.tag()); + // Discard unexpected message + if (auto count = msg.count()) { + std::vector discard_buffer(*count); + world.recv(msg.source(), msg.tag(), discard_buffer.data(), *count); + } else { + world.recv(msg.source(), msg.tag()); + } break; } } @@ -296,4 +221,4 @@ worker::run() { return 0; } -} // namespace cargo +} // namespace cargo \ No newline at end of file diff --git a/src/worker/worker.hpp b/src/worker/worker.hpp index a4eded9..b3c7ac9 100644 --- a/src/worker/worker.hpp +++ b/src/worker/worker.hpp @@ -28,6 +28,7 @@ #include "../proto/mpi/message.hpp" #include +#include #include "ops.hpp" namespace cargo { @@ -45,7 +46,9 @@ public: run(); private: - std::map, std::pair< std::unique_ptr, int> > m_ops; + // Key: {transfer_id, file_sequence_number} + // Value: The operation object + std::map, std::unique_ptr> m_ops; std::string m_name; int m_rank; std::optional m_output_file; @@ -55,4 +58,4 @@ private: 
} // namespace cargo -#endif // CARGO_WORKER_HPP +#endif // CARGO_WORKER_HPP \ No newline at end of file -- GitLab From 3e8dca8a08cc98a898a35a539e74a079423351ae Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 07:23:09 +0200 Subject: [PATCH 05/21] cleanup --- src/master.cpp | 453 +++++++++++++++---------------------------------- src/master.hpp | 20 +-- 2 files changed, 151 insertions(+), 322 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index 60ff763..a3c84d5 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -116,7 +116,25 @@ master_server::master_server(std::string name, std::string address, { m_block_size = block_size; - REGEX_file = regex_file; + REGEX_file = std::move(regex_file); + if(!REGEX_file.empty()) { + std::ifstream file(REGEX_file); + if (file.is_open()) { + std::string pattern_str; + std::getline(file, pattern_str); + file.close(); + try { + m_filename_pattern.assign(pattern_str); + m_is_filtering_enabled = true; + LOGGER_INFO("Using pattern str '{}' for regex from file '{}'", pattern_str, REGEX_file); + } catch (const std::regex_error& e) { + LOGGER_ERROR("Invalid regex pattern '{}' from file {}: {}", pattern_str, REGEX_file, e.what()); + m_is_filtering_enabled = false; + } + } else { + LOGGER_ERROR("Could not open pattern file '{}'", REGEX_file); + } + } #define EXPAND(rpc_name) #rpc_name##s, &master_server::rpc_name @@ -130,11 +148,6 @@ master_server::master_server(std::string name, std::string address, #undef EXPAND - // ESs and ULTs need to be joined before the network engine is - // actually finalized, and ~master_server() is too late for that. - // The push_prefinalize_callback() and push_finalize_callback() functions - // serve this purpose. The former is called before Mercury is finalized, - // while the latter is called in between that and Argobots finalization. 
m_network_engine.push_prefinalize_callback([this]() { m_shutting_down = true; m_ftio_cv.notify_all(); // Wake up FTIO scheduler to exit @@ -157,47 +170,42 @@ master_server::mpi_listener_ult() { mpi::communicator world; while(!m_shutting_down) { + if (auto msg = world.iprobe()) { + switch(static_cast(msg->tag())) { + case tag::status: { + status_message m; + world.recv(msg->source(), msg->tag(), m); + LOGGER_DEBUG("msg => from: {} body: {{payload: {}}}", + msg->source(), m); + + m_request_manager.update(m.tid(), m.seqno(), msg->source() - 1, + m.name(), m.state(), m.bw(), + m.error_code()); + break; + } - auto msg = world.iprobe(); - - if(!msg) { - std::this_thread::sleep_for(10ms); - // thallium::thread::self().sleep(m_network_engine, 10); - continue; - } - - switch(static_cast(msg->tag())) { - case tag::status: { - status_message m; - world.recv(msg->source(), msg->tag(), m); - LOGGER_DEBUG("msg => from: {} body: {{payload: {}}}", - msg->source(), m); - - m_request_manager.update(m.tid(), m.seqno(), msg->source() - 1, - m.name(), m.state(), m.bw(), - m.error_code()); - break; + default: + LOGGER_WARN("msg => from: {} body: {{Unexpected tag: {}}}", + msg->source(), msg->tag()); + if (auto count = msg->count()) { + std::vector discard_buffer(*count); + world.recv(msg->source(), msg->tag(), discard_buffer.data(), *count); + } else { + world.recv(msg->source(), msg->tag()); + } + break; } - - default: - LOGGER_WARN("msg => from: {} body: {{Unexpected tag: {}}}", - msg->source(), msg->tag()); - break; + } else { + thallium::thread::self().sleep(m_network_engine, 10); } } - LOGGER_INFO("Shutting down. Notifying workers..."); - - // shutting down, notify all workers + LOGGER_INFO("Shutting down MPI listener. 
Notifying workers..."); for(int rank = 1; rank < world.size(); ++rank) { - LOGGER_INFO("msg <= to: {} body: {{shutdown}}", rank); - world.send(static_cast(rank), static_cast(tag::shutdown)); + world.isend(static_cast(rank), static_cast(tag::shutdown)); } - LOGGER_INFO("Entering exit barrier..."); - world.barrier(); - - LOGGER_INFO("Exit"); + LOGGER_INFO("MPI listener ULT finished."); } @@ -208,11 +216,9 @@ master_server::ftio_scheduling_ult() { if (m_period > 0) { LOGGER_INFO("FTIO scheduler waiting for period of {} seconds.", m_period); - // Wait for the period to elapse OR to be shut down. m_ftio_cv.wait_for(lock, std::chrono::duration(m_period), [this] { return m_shutting_down.load(); }); } else { LOGGER_INFO("FTIO scheduler waiting for run trigger."); - // Wait for an explicit run trigger OR to be shut down. m_ftio_cv.wait(lock, [this] { return m_ftio_run.load() || m_shutting_down.load(); }); } @@ -227,7 +233,6 @@ master_server::ftio_scheduling_ult() { m_pending_transfer.m_expanded_sources.clear(); m_pending_transfer.m_expanded_targets.clear(); - // This logic is blocking, but it's in its own thread so it's fine. 
transfer_dataset_internal(m_pending_transfer); if (!m_pending_transfer.m_expanded_sources.empty()) { @@ -275,13 +280,9 @@ master_server::ping(const network::request& req) { using proto::generic_response; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_INFO("rpc {:>} body: {{}}", rpc); - const auto resp = generic_response{rpc.id(), error_code::success}; - LOGGER_INFO("rpc {:<} body: {{retval: {}}}", rpc, resp.error_code()); - req.respond(resp); } @@ -305,9 +306,7 @@ master_server::bw_control(const network::request& req, std::uint64_t tid, } const auto resp = generic_response{rpc.id(), error_code::success}; - LOGGER_INFO("rpc {:<} body: {{retval: {}}}", rpc, resp.error_code()); - req.respond(resp); } @@ -315,166 +314,78 @@ void master_server::shutdown(const network::request& req) { using network::get_address; using network::rpc_info; - using proto::generic_response; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_INFO("rpc {:>} body: {{}}", rpc); + req.respond(proto::generic_response{rpc.id(), error_code::success}); server::shutdown(); } -// Function that gets a pending_request, fills the request and sends the mpi -// message for the transfer We only put files that has mtime < actual -// timestamp , intended for stage-out and ftio void master_server::transfer_dataset_internal(pending_transfer& pt) { - // setup regex - // Read the regular expression pattern from the file - auto patternFile = REGEX_file; - bool filtering = false; - std::string patternStr; // default - if(REGEX_file != "") { - std::ifstream file(patternFile); - if(file.is_open()) { - std::getline(file, patternStr); - file.close(); - filtering = true; - } else { - LOGGER_ERROR("opening pattern file {}", patternFile); - // return; - } - LOGGER_INFO("Using pattern str '{}' for regex from file '{}'", - patternStr, patternFile); - } - std::regex pattern; - if(filtering) - pattern.assign(patternStr); - mpi::communicator world; std::vector v_s_new; 
std::vector v_d_new; std::vector v_size_new; - time_t now = time(0); - // now = now - 5; // Threshold for mtime + time_t now = time(nullptr); + for(auto i = 0u; i < pt.m_sources.size(); ++i) { - const auto& s = pt.m_sources[i]; const auto& d = pt.m_targets[i]; - - // We need to expand directories to single files on the s - // Then create a new message for each file and append the - // file to the d prefix - // We will asume that the path is the original absolute - // The prefix selects the method of transfer - // And if not specified then we will use none - // i.e. ("xxxx:/xyyy/bbb -> gekko:/cccc/ttt ) then - // bbb/xxx -> ttt/xxx const auto& p = s.path(); - std::vector files; - // Check stat of p using FSPlugin class - auto fs = FSPlugin::make_fs( - static_cast(s.get_type())); + auto fs = FSPlugin::make_fs(static_cast(s.get_type())); struct stat buf; auto rstat = fs->stat(p, &buf); - if(rstat == 0 and (buf.st_mode & S_IFDIR)) { + if(rstat == 0 and S_ISDIR(buf.st_mode)) { LOGGER_INFO("Expanding input directory {}", p); - files = fs->readdir(p); - - // As we need to create a new directory, we need to order the files - // so that directories are created in the correct order - - // Order the files alphabetically + std::vector files = fs->readdir(p); std::sort(files.begin(), files.end()); - /* - We have all the files expanded. Now create a new - cargo::dataset for each file as s and a new - cargo::dataset appending the base directory in d to the - file name. 
- */ + for(const auto& f : files) { - cargo::dataset s_new(s); - cargo::dataset d_new(d); - s_new.path(f); - // We need to get filename from the original root - // path (d.path) plus the path from f, removing the - // initial path p (taking care of the trailing /) - // LOGGER_INFO("GKFS file {} checking ...", s_new.path()); - if(filtering) { - if(!std::regex_match(s_new.path(), pattern)) { - LOGGER_INFO("GKFS file {} IGNORED", s_new.path()); - continue; - } - } - auto leading = p.size(); - if(leading > 0 and p.back() == '/') { - leading--; + if(m_is_filtering_enabled && !std::regex_match(f, m_filename_pattern)) { + LOGGER_INFO("GKFS file {} IGNORED by regex", f); + continue; } - - d_new.path(d.path() / - std::filesystem::path(f.substr(leading + 1))); - - LOGGER_DEBUG("Expanded file {} -> {}", s_new.path(), - d_new.path()); - rstat = fs->stat(s_new.path(), &buf); - if(rstat == 0) { - if(buf.st_mtime < now) { - v_size_new.push_back(buf.st_size); - v_s_new.push_back(s_new); - v_d_new.push_back(d_new); - } - } - // break; - } - } else { - rstat = fs->stat(s.path(), &buf); - if(rstat == 0) { - if(buf.st_mtime < now) { - v_s_new.push_back(s); - v_d_new.push_back(d); + if(fs->stat(f, &buf) == 0 && buf.st_mtime < now) { + v_s_new.emplace_back(f, s.get_type()); v_size_new.push_back(buf.st_size); + + std::filesystem::path relative_path = std::filesystem::relative(f, p); + v_d_new.emplace_back(std::filesystem::path(d.path()) / relative_path, d.get_type()); } } + } else if (rstat == 0 && buf.st_mtime < now) { + v_s_new.push_back(s); + v_d_new.push_back(d); + v_size_new.push_back(buf.st_size); } } - // empty m_expanded_sources pt.m_expanded_sources.assign(v_s_new.begin(), v_s_new.end()); pt.m_expanded_targets.assign(v_d_new.begin(), v_d_new.end()); - // We have two vectors, so we process the transfer - // [1] Update request_manager - // [2] Send message to worker - - auto ec = m_request_manager.update(pt.m_p.tid(), v_s_new.size(), - pt.m_p.nworkers()); + auto ec = 
m_request_manager.update(pt.m_p.tid(), v_s_new.size(), pt.m_p.nworkers()); if(ec != error_code::success) { LOGGER_ERROR("Failed to update request: {}", ec); return; }; assert(v_s_new.size() == v_d_new.size()); - - // For all the transfers - for(std::size_t i = 0; i < v_s_new.size(); ++i) { - // const auto& s = v_s_new[i]; - const auto& d = v_d_new[i]; - - // Create the directory if it does not exist (only in - // parallel transfer) - if(!std::filesystem::path(d.path()).parent_path().empty() and - d.supports_parallel_transfer()) { - std::filesystem::create_directories( - std::filesystem::path(d.path()).parent_path()); + for(const auto& d : v_d_new) { + if(d.supports_parallel_transfer() && !std::filesystem::path(d.path()).parent_path().empty()) { + std::error_code fs_ec; + std::filesystem::create_directories(std::filesystem::path(d.path()).parent_path(), fs_ec); + if (fs_ec) { + LOGGER_ERROR("Failed to create directory {}: {}", std::filesystem::path(d.path()).parent_path().string(), fs_ec.message()); + } } } - // Send message to worker (seq number is 0) - if(v_s_new.size() != 0) { + if(!v_s_new.empty()) { for(std::size_t rank = 1; rank <= pt.m_p.nworkers(); ++rank) { - const auto [t, m] = - make_message(pt.m_p.tid(), 0, v_s_new, v_d_new, v_size_new); + const auto [t, m] = make_message(pt.m_p.tid(), 0, v_s_new, v_d_new, v_size_new); LOGGER_INFO("msg <= to: {} body: {}", rank, m); world.send(static_cast(rank), t, m); } @@ -485,7 +396,6 @@ void master_server::transfer_datasets(const network::request& req, const std::vector& sources, const std::vector& targets) { - // Offload the potentially blocking file operations to a handler thread pool m_network_engine.get_handler_pool().make_thread( [this, req, s = sources, t = targets]() mutable { do_transfer_datasets(req, std::move(s), std::move(t)); @@ -505,84 +415,37 @@ master_server::do_transfer_datasets(const network::request req, mpi::communicator world; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - 
LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, - targets); - - - // As we accept directories expanding directories should be done before - // and update sources and targets. + LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); std::vector v_s_new; std::vector v_d_new; - // We ask for the size of the input files. std::vector v_size_new; for(auto i = 0u; i < sources.size(); ++i) { - const auto& s = sources[i]; const auto& d = targets[i]; - - - // We need to expand directories to single files on the s - // Then create a new message for each file and append the - // file to the d prefix - // We will asume that the path is the original absolute - // The prefix selects the method of transfer - // And if not specified then we will use none - // i.e. ("xxxx:/xyyy/bbb -> gekko:/cccc/ttt ) then - // bbb/xxx -> ttt/xxx const auto& p = s.path(); - std::vector files; - // Check stat of p using FSPlugin class - auto fs = FSPlugin::make_fs( - static_cast(s.get_type())); + auto fs = FSPlugin::make_fs(static_cast(s.get_type())); struct stat buf; auto rstat = fs->stat(p, &buf); - if(rstat == 0 and (buf.st_mode & S_IFDIR)) { + if(rstat == 0 and S_ISDIR(buf.st_mode)) { LOGGER_INFO("Expanding input directory {}", p); - files = fs->readdir(p); - // As we need to create a new directory, we need to order the files - // so that directories are created in the correct order - - // Order the files alphabetically + std::vector files = fs->readdir(p); std::sort(files.begin(), files.end()); - - /* - We have all the files expanded. Now create a new - cargo::dataset for each file as s and a new - cargo::dataset appending the base directory in d to the - file name. 
- */ for(const auto& f : files) { - cargo::dataset s_new(s); - cargo::dataset d_new(d); - s_new.path(f); - // We need to get filename from the original root - // path (d.path) plus the path from f, removing the - // initial path p (taking care of the trailing /) - auto leading = p.size(); - if(leading > 0 and p.back() == '/') { - leading--; - } - - d_new.path(d.path() / - std::filesystem::path(f.substr(leading + 1))); - - LOGGER_DEBUG("Expanded file {} -> {}", s_new.path(), - d_new.path()); - rstat = fs->stat(s_new.path(), &buf); - if(rstat == 0) + std::filesystem::path relative_path = std::filesystem::relative(f, p); + v_s_new.emplace_back(f, s.get_type()); + v_d_new.emplace_back(std::filesystem::path(d.path()) / relative_path, d.get_type()); + if(fs->stat(f, &buf) == 0) { v_size_new.push_back(buf.st_size); - v_s_new.push_back(s_new); - v_d_new.push_back(d_new); + } else { + v_size_new.push_back(0); + } } - } else { - // We do not create any optimization for single files - rstat = fs->stat(s.path(), &buf); if(rstat == 0) { v_size_new.push_back(buf.st_size); v_s_new.push_back(s); @@ -592,124 +455,90 @@ master_server::do_transfer_datasets(const network::request req, } m_request_manager.create(v_s_new.size(), world.size() - 1) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to create request: {}", ec); - LOGGER_INFO("rpc {:<} body: {{retval: {}}}", rpc, ec); - req.respond(generic_response{rpc.id(), ec}); - }) - .map([&](auto&& r) { - assert(v_s_new.size() == v_d_new.size()); - if(m_ftio) { - abt::unique_lock lock(m_ftio_mutex); - m_pending_transfer.m_p = r; - m_pending_transfer.m_sources = sources; - m_pending_transfer.m_targets = targets; - m_pending_transfer.m_work = true; - m_ftio_tid.store(r.tid()); - LOGGER_INFO("Stored stage-out information for transfer {}", r.tid()); - } else { - if(!v_s_new.empty()) { - for(const auto& d_item : v_d_new) { - if(d_item.supports_parallel_transfer() && !std::filesystem::path(d_item.path()).parent_path().empty()) { - 
std::filesystem::create_directories(std::filesystem::path(d_item.path()).parent_path()); - } - } - for(std::size_t rank = 1; rank <= r.nworkers(); ++rank) { - const auto [t, m] = make_message( - r.tid(), 0, v_s_new, v_d_new, v_size_new); - LOGGER_INFO("msg <= to: {} body: {}", rank, m); - world.send(static_cast(rank), t, m); + .or_else([&](auto&& ec) { + LOGGER_ERROR("Failed to create request: {}", ec); + req.respond(generic_response{rpc.id(), ec}); + }) + .map([&](auto&& r) { + if(m_ftio) { + abt::unique_lock lock(m_ftio_mutex); + m_pending_transfer.m_p = r; + m_pending_transfer.m_sources = sources; + m_pending_transfer.m_targets = targets; + m_pending_transfer.m_work = true; + m_ftio_tid.store(r.tid()); + LOGGER_INFO("Stored stage-out information for transfer {}", r.tid()); + } else { + if(!v_s_new.empty()) { + for(const auto& d_item : v_d_new) { + if(d_item.supports_parallel_transfer() && !std::filesystem::path(d_item.path()).parent_path().empty()) { + std::error_code fs_err; + std::filesystem::create_directories(std::filesystem::path(d_item.path()).parent_path(), fs_err); + if (fs_err) LOGGER_WARN("Could not create directory {}: {}", d_item.path(), fs_err.message()); } } + for(std::size_t rank = 1; rank <= r.nworkers(); ++rank) { + const auto [t, m] = make_message(r.tid(), 0, v_s_new, v_d_new, v_size_new); + LOGGER_INFO("msg <= to: {} body: {}", rank, m); + world.send(static_cast(rank), t, m); + } } - - LOGGER_INFO("rpc {:<} body: {{retval: {}, tid: {}}}", rpc, - error_code::success, r.tid()); - req.respond(response_with_id{rpc.id(), error_code::success, - r.tid()}); - }); + } + + req.respond(response_with_id{rpc.id(), error_code::success, r.tid()}); + }); } void master_server::transfer_status(const network::request& req, std::uint64_t tid) { - using network::get_address; using network::rpc_info; using proto::generic_response; using proto::status_response; + using response_type = status_response; - using response_type = - status_response; - - mpi::communicator 
world; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_DEBUG("rpc {:>} body: {{tid: {}}}", rpc, tid); m_request_manager.lookup(tid) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to lookup request: {}", ec); - LOGGER_INFO("rpc {:<} body: {{retval: {}}}", rpc, ec); - req.respond(generic_response{rpc.id(), ec}); - }) - .map([&](auto&& rs) { - LOGGER_INFO("rpc {:<} body: {{retval: {}, status: {}}}", rpc, - error_code::success, rs); - req.respond(response_type{ - rpc.id(), error_code::success, - std::make_tuple(rs.state(), rs.bw(), rs.error())}); - }); + .or_else([&](auto&& ec) { + LOGGER_ERROR("Failed to lookup request: {}", ec); + req.respond(generic_response{rpc.id(), ec}); + }) + .map([&](auto&& rs) { + LOGGER_INFO("rpc {:<} body: {{retval: {}, status: {}}}", rpc, error_code::success, rs); + req.respond(response_type{ + rpc.id(), error_code::success, + std::make_tuple(rs.state(), rs.bw(), rs.error())}); + }); } - void -master_server::transfer_statuses(const network::request& req, - std::uint64_t tid) { - +master_server::transfer_statuses(const network::request& req, std::uint64_t tid) { using network::get_address; using network::rpc_info; using proto::generic_response; using proto::statuses_response; + using response_type = statuses_response; - using response_type = statuses_response; - - mpi::communicator world; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_INFO("rpc {:>} body: {{tid: {}}}", rpc, tid); - // Get all the statuses of the associated transfer. 
returns a vector - // of transfer_status objects m_request_manager.lookup_all(tid) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to lookup request: {}", ec); - LOGGER_INFO("rpc {:<} body: {{retval: {}}}", rpc, ec); - req.respond(generic_response{rpc.id(), ec}); - }) - .map([&](auto&& rs) { - // We get a vector of request_status objects, we need to - // convert them to a vector of tuples with the same - // informations - std::vector>> - v{}; - for(auto& r : rs) { - v.push_back(std::make_tuple(r.name(), r.state(), r.bw(), - r.error())); - LOGGER_INFO( - "rpc {:<} body: {{retval: {}, name: {}, status: {}}}", - rpc, error_code::success, r.name(), r.state()); - } - // Generate a response type with the vector of tuples and - // respond - - - req.respond(response_type{rpc.id(), error_code::success, v}); - }); + .or_else([&](auto&& ec) { + LOGGER_ERROR("Failed to lookup request: {}", ec); + req.respond(generic_response{rpc.id(), ec}); + }) + .map([&](auto&& rs) { + std::vector>> v{}; + v.reserve(rs.size()); + for(auto& r : rs) { + v.emplace_back(r.name(), r.state(), r.bw(), r.error()); + } + req.respond(response_type{rpc.id(), error_code::success, v}); + }); } - void master_server::ftio_int(const network::request& req, float conf, float prob, float period, bool run, bool pause, bool resume) { diff --git a/src/master.hpp b/src/master.hpp index 465017f..6c0b828 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -33,6 +33,7 @@ #include "shared_mutex.hpp" #include #include +#include namespace cargo { @@ -56,8 +57,7 @@ class master_server : public network::server, public network::provider { public: master_server(std::string name, std::string address, bool daemonize, - std::filesystem::path rundir, std::uint64_t block_size, - std::string regex_file, + std::filesystem::path rundir, std::uint64_t block_size, std::string regex_file, std::optional pidfile = {}); ~master_server(); @@ -80,19 +80,12 @@ private: const std::vector& sources, const std::vector& targets); - void - 
do_transfer_datasets(const network::request req, - std::vector sources, - std::vector targets); - void transfer_status(const network::request& req, std::uint64_t tid); void transfer_statuses(const network::request& req, std::uint64_t tid); - // Receives a request to increase or decrease BW - // -1 faster, 0 , +1 slower void bw_control(const network::request& req, std::uint64_t tid, std::int16_t shaping); @@ -102,6 +95,11 @@ private: ftio_int(const network::request& req, float confidence, float probability, float period, bool run, bool pause, bool resume); + void + do_transfer_datasets(const network::request req, + std::vector sources, + std::vector targets); + private: // Dedicated execution stream for the MPI listener ULT thallium::managed m_mpi_listener_ess; @@ -121,7 +119,7 @@ private: std::atomic m_ftio_run = {false}; std::atomic m_ftio_tid = {0}; std::atomic m_ftio = {false}; - + ssize_t m_block_size = 0; pending_transfer m_pending_transfer; @@ -132,6 +130,8 @@ private: // Request manager request_manager m_request_manager; std::string REGEX_file; + std::regex m_filename_pattern; + bool m_is_filtering_enabled = false; }; } // namespace cargo -- GitLab From 8e9b656f64ede235030a544021ae6d6b7e3c04e0 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 07:58:17 +0200 Subject: [PATCH 06/21] refactor --- cli/CMakeLists.txt | 102 +++----------- cli/common.cpp | 37 ++++++ cli/common.hpp | 36 +++++ cli/copy.cpp | 14 +- cli/ftio.cpp | 15 +-- cli/ping.cpp | 15 +-- cli/shaping.cpp | 15 +-- cli/shutdown.cpp | 23 +--- src/CMakeLists.txt | 2 + src/master.cpp | 116 +++++++--------- src/master.hpp | 16 ++- src/worker/base_operation.cpp | 41 ++++++ src/worker/base_operation.hpp | 64 +++++++++ src/worker/mpio_read.cpp | 26 ++-- src/worker/mpio_read.hpp | 20 +-- src/worker/mpio_write.cpp | 117 ++++++++-------- src/worker/mpio_write.hpp | 20 +-- src/worker/ops.cpp | 8 +- src/worker/ops.hpp | 2 +- src/worker/seq_mixed.cpp | 26 ++-- src/worker/seq_mixed.hpp | 22 +-- 
src/worker/worker.cpp | 244 ++++++++++++++++++---------------- src/worker/worker.hpp | 8 ++ 23 files changed, 511 insertions(+), 478 deletions(-) create mode 100644 cli/common.cpp create mode 100644 cli/common.hpp create mode 100644 src/worker/base_operation.cpp create mode 100644 src/worker/base_operation.hpp diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index f779733..e7919d5 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -34,95 +34,35 @@ configure_file(cargoctl.in cargoctl @ONLY) ################################################################################ -## cargo_ping: A CLI tool to check if a Cargo server is running -add_executable(cargo_ping) - -target_sources(cargo_ping - PRIVATE - ping.cpp -) - -target_link_libraries(cargo_ping - PUBLIC - fmt::fmt - CLI11::CLI11 - net::rpc_client - cargo -) +## Common object library for CLI tools +add_library(cli_common OBJECT common.cpp) ################################################################################ -## cargo_shutdown: A CLI tool to shutdown a Cargo server -add_executable(cargo_shutdown) - -target_sources(cargo_shutdown - PRIVATE - shutdown.cpp -) - -target_link_libraries(cargo_shutdown - PUBLIC +# Helper function to define a CLI tool +function(add_cargo_cli_tool name source) + add_executable(${name}) + target_sources(${name} PRIVATE + ${source} + $ + ) + target_link_libraries(${name} PUBLIC fmt::fmt CLI11::CLI11 net::rpc_client cargo -) - -################################################################################ -## ccp: A CLI tool to request a Cargo server to copy files between storage tiers -add_executable(ccp) - -target_sources(ccp - PRIVATE - copy.cpp -) - -target_link_libraries(ccp - PUBLIC - fmt::fmt - CLI11::CLI11 - net::rpc_client - cargo -) + ) + install(TARGETS ${name} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +endfunction() ################################################################################ -## shaping: A CLI tool to request a Cargo server to 
slowdown transfers -add_executable(shaping) - -target_sources(shaping - PRIVATE - shaping.cpp -) - -target_link_libraries(shaping - PUBLIC - fmt::fmt - CLI11::CLI11 - net::rpc_client - cargo -) - +## CLI tool definitions +add_cargo_cli_tool(cargo_ping ping.cpp) +add_cargo_cli_tool(cargo_shutdown shutdown.cpp) +add_cargo_cli_tool(ccp copy.cpp) +add_cargo_cli_tool(shaping shaping.cpp) +add_cargo_cli_tool(cargo_ftio ftio.cpp) ################################################################################ -## ftio: A CLI tool to send the ftio info to a Cargo server -add_executable(cargo_ftio) - -target_sources(cargo_ftio - PRIVATE - ftio.cpp -) - -target_link_libraries(cargo_ftio - PUBLIC - fmt::fmt - CLI11::CLI11 - net::rpc_client - cargo -) - - -install(TARGETS cargo_ping cargo_shutdown ccp shaping cargo_ftio - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) - +# Installation install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/cargoctl - DESTINATION ${CMAKE_INSTALL_BINDIR}) + DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/cli/common.cpp b/cli/common.cpp new file mode 100644 index 0000000..d80b73a --- /dev/null +++ b/cli/common.cpp @@ -0,0 +1,37 @@ +/****************************************************************************** + * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain + * + * This software was partially supported by the EuroHPC-funded project ADMIRE + * (Project ID: 956748, https://www.admire-eurohpc.eu). + * + * This file is part of Cargo. + * + * Cargo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Cargo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Cargo. If not, see . + * + * SPDX-License-Identifier: GPL-3.0-or-later + *****************************************************************************/ +#include "common.hpp" +#include +#include + +std::pair +parse_address(const std::string& address) { + const auto pos = address.find("://"); + if(pos == std::string::npos) { + throw std::runtime_error(fmt::format("Invalid address: {}", address)); + } + + const auto protocol = address.substr(0, pos); + return std::make_pair(protocol, address); +} \ No newline at end of file diff --git a/cli/common.hpp b/cli/common.hpp new file mode 100644 index 0000000..3cb52c3 --- /dev/null +++ b/cli/common.hpp @@ -0,0 +1,36 @@ +/****************************************************************************** + * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain + * + * This software was partially supported by the EuroHPC-funded project ADMIRE + * (Project ID: 956748, https://www.admire-eurohpc.eu). + * + * This file is part of Cargo. + * + * Cargo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Cargo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Cargo. If not, see . 
+ * + * SPDX-License-Identifier: GPL-3.0-or-later + *****************************************************************************/ + +#ifndef CARGO_CLI_COMMON_HPP +#define CARGO_CLI_COMMON_HPP + +#include +#include + +// Parses a server address string into protocol and address. +// Throws a runtime_error if the address is invalid. +std::pair +parse_address(const std::string& address); + +#endif // CARGO_CLI_COMMON_HPP \ No newline at end of file diff --git a/cli/copy.cpp b/cli/copy.cpp index dcccfa2..3c677ca 100644 --- a/cli/copy.cpp +++ b/cli/copy.cpp @@ -28,6 +28,7 @@ #include #include #include +#include "common.hpp" enum class dataset_flags { posix, parallel, none, gekkofs, hercules, expand, dataclay }; @@ -102,17 +103,6 @@ parse_command_line(int argc, char* argv[]) { } } -auto -parse_address(const std::string& address) { - const auto pos = address.find("://"); - if(pos == std::string::npos) { - throw std::runtime_error(fmt::format("Invalid address: {}", address)); - } - - const auto protocol = address.substr(0, pos); - return std::make_pair(protocol, address); -} - int main(int argc, char* argv[]) { @@ -146,4 +136,4 @@ main(int argc, char* argv[]) { fmt::print(stderr, "{}: Error: {}\n", cfg.progname, ex.what()); return EXIT_FAILURE; } -} +} \ No newline at end of file diff --git a/cli/ftio.cpp b/cli/ftio.cpp index ddeb25e..52c748b 100644 --- a/cli/ftio.cpp +++ b/cli/ftio.cpp @@ -28,6 +28,7 @@ #include #include #include +#include "common.hpp" struct ftio_config { std::string progname; @@ -83,18 +84,6 @@ parse_command_line(int argc, char* argv[]) { } } -auto -parse_address(const std::string& address) { - const auto pos = address.find("://"); - if(pos == std::string::npos) { - throw std::runtime_error(fmt::format("Invalid address: {}", address)); - } - - const auto protocol = address.substr(0, pos); - return std::make_pair(protocol, address); -} - - int main(int argc, char* argv[]) { @@ -130,4 +119,4 @@ main(int argc, char* argv[]) { fmt::print(stderr, "Error: 
{}\n", ex.what()); return EXIT_FAILURE; } -} +} \ No newline at end of file diff --git a/cli/ping.cpp b/cli/ping.cpp index 2449f8b..ecb3bab 100644 --- a/cli/ping.cpp +++ b/cli/ping.cpp @@ -28,6 +28,7 @@ #include #include #include +#include "common.hpp" struct ping_config { std::string progname; @@ -55,18 +56,6 @@ parse_command_line(int argc, char* argv[]) { } } -auto -parse_address(const std::string& address) { - const auto pos = address.find("://"); - if(pos == std::string::npos) { - throw std::runtime_error(fmt::format("Invalid address: {}", address)); - } - - const auto protocol = address.substr(0, pos); - return std::make_pair(protocol, address); -} - - int main(int argc, char* argv[]) { @@ -100,4 +89,4 @@ main(int argc, char* argv[]) { fmt::print(stderr, "Error: {}\n", ex.what()); return EXIT_FAILURE; } -} +} \ No newline at end of file diff --git a/cli/shaping.cpp b/cli/shaping.cpp index 75d9371..8711759 100644 --- a/cli/shaping.cpp +++ b/cli/shaping.cpp @@ -28,6 +28,7 @@ #include #include #include +#include "common.hpp" struct shaping_config { std::string progname; @@ -66,18 +67,6 @@ parse_command_line(int argc, char* argv[]) { } } -auto -parse_address(const std::string& address) { - const auto pos = address.find("://"); - if(pos == std::string::npos) { - throw std::runtime_error(fmt::format("Invalid address: {}", address)); - } - - const auto protocol = address.substr(0, pos); - return std::make_pair(protocol, address); -} - - int main(int argc, char* argv[]) { @@ -111,4 +100,4 @@ main(int argc, char* argv[]) { fmt::print(stderr, "Error: {}\n", ex.what()); return EXIT_FAILURE; } -} +} \ No newline at end of file diff --git a/cli/shutdown.cpp b/cli/shutdown.cpp index 89b6d98..bdfb4ba 100644 --- a/cli/shutdown.cpp +++ b/cli/shutdown.cpp @@ -28,16 +28,17 @@ #include #include #include +#include "common.hpp" -struct ping_config { +struct shutdown_config { std::string progname; std::string server_address; }; -ping_config +shutdown_config parse_command_line(int 
argc, char* argv[]) { - ping_config cfg; + shutdown_config cfg; cfg.progname = std::filesystem::path{argv[0]}.filename().string(); @@ -55,22 +56,10 @@ parse_command_line(int argc, char* argv[]) { } } -auto -parse_address(const std::string& address) { - const auto pos = address.find("://"); - if(pos == std::string::npos) { - throw std::runtime_error(fmt::format("Invalid address: {}", address)); - } - - const auto protocol = address.substr(0, pos); - return std::make_pair(protocol, address); -} - - int main(int argc, char* argv[]) { - ping_config cfg = parse_command_line(argc, argv); + shutdown_config cfg = parse_command_line(argc, argv); try { const auto [protocol, address] = parse_address(cfg.server_address); @@ -89,4 +78,4 @@ main(int argc, char* argv[]) { fmt::print(stderr, "Error: {}\n", ex.what()); return EXIT_FAILURE; } -} +} \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7021a84..d3492ac 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,8 @@ target_sources( PRIVATE cargo.cpp master.cpp master.hpp + worker/base_operation.cpp + worker/base_operation.hpp worker/memory.hpp worker/mpio_read.cpp worker/mpio_read.hpp diff --git a/src/master.cpp b/src/master.cpp index a3c84d5..aa49585 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -230,8 +230,6 @@ master_server::ftio_scheduling_ult() { } LOGGER_INFO("FTIO triggered. 
Checking for work in {}", m_pending_transfer.m_sources); - m_pending_transfer.m_expanded_sources.clear(); - m_pending_transfer.m_expanded_targets.clear(); transfer_dataset_internal(m_pending_transfer); @@ -321,59 +319,75 @@ master_server::shutdown(const network::request& req) { server::shutdown(); } -void -master_server::transfer_dataset_internal(pending_transfer& pt) { +master_server::expanded_requests +master_server::expand_transfer_requests( + const std::vector& sources, + const std::vector& targets, + time_t modification_time_threshold) { - mpi::communicator world; - std::vector v_s_new; - std::vector v_d_new; - std::vector v_size_new; - time_t now = time(nullptr); - - for(auto i = 0u; i < pt.m_sources.size(); ++i) { - const auto& s = pt.m_sources[i]; - const auto& d = pt.m_targets[i]; + expanded_requests result; + + for(auto i = 0u; i < sources.size(); ++i) { + const auto& s = sources[i]; + const auto& d = targets[i]; const auto& p = s.path(); auto fs = FSPlugin::make_fs(static_cast(s.get_type())); struct stat buf; auto rstat = fs->stat(p, &buf); - if(rstat == 0 and S_ISDIR(buf.st_mode)) { + + if(rstat == 0 && S_ISDIR(buf.st_mode)) { LOGGER_INFO("Expanding input directory {}", p); std::vector files = fs->readdir(p); std::sort(files.begin(), files.end()); for(const auto& f : files) { if(m_is_filtering_enabled && !std::regex_match(f, m_filename_pattern)) { - LOGGER_INFO("GKFS file {} IGNORED by regex", f); + LOGGER_INFO("File {} IGNORED by regex", f); continue; } - if(fs->stat(f, &buf) == 0 && buf.st_mtime < now) { - v_s_new.emplace_back(f, s.get_type()); - v_size_new.push_back(buf.st_size); - + + struct stat file_buf; + if (fs->stat(f, &file_buf) == 0) { + if (modification_time_threshold > 0 && file_buf.st_mtime >= modification_time_threshold) { + continue; + } + result.sources.emplace_back(f, s.get_type()); + result.sizes.push_back(file_buf.st_size); std::filesystem::path relative_path = std::filesystem::relative(f, p); - 
v_d_new.emplace_back(std::filesystem::path(d.path()) / relative_path, d.get_type()); + result.targets.emplace_back(std::filesystem::path(d.path()) / relative_path, d.get_type()); } } - } else if (rstat == 0 && buf.st_mtime < now) { - v_s_new.push_back(s); - v_d_new.push_back(d); - v_size_new.push_back(buf.st_size); + } else if (rstat == 0) { + if (modification_time_threshold > 0 && buf.st_mtime >= modification_time_threshold) { + continue; + } + result.sources.push_back(s); + result.targets.push_back(d); + result.sizes.push_back(buf.st_size); } } + return result; +} - pt.m_expanded_sources.assign(v_s_new.begin(), v_s_new.end()); - pt.m_expanded_targets.assign(v_d_new.begin(), v_d_new.end()); +void +master_server::transfer_dataset_internal(pending_transfer& pt) { + mpi::communicator world; + time_t now = time(nullptr); + + auto expanded = expand_transfer_requests(pt.m_sources, pt.m_targets, now); + pt.m_expanded_sources = std::move(expanded.sources); + pt.m_expanded_targets = std::move(expanded.targets); + auto& v_size_new = expanded.sizes; - auto ec = m_request_manager.update(pt.m_p.tid(), v_s_new.size(), pt.m_p.nworkers()); + auto ec = m_request_manager.update(pt.m_p.tid(), pt.m_expanded_sources.size(), pt.m_p.nworkers()); if(ec != error_code::success) { LOGGER_ERROR("Failed to update request: {}", ec); return; }; - assert(v_s_new.size() == v_d_new.size()); - for(const auto& d : v_d_new) { + assert(pt.m_expanded_sources.size() == pt.m_expanded_targets.size()); + for(const auto& d : pt.m_expanded_targets) { if(d.supports_parallel_transfer() && !std::filesystem::path(d.path()).parent_path().empty()) { std::error_code fs_ec; std::filesystem::create_directories(std::filesystem::path(d.path()).parent_path(), fs_ec); @@ -383,9 +397,9 @@ master_server::transfer_dataset_internal(pending_transfer& pt) { } } - if(!v_s_new.empty()) { + if(!pt.m_expanded_sources.empty()) { for(std::size_t rank = 1; rank <= pt.m_p.nworkers(); ++rank) { - const auto [t, m] = 
make_message(pt.m_p.tid(), 0, v_s_new, v_d_new, v_size_new); + const auto [t, m] = make_message(pt.m_p.tid(), 0, pt.m_expanded_sources, pt.m_expanded_targets, v_size_new); LOGGER_INFO("msg <= to: {} body: {}", rank, m); world.send(static_cast(rank), t, m); } @@ -417,42 +431,10 @@ master_server::do_transfer_datasets(const network::request req, LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); - std::vector v_s_new; - std::vector v_d_new; - std::vector v_size_new; - - for(auto i = 0u; i < sources.size(); ++i) { - const auto& s = sources[i]; - const auto& d = targets[i]; - const auto& p = s.path(); - - auto fs = FSPlugin::make_fs(static_cast(s.get_type())); - struct stat buf; - auto rstat = fs->stat(p, &buf); - - if(rstat == 0 and S_ISDIR(buf.st_mode)) { - LOGGER_INFO("Expanding input directory {}", p); - std::vector files = fs->readdir(p); - std::sort(files.begin(), files.end()); - - for(const auto& f : files) { - std::filesystem::path relative_path = std::filesystem::relative(f, p); - v_s_new.emplace_back(f, s.get_type()); - v_d_new.emplace_back(std::filesystem::path(d.path()) / relative_path, d.get_type()); - if(fs->stat(f, &buf) == 0) { - v_size_new.push_back(buf.st_size); - } else { - v_size_new.push_back(0); - } - } - } else { - if(rstat == 0) { - v_size_new.push_back(buf.st_size); - v_s_new.push_back(s); - v_d_new.push_back(d); - } - } - } + auto expanded = expand_transfer_requests(sources, targets); + auto& v_s_new = expanded.sources; + auto& v_d_new = expanded.targets; + auto& v_size_new = expanded.sizes; m_request_manager.create(v_s_new.size(), world.size() - 1) .or_else([&](auto&& ec) { diff --git a/src/master.hpp b/src/master.hpp index 6c0b828..5a10585 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -100,6 +100,20 @@ private: std::vector sources, std::vector targets); + struct expanded_requests { + std::vector sources; + std::vector targets; + std::vector sizes; + }; + + expanded_requests expand_transfer_requests( + 
const std::vector& sources, + const std::vector& targets, + time_t modification_time_threshold = 0); + + void + transfer_dataset_internal(pending_transfer& pt); + private: // Dedicated execution stream for the MPI listener ULT thallium::managed m_mpi_listener_ess; @@ -125,8 +139,6 @@ private: pending_transfer m_pending_transfer; - void - transfer_dataset_internal(pending_transfer& pt); // Request manager request_manager m_request_manager; std::string REGEX_file; diff --git a/src/worker/base_operation.cpp b/src/worker/base_operation.cpp new file mode 100644 index 0000000..164a58a --- /dev/null +++ b/src/worker/base_operation.cpp @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain + * + * This software was partially supported by the EuroHPC-funded project ADMIRE + * (Project ID: 956748, https://www.admire-eurohpc.eu). + * + * This file is part of Cargo. + * + * Cargo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Cargo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Cargo. If not, see . 
+ * + * SPDX-License-Identifier: GPL-3.0-or-later + *****************************************************************************/ + +#include "base_operation.hpp" + +namespace cargo { + +base_operation::base_operation(mpi::communicator workers, std::filesystem::path input_path, + std::filesystem::path output_path, std::uint64_t block_size, + FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) + : m_workers(std::move(workers)), m_input_path(std::move(input_path)), + m_output_path(std::move(output_path)), m_kb_size(block_size), + m_fs_i_type(fs_i_type), m_fs_o_type(fs_o_type), m_file_size(size), + m_single(single) { + + m_workers_size = m_single ? 1 : m_workers.size(); + m_workers_rank = m_single ? 0 : m_workers.rank(); +} + +} // namespace cargo \ No newline at end of file diff --git a/src/worker/base_operation.hpp b/src/worker/base_operation.hpp new file mode 100644 index 0000000..9aa1c91 --- /dev/null +++ b/src/worker/base_operation.hpp @@ -0,0 +1,64 @@ +/****************************************************************************** + * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain + * + * This software was partially supported by the EuroHPC-funded project ADMIRE + * (Project ID: 956748, https://www.admire-eurohpc.eu). + * + * This file is part of Cargo. + * + * Cargo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Cargo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Cargo. If not, see . 
+ * + * SPDX-License-Identifier: GPL-3.0-or-later + *****************************************************************************/ + +#ifndef CARGO_WORKER_BASE_OPERATION_HPP +#define CARGO_WORKER_BASE_OPERATION_HPP + +#include "ops.hpp" +#include "memory.hpp" +#include +#include + +namespace mpi = boost::mpi; + +namespace cargo { + +class base_operation : public operation { +public: + base_operation(mpi::communicator workers, std::filesystem::path input_path, + std::filesystem::path output_path, std::uint64_t block_size, + FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single); + + std::string output_path() const final { return m_output_path.string(); } + std::string input_path() const final { return m_input_path.string(); } + +protected: + mpi::communicator m_workers; + std::filesystem::path m_input_path{}; + std::filesystem::path m_output_path{}; + + int m_workers_size; + int m_workers_rank; + std::size_t m_block_size; + + std::uint64_t m_kb_size; + FSPlugin::type m_fs_i_type; + FSPlugin::type m_fs_o_type; + std::size_t m_file_size; + bool m_single; +}; + +} // namespace cargo + +#endif // CARGO_WORKER_BASE_OPERATION_HPP \ No newline at end of file diff --git a/src/worker/mpio_read.cpp b/src/worker/mpio_read.cpp index bd8e8f6..65eaba9 100644 --- a/src/worker/mpio_read.cpp +++ b/src/worker/mpio_read.cpp @@ -38,19 +38,23 @@ mpio_read::mpio_read(mpi::communicator workers, std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) - : m_workers(std::move(workers)), m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), m_kb_size(block_size), - m_fs_i_type(fs_i_type), m_fs_o_type(fs_o_type), m_file_size(size), - m_single(single) {} + : base_operation(std::move(workers), std::move(input_path), std::move(output_path), + block_size, fs_i_type, fs_o_type, size, single) {} -cargo::error_code -mpio_read::setup() { +void 
mpio_read::_calculate_file_ranges() { using posix_file::views::all_of; using posix_file::views::as_blocks; using posix_file::views::strided; + auto file_view = posix_file::file{m_input_path, m_fs_i_type}; + for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { + m_file_ranges.push_back(range); + } +} + +cargo::error_code +mpio_read::setup() { m_status = error_code::transfer_in_progress; try { - // MPI communicators are cheap to copy const auto input_file = mpioxx::file::open( m_workers, m_input_path, mpioxx::file_open_mode::rdonly); @@ -65,9 +69,6 @@ mpio_read::setup() { ++total_blocks; } - m_workers_size = m_single ? 1 : m_workers.size(); - m_workers_rank = m_single ? 0 : m_workers.rank(); - MPI_Datatype file_type; MPI_Type_vector(total_blocks, 1, m_workers_size, block_type, &file_type); MPI_Type_commit(&file_type); @@ -107,10 +108,7 @@ mpio_read::setup() { m_block_size = block_size; - auto file_view = posix_file::file{m_input_path, m_fs_i_type}; - for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - m_file_ranges.push_back(range); - } + _calculate_file_ranges(); } catch(const mpioxx::io_error& e) { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); diff --git a/src/worker/mpio_read.hpp b/src/worker/mpio_read.hpp index 4ade702..b0c83f9 100644 --- a/src/worker/mpio_read.hpp +++ b/src/worker/mpio_read.hpp @@ -25,7 +25,7 @@ #ifndef CARGO_WORKER_MPIO_READ_HPP #define CARGO_WORKER_MPIO_READ_HPP -#include "ops.hpp" +#include "base_operation.hpp" #include "memory.hpp" #include #include @@ -34,7 +34,7 @@ namespace mpi = boost::mpi; namespace cargo { -class mpio_read : public operation { +class mpio_read : public base_operation { public: mpio_read(mpi::communicator workers, std::filesystem::path input_path, @@ -44,27 +44,13 @@ public: cargo::error_code setup() final; progress_status progress() final; - std::string output_path() const { return 
m_output_path.string(); } - std::string input_path() const { return m_input_path.string(); } - private: - mpi::communicator m_workers; - std::filesystem::path m_input_path{}; - std::filesystem::path m_output_path{}; - + void _calculate_file_ranges(); std::unique_ptr m_output_file; - int m_workers_size; - int m_workers_rank; - std::size_t m_block_size; memory_buffer m_buffer; std::vector m_buffer_regions; std::vector m_file_ranges; size_t m_current_block_index = 0; - std::uint64_t m_kb_size; - FSPlugin::type m_fs_i_type; - FSPlugin::type m_fs_o_type; - std::size_t m_file_size; - bool m_single; }; } // namespace cargo diff --git a/src/worker/mpio_write.cpp b/src/worker/mpio_write.cpp index 756fb76..19b0a37 100644 --- a/src/worker/mpio_write.cpp +++ b/src/worker/mpio_write.cpp @@ -36,82 +36,77 @@ namespace cargo { mpio_write::mpio_write(mpi::communicator workers, std::filesystem::path input_path, std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) - : m_workers(std::move(workers)), m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), - m_kb_size(block_size), m_fs_i_type(fs_i_type), - m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} + : base_operation(std::move(workers), std::move(input_path), std::move(output_path), + block_size, fs_i_type, fs_o_type, size, single) {} -cargo::error_code -mpio_write::setup() { +void mpio_write::_read_input_file_sequentially() { using posix_file::views::all_of; using posix_file::views::as_blocks; using posix_file::views::strided; - m_status = error_code::transfer_in_progress; - try { - m_workers_size = m_single ? 1 : m_workers.size(); - m_workers_rank = m_single ? 
0 : m_workers.rank(); + + auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); + + std::vector buffer_regions; + buffer_regions.reserve(m_buffer.size() / m_block_size); + for(std::size_t i = 0; i < m_buffer.size() / m_block_size; ++i) { + buffer_regions.emplace_back(m_buffer.data() + i * m_block_size, m_block_size); + } + + m_bytes_per_rank = 0; + int index = 0; + for(const auto& file_range : + all_of(input_file) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - m_block_size = m_kb_size * 1024u; + assert((unsigned)index < buffer_regions.size()); + auto& buffer_region = buffer_regions[index]; + assert(buffer_region.size() >= file_range.size()); - auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); - - m_total_blocks = static_cast(m_file_size / m_block_size); - if(m_file_size % m_block_size != 0) { - ++m_total_blocks; - } - - std::size_t blocks_per_rank = m_total_blocks / m_workers_size; - if(int64_t n = m_total_blocks % m_workers_size; n != 0 && m_workers_rank < n) { - ++blocks_per_rank; - } - - m_buffer.resize(blocks_per_rank * m_block_size); - std::vector buffer_regions; - buffer_regions.reserve(blocks_per_rank); - for(std::size_t i = 0; i < blocks_per_rank; ++i) { - buffer_regions.emplace_back(m_buffer.data() + i * m_block_size, m_block_size); + auto start = std::chrono::steady_clock::now(); + const std::size_t n = input_file.pread(buffer_region, file_range.offset(), file_range.size()); + m_bytes_per_rank += n; + + auto sleep_duration = sleep_value(); + if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); + + auto end = std::chrono::steady_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); + if (elapsed_seconds > 0) { + bw((m_block_size / (1024.0 * 1024.0)) / elapsed_seconds); + LOGGER_DEBUG("BW (read) Update: {} / {} = {} MB/s [ Sleep {}ms ]", + m_block_size / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); } + index++; + } +} - 
m_bytes_per_rank = 0; - int index = 0; - for(const auto& file_range : - all_of(input_file) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - - assert((unsigned)index < buffer_regions.size()); - auto& buffer_region = buffer_regions[index]; - assert(buffer_region.size() >= file_range.size()); - - auto start = std::chrono::steady_clock::now(); - const std::size_t n = input_file.pread(buffer_region, file_range.offset(), file_range.size()); - m_bytes_per_rank += n; - - auto sleep_duration = sleep_value(); - if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); - - auto end = std::chrono::steady_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); - if (elapsed_seconds > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / elapsed_seconds); - LOGGER_DEBUG("BW (read) Update: {} / {} = {} MB/s [ Sleep {}ms ]", - m_block_size / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); - } - index++; - } +cargo::error_code +mpio_write::setup() { + m_status = error_code::transfer_in_progress; + m_block_size = m_kb_size * 1024u; + + m_total_blocks = static_cast(m_file_size / m_block_size); + if(m_file_size % m_block_size != 0) { + ++m_total_blocks; + } - } catch(const posix_file::io_error& e) { - LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); - return (m_status = make_system_error(e.error_code())); - } catch(const std::exception& e) { - LOGGER_ERROR("setup() Unexpected exception: {}", e.what()); - return (m_status = error_code::other); + std::size_t blocks_per_rank = m_total_blocks / m_workers_size; + if(int64_t n = m_total_blocks % m_workers_size; n != 0 && m_workers_rank < n) { + ++blocks_per_rank; } + m_buffer.resize(blocks_per_rank * m_block_size); + return (m_status = error_code::success); } operation::progress_status mpio_write::progress() { try { + if (!m_is_read_complete) { + _read_input_file_sequentially(); + m_is_read_complete = true; + } + const auto output_file = mpioxx::file::open( 
m_workers, m_output_path, mpioxx::file_open_mode::create | mpioxx::file_open_mode::wronly); @@ -146,6 +141,10 @@ mpio_write::progress() { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); m_status = make_mpi_error(e.error_code()); return progress_status::Failed; + } catch(const posix_file::io_error& e) { + LOGGER_ERROR("{}() failed in progress: {}", e.where(), e.what()); + m_status = make_system_error(e.error_code()); + return progress_status::Failed; } catch(const std::exception& e) { LOGGER_ERROR("progress() Unexpected exception: {}", e.what()); m_status = error_code::other; diff --git a/src/worker/mpio_write.hpp b/src/worker/mpio_write.hpp index 52ace79..9b82c11 100644 --- a/src/worker/mpio_write.hpp +++ b/src/worker/mpio_write.hpp @@ -25,7 +25,7 @@ #ifndef CARGO_WORKER_MPIO_WRITE_HPP #define CARGO_WORKER_MPIO_WRITE_HPP -#include "ops.hpp" +#include "base_operation.hpp" #include "memory.hpp" #include #include @@ -34,7 +34,7 @@ namespace mpi = boost::mpi; namespace cargo { -class mpio_write : public operation { +class mpio_write : public base_operation { public: mpio_write(mpi::communicator workers, std::filesystem::path input_path, std::filesystem::path output_path, std::uint64_t block_size, @@ -43,25 +43,13 @@ public: cargo::error_code setup() final; progress_status progress() final; - std::string output_path() const { return m_output_path.string(); } - std::string input_path() const { return m_input_path.string(); } - private: - mpi::communicator m_workers; - std::filesystem::path m_input_path{}; - std::filesystem::path m_output_path{}; + void _read_input_file_sequentially(); - int m_workers_size; - int m_workers_rank; - std::size_t m_block_size; int m_total_blocks; memory_buffer m_buffer; std::size_t m_bytes_per_rank = 0; - std::uint64_t m_kb_size; - FSPlugin::type m_fs_i_type; - FSPlugin::type m_fs_o_type; - std::size_t m_file_size; - bool m_single; + bool m_is_read_complete = false; }; } // namespace cargo diff --git a/src/worker/ops.cpp 
b/src/worker/ops.cpp index 17ad2e2..1244da8 100644 --- a/src/worker/ops.cpp +++ b/src/worker/ops.cpp @@ -32,7 +32,7 @@ namespace mpi = boost::mpi; namespace cargo { std::unique_ptr -operation::make_operation(cargo::tag t, mpi::communicator workers, +operation::make_operation(cargo::tag t, mpi::communicator& workers, std::filesystem::path input_path, std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, @@ -42,16 +42,16 @@ operation::make_operation(cargo::tag t, mpi::communicator workers, switch(t) { case tag::pread: return std::make_unique( - std::move(workers), std::move(input_path), + workers, std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single); case tag::pwrite: return std::make_unique( - std::move(workers), std::move(input_path), + workers, std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single); case tag::sequential: // Fallthrough to seq_mixed case tag::seq_mixed: return std::make_unique( - std::move(workers), std::move(input_path), + workers, std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single); default: return {}; diff --git a/src/worker/ops.hpp b/src/worker/ops.hpp index cfe5d06..b0ca128 100644 --- a/src/worker/ops.hpp +++ b/src/worker/ops.hpp @@ -38,7 +38,7 @@ public: enum class progress_status { InProgress, Done, Failed }; static std::unique_ptr - make_operation(cargo::tag t, boost::mpi::communicator workers, + make_operation(cargo::tag t, boost::mpi::communicator& workers, std::filesystem::path input_path, std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single); diff --git a/src/worker/seq_mixed.cpp b/src/worker/seq_mixed.cpp index 5998195..8a41ff6 100644 --- a/src/worker/seq_mixed.cpp +++ b/src/worker/seq_mixed.cpp @@ -36,21 +36,24 @@ seq_mixed_operation::seq_mixed_operation(mpi::communicator workers, 
std::filesystem::path output_path, std::uint64_t block_size, FSPlugin::type fs_i_type, FSPlugin::type fs_o_type, std::size_t size, bool single) - : m_workers(std::move(workers)), m_input_path(std::move(input_path)), - m_output_path(std::move(output_path)), - m_kb_size(block_size), m_fs_i_type(fs_i_type), - m_fs_o_type(fs_o_type), m_file_size(size), m_single(single) {} + : base_operation(std::move(workers), std::move(input_path), std::move(output_path), + block_size, fs_i_type, fs_o_type, size, single) {} -cargo::error_code -seq_mixed_operation::setup() { + +void seq_mixed_operation::_calculate_file_ranges() { using posix_file::views::all_of; using posix_file::views::as_blocks; using posix_file::views::strided; + auto file_view = posix_file::file{m_input_path, m_fs_i_type}; + for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { + m_file_ranges.push_back(range); + } +} + +cargo::error_code +seq_mixed_operation::setup() { m_status = error_code::transfer_in_progress; try { - m_workers_size = m_single ? 1 : m_workers.size(); - m_workers_rank = m_single ? 
0 : m_workers.rank(); - m_block_size = m_kb_size * 1024u; m_buffer.resize(m_block_size); @@ -58,10 +61,7 @@ seq_mixed_operation::setup() { m_output_file = std::make_unique(posix_file::create(m_output_path, O_WRONLY, S_IRUSR | S_IWUSR, m_fs_o_type)); m_output_file->fallocate(0, 0, m_file_size); - auto file_view = posix_file::file{m_input_path, m_fs_i_type}; - for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - m_file_ranges.push_back(range); - } + _calculate_file_ranges(); } catch(const posix_file::io_error& e) { LOGGER_ERROR("{}() failed: {}", e.where(), e.what()); diff --git a/src/worker/seq_mixed.hpp b/src/worker/seq_mixed.hpp index 565b216..66d6010 100644 --- a/src/worker/seq_mixed.hpp +++ b/src/worker/seq_mixed.hpp @@ -25,7 +25,7 @@ #ifndef CARGO_WORKER_SEQ_MIXED_HPP #define CARGO_WORKER_SEQ_MIXED_HPP -#include "ops.hpp" +#include "base_operation.hpp" #include #include #include "memory.hpp" @@ -34,7 +34,7 @@ namespace mpi = boost::mpi; namespace cargo { -class seq_mixed_operation : public operation { +class seq_mixed_operation : public base_operation { public: seq_mixed_operation(mpi::communicator workers, @@ -46,30 +46,16 @@ public: cargo::error_code setup() final; progress_status progress() final; - std::string output_path() const { return m_output_path.string(); } - std::string input_path() const { return m_input_path.string(); } - private: - mpi::communicator m_workers; - std::filesystem::path m_input_path{}; - std::filesystem::path m_output_path{}; + void _calculate_file_ranges(); + std::unique_ptr m_input_file; std::unique_ptr m_output_file; - int m_workers_size; - int m_workers_rank; - std::size_t m_block_size; - std::vector m_file_ranges; size_t m_current_block_index = 0; memory_buffer m_buffer; - - std::uint64_t m_kb_size; - FSPlugin::type m_fs_i_type; - FSPlugin::type m_fs_o_type; - std::size_t m_file_size; - bool m_single; }; } // namespace cargo diff --git a/src/worker/worker.cpp 
b/src/worker/worker.cpp index 6967e6c..c58a192 100644 --- a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -50,17 +50,6 @@ make_communicator(const mpi::communicator& comm, const mpi::group& group, return {newcomm, boost::mpi::comm_take_ownership}; } -void -update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, - cargo::transfer_state st, float bw, - std::optional ec = std::nullopt) { - - mpi::communicator world; - const cargo::status_message m{tid, seqno, name, st, bw, ec}; - LOGGER_DEBUG("msg <= to: {} body: {{payload: {}}}", rank, m); - world.send(rank, static_cast(cargo::tag::status), m); -} - } // namespace namespace cargo { @@ -78,6 +67,130 @@ worker::set_block_size(std::uint64_t block_size) { m_block_size = block_size; } + +void +worker::update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, + cargo::transfer_state st, float bw, + std::optional ec) { + + mpi::communicator world; + const cargo::status_message m{tid, seqno, name, st, bw, ec}; + LOGGER_DEBUG("msg <= to: {} body: {{payload: {}}}", rank, m); + world.send(rank, static_cast(cargo::tag::status), m); +} + +void +worker::progress_operations() { + for (auto it = m_ops.begin(); it != m_ops.end(); ) { + auto& op = it->second; + + auto status = op->progress(); + + if (status != operation::progress_status::InProgress) { + // Operation is Done or Failed, send final status and remove. + cargo::error_code final_ec = op->status(); + update_state(op->source(), op->tid(), op->seqno(), op->output_path(), + (status == operation::progress_status::Done) ? transfer_state::completed : transfer_state::failed, + 0.0f, final_ec); + it = m_ops.erase(it); + } else { + // Operation is still in progress, send intermediate status if there's new info. 
+ if (op->bw() > 0.0f) { + update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw()); + } + ++it; + } + } +} + +bool +worker::handle_new_messages(const mpi::communicator& workers, const mpi::communicator& world) { + auto maybe_msg = m_ops.empty() ? world.probe() : world.iprobe(); + + if(!maybe_msg) { + if (m_ops.empty()) { + std::this_thread::sleep_for(10ms); + } + return false; // Not a shutdown request + } + + auto msg = *maybe_msg; + switch(const auto t = static_cast(msg.tag())) { + case tag::pread: + case tag::pwrite: + case tag::seq_mixed: + case tag::sequential: { + transfer_message m; + world.recv(msg.source(), msg.tag(), m); + LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); + + for(std::size_t i = 0; i < m.input_path().size(); i++) { + std::string input_path = m.input_path()[i]; + std::string output_path = m.output_path()[i]; + std::size_t size = m.sizes()[i]; + + bool is_small_file = (size <= m_block_size * 1024); + bool am_i_responsible = !is_small_file || ((i % workers.size()) == static_cast(workers.rank())); + + if (am_i_responsible) { + auto op_workers = workers; + if(is_small_file && workers.size() > 1) { + std::vector self_rank = { workers.rank() }; + auto group = workers.group().include(self_rank.begin(), self_rank.end()); + op_workers = ::make_communicator(workers, group, 0); + } + + auto op = operation::make_operation(t, op_workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); + op->set_comm(msg.source(), m.tid(), i, t); + + update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); + + // Setup the operation. If it fails, report failure immediately. 
+ if (op->setup() == error_code::success) { + update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f); + m_ops.emplace(std::make_pair(m.tid(), i), std::move(op)); + } else { + LOGGER_ERROR("Operation setup failed for transfer {} file {}", op->tid(), op->input_path()); + update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, op->status()); + } + + } else { + // This part is critical for the "many small files" case to unblock the master. + update_state(msg.source(), m.tid(), i, output_path, transfer_state::completed, 0.0f, error_code::success); + } + } + break; + } + case tag::bw_shaping: { + shaper_message m; + world.recv(msg.source(), msg.tag(), m); + LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); + for(auto& [key, op] : m_ops) { + if(op && op->tid() == m.tid()) { + op->set_bw_shaping(m.shaping()); + } + } + break; + } + case tag::shutdown: + LOGGER_INFO("msg => from: {} body: {{shutdown}}", msg.source()); + world.recv(msg.source(), msg.tag()); + return true; // Shutdown requested + + default: + LOGGER_WARN("[{}] Unexpected message tag: {}", msg.source(), msg.tag()); + // Discard unexpected message + if (auto count = msg.count()) { + std::vector discard_buffer(*count); + world.recv(msg.source(), msg.tag(), discard_buffer.data(), *count); + } else { + world.recv(msg.source(), msg.tag()); + } + break; + } + return false; // Not a shutdown request +} + int worker::run() { @@ -105,113 +218,8 @@ worker::run() { bool done = false; while(!done) { - // First, progress all ongoing operations. - for (auto it = m_ops.begin(); it != m_ops.end(); ) { - auto& op = it->second; - - auto status = op->progress(); - - if (status != operation::progress_status::InProgress) { - // Operation is Done or Failed, send final status and remove. - cargo::error_code final_ec = op->status(); - update_state(op->source(), op->tid(), op->seqno(), op->output_path(), - (status == operation::progress_status::Done) ? 
transfer_state::completed : transfer_state::failed, - 0.0f, final_ec); - it = m_ops.erase(it); - } else { - // Operation is still in progress, send intermediate status if there's new info. - if (op->bw() > 0.0f) { - update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw()); - } - ++it; - } - } - - // Second, check for new MPI messages. - auto maybe_msg = m_ops.empty() ? world.probe() : world.iprobe(); - - if(!maybe_msg) { - if (m_ops.empty()) { - std::this_thread::sleep_for(10ms); - } - continue; - } - - auto msg = *maybe_msg; - switch(const auto t = static_cast(msg.tag())) { - case tag::pread: - case tag::pwrite: - case tag::seq_mixed: - case tag::sequential: { - transfer_message m; - world.recv(msg.source(), msg.tag(), m); - LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); - - for(std::size_t i = 0; i < m.input_path().size(); i++) { - std::string input_path = m.input_path()[i]; - std::string output_path = m.output_path()[i]; - std::size_t size = m.sizes()[i]; - - bool is_small_file = (size <= m_block_size * 1024); - bool am_i_responsible = !is_small_file || ((i % workers.size()) == static_cast(workers.rank())); - - if (am_i_responsible) { - auto op_workers = workers; - if(is_small_file && workers.size() > 1) { - std::vector self_rank = { workers.rank() }; - auto group = workers.group().include(self_rank.begin(), self_rank.end()); - op_workers = ::make_communicator(workers, group, 0); - } - - auto op = operation::make_operation(t, op_workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); - op->set_comm(msg.source(), m.tid(), i, t); - - update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); - - // Setup the operation. If it fails, report failure immediately. 
- if (op->setup() == error_code::success) { - update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f); - m_ops.emplace(std::make_pair(m.tid(), i), std::move(op)); - } else { - LOGGER_ERROR("Operation setup failed for transfer {} file {}", op->tid(), op->input_path()); - update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, op->status()); - } - - } else { - // This part is critical for the "many small files" case to unblock the master. - update_state(msg.source(), m.tid(), i, output_path, transfer_state::completed, 0.0f, error_code::success); - } - } - break; - } - case tag::bw_shaping: { - shaper_message m; - world.recv(msg.source(), msg.tag(), m); - LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); - for(auto& [key, op] : m_ops) { - if(op && op->tid() == m.tid()) { - op->set_bw_shaping(m.shaping()); - } - } - break; - } - case tag::shutdown: - LOGGER_INFO("msg => from: {} body: {{shutdown}}", msg.source()); - world.recv(msg.source(), msg.tag()); - done = true; - break; - - default: - LOGGER_WARN("[{}] Unexpected message tag: {}", msg.source(), msg.tag()); - // Discard unexpected message - if (auto count = msg.count()) { - std::vector discard_buffer(*count); - world.recv(msg.source(), msg.tag(), discard_buffer.data(), *count); - } else { - world.recv(msg.source(), msg.tag()); - } - break; - } + progress_operations(); + done = handle_new_messages(workers, world); } LOGGER_INFO("Entering exit barrier..."); diff --git a/src/worker/worker.hpp b/src/worker/worker.hpp index b3c7ac9..40e9d82 100644 --- a/src/worker/worker.hpp +++ b/src/worker/worker.hpp @@ -30,6 +30,7 @@ #include #include #include "ops.hpp" +#include namespace cargo { class worker { @@ -45,6 +46,13 @@ public: int run(); +private: + void progress_operations(); + bool handle_new_messages(const boost::mpi::communicator& workers, const boost::mpi::communicator& world); + void update_state(int rank, std::uint64_t tid, std::uint32_t seqno, 
const std::string& name, + cargo::transfer_state st, float bw, + std::optional ec = std::nullopt); + private: // Key: {transfer_id, file_sequence_number} // Value: The operation object -- GitLab From c30beda0d8ec90b76205a86297027266a1f7a90c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 08:09:29 +0200 Subject: [PATCH 07/21] refactor 2 --- src/master.cpp | 68 ++++++-------- src/master.hpp | 4 + .../posix_file/fs_plugin/gekko_plugin.cpp | 51 +++++----- src/worker/base_operation.cpp | 11 +++ src/worker/base_operation.hpp | 6 +- src/worker/mpio_read.cpp | 10 -- src/worker/mpio_read.hpp | 3 - src/worker/seq_mixed.cpp | 11 --- src/worker/seq_mixed.hpp | 4 - src/worker/worker.cpp | 94 ++++++++++--------- src/worker/worker.hpp | 8 +- 11 files changed, 132 insertions(+), 138 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index aa49585..dd6437b 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -371,14 +371,39 @@ master_server::expand_transfer_requests( } void -master_server::transfer_dataset_internal(pending_transfer& pt) { +master_server::_dispatch_transfer_to_workers(const parallel_request& r, + const expanded_requests& expanded) { mpi::communicator world; + + if(expanded.sources.empty()) { + return; + } + + // Create parent directories for parallel write targets + for(const auto& d_item : expanded.targets) { + if(d_item.supports_parallel_transfer() && !std::filesystem::path(d_item.path()).parent_path().empty()) { + std::error_code fs_err; + std::filesystem::create_directories(std::filesystem::path(d_item.path()).parent_path(), fs_err); + if (fs_err) LOGGER_WARN("Could not create directory {}: {}", d_item.path(), fs_err.message()); + } + } + + // Send messages to workers + for(std::size_t rank = 1; rank <= r.nworkers(); ++rank) { + auto mutable_sizes = expanded.sizes; + const auto [t, m] = make_message(r.tid(), 0, expanded.sources, expanded.targets, mutable_sizes); + LOGGER_INFO("msg <= to: {} body: {}", rank, m); + 
world.send(static_cast(rank), t, m); + } +} + +void +master_server::transfer_dataset_internal(pending_transfer& pt) { time_t now = time(nullptr); auto expanded = expand_transfer_requests(pt.m_sources, pt.m_targets, now); pt.m_expanded_sources = std::move(expanded.sources); pt.m_expanded_targets = std::move(expanded.targets); - auto& v_size_new = expanded.sizes; auto ec = m_request_manager.update(pt.m_p.tid(), pt.m_expanded_sources.size(), pt.m_p.nworkers()); if(ec != error_code::success) { @@ -387,23 +412,8 @@ master_server::transfer_dataset_internal(pending_transfer& pt) { }; assert(pt.m_expanded_sources.size() == pt.m_expanded_targets.size()); - for(const auto& d : pt.m_expanded_targets) { - if(d.supports_parallel_transfer() && !std::filesystem::path(d.path()).parent_path().empty()) { - std::error_code fs_ec; - std::filesystem::create_directories(std::filesystem::path(d.path()).parent_path(), fs_ec); - if (fs_ec) { - LOGGER_ERROR("Failed to create directory {}: {}", std::filesystem::path(d.path()).parent_path().string(), fs_ec.message()); - } - } - } - - if(!pt.m_expanded_sources.empty()) { - for(std::size_t rank = 1; rank <= pt.m_p.nworkers(); ++rank) { - const auto [t, m] = make_message(pt.m_p.tid(), 0, pt.m_expanded_sources, pt.m_expanded_targets, v_size_new); - LOGGER_INFO("msg <= to: {} body: {}", rank, m); - world.send(static_cast(rank), t, m); - } - } + + _dispatch_transfer_to_workers(pt.m_p, expanded); } void @@ -432,11 +442,8 @@ master_server::do_transfer_datasets(const network::request req, LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); auto expanded = expand_transfer_requests(sources, targets); - auto& v_s_new = expanded.sources; - auto& v_d_new = expanded.targets; - auto& v_size_new = expanded.sizes; - m_request_manager.create(v_s_new.size(), world.size() - 1) + m_request_manager.create(expanded.sources.size(), world.size() - 1) .or_else([&](auto&& ec) { LOGGER_ERROR("Failed to create request: {}", ec); 
req.respond(generic_response{rpc.id(), ec}); @@ -451,20 +458,7 @@ master_server::do_transfer_datasets(const network::request req, m_ftio_tid.store(r.tid()); LOGGER_INFO("Stored stage-out information for transfer {}", r.tid()); } else { - if(!v_s_new.empty()) { - for(const auto& d_item : v_d_new) { - if(d_item.supports_parallel_transfer() && !std::filesystem::path(d_item.path()).parent_path().empty()) { - std::error_code fs_err; - std::filesystem::create_directories(std::filesystem::path(d_item.path()).parent_path(), fs_err); - if (fs_err) LOGGER_WARN("Could not create directory {}: {}", d_item.path(), fs_err.message()); - } - } - for(std::size_t rank = 1; rank <= r.nworkers(); ++rank) { - const auto [t, m] = make_message(r.tid(), 0, v_s_new, v_d_new, v_size_new); - LOGGER_INFO("msg <= to: {} body: {}", rank, m); - world.send(static_cast(rank), t, m); - } - } + _dispatch_transfer_to_workers(r, expanded); } req.respond(response_with_id{rpc.id(), error_code::success, r.tid()}); diff --git a/src/master.hpp b/src/master.hpp index 5a10585..e5834bb 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -113,6 +113,10 @@ private: void transfer_dataset_internal(pending_transfer& pt); + + void + _dispatch_transfer_to_workers(const parallel_request& r, + const expanded_requests& expanded); private: // Dedicated execution stream for the MPI listener ULT diff --git a/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp b/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp index 9b070e8..cecbb60 100644 --- a/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp +++ b/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp @@ -1,4 +1,5 @@ + #include "fs_plugin.hpp" #include "gekko_plugin.hpp" #include @@ -71,30 +72,32 @@ gekko_plugin::unlink(const std::string& path) { std::vector gekko_plugin::readdir(const std::string& path) { - // Fill recursively the files, checking if the file is a directory - std::vector files; std::vector final_list; - files = 
gkfs::syscall::gkfs_get_file_list(path); - - for(auto& file : files) { - - struct stat buf; - std::string correct_path = file; - if(path.size() != 1) { - correct_path = path + "/" + file; - } else { - correct_path = "/" + file; - } - - stat(correct_path, &buf); - - if(S_ISDIR(buf.st_mode)) { - - std::vector subfiles = readdir(correct_path); - final_list.insert(final_list.end(), subfiles.begin(), - subfiles.end()); - } else { - final_list.push_back(correct_path); + std::vector dir_queue; + dir_queue.push_back(path); + + while(!dir_queue.empty()) { + std::string current_path = dir_queue.back(); + dir_queue.pop_back(); + + std::vector entries = gkfs::syscall::gkfs_get_file_list(current_path); + for(auto& entry : entries) { + struct stat buf; + std::string full_path; + + if(current_path.size() == 1 && current_path[0] == '/') { + full_path = "/" + entry; + } else { + full_path = current_path + "/" + entry; + } + + if(stat(full_path, &buf) == 0) { + if(S_ISDIR(buf.st_mode)) { + dir_queue.push_back(full_path); + } else { + final_list.push_back(full_path); + } + } } } @@ -118,4 +121,4 @@ gekko_plugin::size(const std::string& path) { } -} // namespace cargo +} // namespace cargo \ No newline at end of file diff --git a/src/worker/base_operation.cpp b/src/worker/base_operation.cpp index 164a58a..24c1645 100644 --- a/src/worker/base_operation.cpp +++ b/src/worker/base_operation.cpp @@ -23,6 +23,7 @@ *****************************************************************************/ #include "base_operation.hpp" +#include namespace cargo { @@ -38,4 +39,14 @@ base_operation::base_operation(mpi::communicator workers, std::filesystem::path m_workers_rank = m_single ? 
0 : m_workers.rank(); } +void base_operation::_calculate_file_ranges() { + using posix_file::views::all_of; + using posix_file::views::as_blocks; + using posix_file::views::strided; + auto file_view = posix_file::file{m_input_path, m_fs_i_type}; + for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { + m_file_ranges.push_back(range); + } +} + } // namespace cargo \ No newline at end of file diff --git a/src/worker/base_operation.hpp b/src/worker/base_operation.hpp index 9aa1c91..dbebdfc 100644 --- a/src/worker/base_operation.hpp +++ b/src/worker/base_operation.hpp @@ -28,7 +28,7 @@ #include "ops.hpp" #include "memory.hpp" #include -#include +#include namespace mpi = boost::mpi; @@ -44,6 +44,8 @@ public: std::string input_path() const final { return m_input_path.string(); } protected: + void _calculate_file_ranges(); + mpi::communicator m_workers; std::filesystem::path m_input_path{}; std::filesystem::path m_output_path{}; @@ -57,6 +59,8 @@ protected: FSPlugin::type m_fs_o_type; std::size_t m_file_size; bool m_single; + + std::vector m_file_ranges; }; } // namespace cargo diff --git a/src/worker/mpio_read.cpp b/src/worker/mpio_read.cpp index 65eaba9..0fc9940 100644 --- a/src/worker/mpio_read.cpp +++ b/src/worker/mpio_read.cpp @@ -41,16 +41,6 @@ mpio_read::mpio_read(mpi::communicator workers, : base_operation(std::move(workers), std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single) {} -void mpio_read::_calculate_file_ranges() { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - auto file_view = posix_file::file{m_input_path, m_fs_i_type}; - for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - m_file_ranges.push_back(range); - } -} - cargo::error_code mpio_read::setup() { m_status = error_code::transfer_in_progress; diff --git 
a/src/worker/mpio_read.hpp b/src/worker/mpio_read.hpp index b0c83f9..22661b8 100644 --- a/src/worker/mpio_read.hpp +++ b/src/worker/mpio_read.hpp @@ -28,7 +28,6 @@ #include "base_operation.hpp" #include "memory.hpp" #include -#include namespace mpi = boost::mpi; @@ -45,11 +44,9 @@ public: progress_status progress() final; private: - void _calculate_file_ranges(); std::unique_ptr m_output_file; memory_buffer m_buffer; std::vector m_buffer_regions; - std::vector m_file_ranges; size_t m_current_block_index = 0; }; diff --git a/src/worker/seq_mixed.cpp b/src/worker/seq_mixed.cpp index 8a41ff6..b59d7c8 100644 --- a/src/worker/seq_mixed.cpp +++ b/src/worker/seq_mixed.cpp @@ -39,17 +39,6 @@ seq_mixed_operation::seq_mixed_operation(mpi::communicator workers, : base_operation(std::move(workers), std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single) {} - -void seq_mixed_operation::_calculate_file_ranges() { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - auto file_view = posix_file::file{m_input_path, m_fs_i_type}; - for(const auto& range : all_of(file_view) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - m_file_ranges.push_back(range); - } -} - cargo::error_code seq_mixed_operation::setup() { m_status = error_code::transfer_in_progress; diff --git a/src/worker/seq_mixed.hpp b/src/worker/seq_mixed.hpp index 66d6010..71200cf 100644 --- a/src/worker/seq_mixed.hpp +++ b/src/worker/seq_mixed.hpp @@ -27,7 +27,6 @@ #include "base_operation.hpp" #include -#include #include "memory.hpp" namespace mpi = boost::mpi; @@ -47,12 +46,9 @@ public: progress_status progress() final; private: - void _calculate_file_ranges(); - std::unique_ptr m_input_file; std::unique_ptr m_output_file; - std::vector m_file_ranges; size_t m_current_block_index = 0; memory_buffer m_buffer; diff --git a/src/worker/worker.cpp b/src/worker/worker.cpp index c58a192..ad541f3 100644 --- 
a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -67,9 +67,8 @@ worker::set_block_size(std::uint64_t block_size) { m_block_size = block_size; } - void -worker::update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, +worker::_update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, cargo::transfer_state st, float bw, std::optional ec) { @@ -80,7 +79,7 @@ worker::update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std } void -worker::progress_operations() { +worker::_progress_operations() { for (auto it = m_ops.begin(); it != m_ops.end(); ) { auto& op = it->second; @@ -89,22 +88,63 @@ worker::progress_operations() { if (status != operation::progress_status::InProgress) { // Operation is Done or Failed, send final status and remove. cargo::error_code final_ec = op->status(); - update_state(op->source(), op->tid(), op->seqno(), op->output_path(), + _update_state(op->source(), op->tid(), op->seqno(), op->output_path(), (status == operation::progress_status::Done) ? transfer_state::completed : transfer_state::failed, 0.0f, final_ec); it = m_ops.erase(it); } else { // Operation is still in progress, send intermediate status if there's new info. 
if (op->bw() > 0.0f) { - update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw()); + _update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw()); } ++it; } } } +void +worker::_process_transfer_message(const transfer_message& m, int source, int tag, + const mpi::communicator& workers) +{ + for(std::size_t i = 0; i < m.input_path().size(); i++) { + std::string input_path = m.input_path()[i]; + std::string output_path = m.output_path()[i]; + std::size_t size = m.sizes()[i]; + + bool is_small_file = (size <= m_block_size * 1024); + bool am_i_responsible = !is_small_file || ((i % workers.size()) == static_cast(workers.rank())); + + if (am_i_responsible) { + auto op_workers = workers; + if(is_small_file && workers.size() > 1) { + std::vector self_rank = { workers.rank() }; + auto group = workers.group().include(self_rank.begin(), self_rank.end()); + op_workers = ::make_communicator(workers, group, 0); + } + + auto op = operation::make_operation(static_cast(tag), op_workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); + op->set_comm(source, m.tid(), i, static_cast(tag)); + + _update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); + + // Setup the operation. If it fails, report failure immediately. + if (op->setup() == error_code::success) { + _update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f); + m_ops.emplace(std::make_pair(m.tid(), i), std::move(op)); + } else { + LOGGER_ERROR("Operation setup failed for transfer {} file {}", op->tid(), op->input_path()); + _update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, op->status()); + } + + } else { + // This part is critical for the "many small files" case to unblock the master. 
+ _update_state(source, m.tid(), i, output_path, transfer_state::completed, 0.0f, error_code::success); + } + } +} + bool -worker::handle_new_messages(const mpi::communicator& workers, const mpi::communicator& world) { +worker::_handle_incoming_message(const mpi::communicator& workers, const mpi::communicator& world) { auto maybe_msg = m_ops.empty() ? world.probe() : world.iprobe(); if(!maybe_msg) { @@ -116,6 +156,7 @@ worker::handle_new_messages(const mpi::communicator& workers, const mpi::communi auto msg = *maybe_msg; switch(const auto t = static_cast(msg.tag())) { + (void)t; case tag::pread: case tag::pwrite: case tag::seq_mixed: @@ -123,42 +164,7 @@ worker::handle_new_messages(const mpi::communicator& workers, const mpi::communi transfer_message m; world.recv(msg.source(), msg.tag(), m); LOGGER_INFO("msg => from: {} body: {}", msg.source(), m); - - for(std::size_t i = 0; i < m.input_path().size(); i++) { - std::string input_path = m.input_path()[i]; - std::string output_path = m.output_path()[i]; - std::size_t size = m.sizes()[i]; - - bool is_small_file = (size <= m_block_size * 1024); - bool am_i_responsible = !is_small_file || ((i % workers.size()) == static_cast(workers.rank())); - - if (am_i_responsible) { - auto op_workers = workers; - if(is_small_file && workers.size() > 1) { - std::vector self_rank = { workers.rank() }; - auto group = workers.group().include(self_rank.begin(), self_rank.end()); - op_workers = ::make_communicator(workers, group, 0); - } - - auto op = operation::make_operation(t, op_workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); - op->set_comm(msg.source(), m.tid(), i, t); - - update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); - - // Setup the operation. If it fails, report failure immediately. 
- if (op->setup() == error_code::success) { - update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f); - m_ops.emplace(std::make_pair(m.tid(), i), std::move(op)); - } else { - LOGGER_ERROR("Operation setup failed for transfer {} file {}", op->tid(), op->input_path()); - update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, op->status()); - } - - } else { - // This part is critical for the "many small files" case to unblock the master. - update_state(msg.source(), m.tid(), i, output_path, transfer_state::completed, 0.0f, error_code::success); - } - } + _process_transfer_message(m, msg.source(), msg.tag(), workers); break; } case tag::bw_shaping: { @@ -218,8 +224,8 @@ worker::run() { bool done = false; while(!done) { - progress_operations(); - done = handle_new_messages(workers, world); + _progress_operations(); + done = _handle_incoming_message(workers, world); } LOGGER_INFO("Entering exit barrier..."); diff --git a/src/worker/worker.hpp b/src/worker/worker.hpp index 40e9d82..cc2263f 100644 --- a/src/worker/worker.hpp +++ b/src/worker/worker.hpp @@ -47,13 +47,13 @@ public: run(); private: - void progress_operations(); - bool handle_new_messages(const boost::mpi::communicator& workers, const boost::mpi::communicator& world); - void update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, + void _progress_operations(); + bool _handle_incoming_message(const boost::mpi::communicator& workers, const boost::mpi::communicator& world); + void _process_transfer_message(const transfer_message& msg, int source, int tag, const boost::mpi::communicator& workers); + void _update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, cargo::transfer_state st, float bw, std::optional ec = std::nullopt); -private: // Key: {transfer_id, file_sequence_number} // Value: The operation object std::map, std::unique_ptr> m_ops; -- GitLab From 
dbdf21300949240a21064954700cca37f671e60a Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 08:16:41 +0200 Subject: [PATCH 08/21] refactor 3 --- src/posix_file/CMakeLists.txt | 1 + src/posix_file/posix_file/file.cpp | 47 +++++++++++++++++++ src/posix_file/posix_file/file.hpp | 21 +++------ .../posix_file/fs_plugin/gekko_plugin.cpp | 2 + .../posix_file/fs_plugin/hercules_plugin.cpp | 32 ++++++++++++- .../posix_file/fs_plugin/hercules_plugin.hpp | 11 ++++- 6 files changed, 98 insertions(+), 16 deletions(-) create mode 100644 src/posix_file/posix_file/file.cpp diff --git a/src/posix_file/CMakeLists.txt b/src/posix_file/CMakeLists.txt index f81927b..a7921c8 100644 --- a/src/posix_file/CMakeLists.txt +++ b/src/posix_file/CMakeLists.txt @@ -42,6 +42,7 @@ target_sources( posix_file PRIVATE posix_file/types.hpp posix_file/file.hpp + posix_file/file.cpp posix_file/ranges.hpp posix_file/views.hpp posix_file/math.hpp diff --git a/src/posix_file/posix_file/file.cpp b/src/posix_file/posix_file/file.cpp new file mode 100644 index 0000000..baf44a5 --- /dev/null +++ b/src/posix_file/posix_file/file.cpp @@ -0,0 +1,47 @@ +/****************************************************************************** + * Copyright 2022-2023, Barcelona Supercomputing Center (BSC), Spain + * + * This software was partially supported by the EuroHPC-funded project ADMIRE + * (Project ID: 956748, https://www.admire-eurohpc.eu). + * + * This file is part of Cargo. + * + * Cargo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Cargo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Cargo. If not, see . + * + * SPDX-License-Identifier: GPL-3.0-or-later + *****************************************************************************/ + +#include "file.hpp" +#include + +namespace posix_file { + +void +recursive_mkdir(const std::filesystem::path& path, + const std::shared_ptr& fs_plugin) { + if (path.empty() || path == path.root_path()) { + return; + } + struct stat st; + if (fs_plugin->stat(path.string(), &st) == 0 && S_ISDIR(st.st_mode)) { + return; // Directory already exists. + } + // If parent path is not root, recurse. + if (path.has_parent_path() && path.parent_path() != path) { + recursive_mkdir(path.parent_path(), fs_plugin); + } + fs_plugin->mkdir(path.string(), 0755); +} + +} // namespace posix_file \ No newline at end of file diff --git a/src/posix_file/posix_file/file.hpp b/src/posix_file/posix_file/file.hpp index 8b12b14..0ecd168 100644 --- a/src/posix_file/posix_file/file.hpp +++ b/src/posix_file/posix_file/file.hpp @@ -32,7 +32,6 @@ #include #include #include "fs_plugin/fs_plugin.hpp" -#include "cargo.hpp" #include extern "C" { #include @@ -152,6 +151,10 @@ private: int m_error_code; }; +void +recursive_mkdir(const std::filesystem::path& path, + const std::shared_ptr& fs_plugin); + class file { @@ -295,15 +298,6 @@ protected: std::shared_ptr m_fs_plugin; }; -static void -recursive_mkdir(const std::filesystem::path& path, - std::shared_ptr fs_plugin) { - if(path.has_parent_path() and path != "/") { - recursive_mkdir(path.parent_path(), fs_plugin); - fs_plugin->mkdir(path.parent_path().c_str(), 0755); - } -} - static inline file open(const std::filesystem::path& filepath, int flags, ::mode_t mode, cargo::FSPlugin::type t) { @@ -311,10 +305,9 @@ open(const std::filesystem::path& filepath, int flags, ::mode_t mode, std::shared_ptr fs_plugin; fs_plugin = cargo::FSPlugin::make_fs(t); - // We don't check if it exists, we just create it if flags is set 
to O_CREAT - if(flags & O_CREAT) { - recursive_mkdir(filepath, fs_plugin); + if((flags & O_CREAT) && filepath.has_parent_path()) { + recursive_mkdir(filepath.parent_path(), fs_plugin); } int fd = fs_plugin->open(filepath.c_str(), flags, mode); @@ -334,4 +327,4 @@ create(const std::filesystem::path& filepath, int flags, ::mode_t mode, } // namespace posix_file -#endif // POSIX_FILE_FILE_HPP +#endif // POSIX_FILE_FILE_HPP \ No newline at end of file diff --git a/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp b/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp index cecbb60..941b2c2 100644 --- a/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp +++ b/src/posix_file/posix_file/fs_plugin/gekko_plugin.cpp @@ -5,6 +5,8 @@ #include #include +#include + namespace cargo { gekko_plugin::gekko_plugin() { int result = gkfs_init(); diff --git a/src/posix_file/posix_file/fs_plugin/hercules_plugin.cpp b/src/posix_file/posix_file/fs_plugin/hercules_plugin.cpp index f3c4a11..91422b5 100644 --- a/src/posix_file/posix_file/fs_plugin/hercules_plugin.cpp +++ b/src/posix_file/posix_file/fs_plugin/hercules_plugin.cpp @@ -1,4 +1,5 @@ + #include "fs_plugin.hpp" #include "hercules_plugin.hpp" @@ -61,4 +62,33 @@ hercules_plugin::fallocate(int fd, int mode, off_t offset, off_t len) { (void) len; return len; } -} // namespace cargo + +std::vector +hercules_plugin::readdir(const std::string& path) { + (void) path; + std::cerr << "hercules_plugin readdir not supported" << std::endl; + return {}; +} + +int +hercules_plugin::unlink(const std::string& path) { + (void) path; + std::cerr << "hercules_plugin unlink not supported" << std::endl; + return 0; +} + +int +hercules_plugin::stat(const std::string& path, struct stat* buf) { + (void) path; + (void) buf; + std::cerr << "hercules_plugin stat not supported" << std::endl; + return 0; +} + +ssize_t +hercules_plugin::size(const std::string& path) { + (void) path; + std::cerr << "hercules_plugin size not supported" << std::endl; + return 0; 
+} +} // namespace cargo \ No newline at end of file diff --git a/src/posix_file/posix_file/fs_plugin/hercules_plugin.hpp b/src/posix_file/posix_file/fs_plugin/hercules_plugin.hpp index fdef892..53ead12 100644 --- a/src/posix_file/posix_file/fs_plugin/hercules_plugin.hpp +++ b/src/posix_file/posix_file/fs_plugin/hercules_plugin.hpp @@ -1,4 +1,5 @@ + #ifndef HERCULES_PLUGIN_HPP #define HERCULES_PLUGIN_HPP @@ -24,7 +25,15 @@ public: lseek(int fd, off_t offset, int whence) final; off_t fallocate(int fd, int mode, off_t offset, off_t len) final; + std::vector + readdir(const std::string& path) final; + int + unlink(const std::string& path) final; + int + stat(const std::string& path, struct stat* buf) final; + ssize_t + size(const std::string& path) final; }; }; // namespace cargo -#endif // HERCULES_PLUGIN_HPP +#endif // HERCULES_PLUGIN_HPP \ No newline at end of file -- GitLab From c3ffa26ba7acd5ab042803c6ee2fcd46f6d69606 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 08:33:57 +0200 Subject: [PATCH 09/21] last refactor --- CMakeLists.txt | 114 ++---------------- cmake/FetchContentWrapper.cmake | 100 +++++++++++++++ src/net/CMakeLists.txt | 33 +---- src/posix_file/CMakeLists.txt | 45 +++---- .../posix_file/fs_plugin/fs_plugin.cpp | 49 +++----- 5 files changed, 144 insertions(+), 197 deletions(-) create mode 100644 cmake/FetchContentWrapper.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 786f534..b97e812 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,106 +36,12 @@ project( set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -#[=======================================================================[.rst: - - include_from_source(contentName ...) - -The ``include_from_source()`` function ensures that ``contentName`` is -populated and potentially added to the build by the time it returns. - -**Options:** - - ``SOURCE_DIR ``: Source directory into which downloaded contents reside. 
- This must point to an existing directory where the external project has - already been unpacked or cloned/checked out. If ```` doesn't exist, - the source code will be retrieved. - - ``GIT_REPOSITORY `` - URL of the git repository. Any URL understood by the ``git`` command - may be used. - - ``GIT_TAG `` - Git branch name, tag or commit hash. Note that branch names and tags should - generally be specified as remote names (i.e. origin/myBranch rather than - simply myBranch). This ensures that if the remote end has its tag moved or - branch rebased or history rewritten, the local clone will still be updated - correctly. In general, however, specifying a commit hash should be - preferred for a number of reasons: - - If the local clone already has the commit corresponding to the hash, no git - fetch needs to be performed to check for changes each time CMake is re-run. - This can result in a significant speed up if many external projects are - being used. - - Using a specific git hash ensures that the main project's own history is - fully traceable to a specific point in the external project's evolution. - If a branch or tag name is used instead, then checking out a specific - commit of the main project doesn't necessarily pin the whole build to a - specific point in the life of the external project. The lack of such - deterministic behavior makes the main project lose traceability and - repeatability. - - NOTE: If both ``SOURCE_DIR`` and ``GIT_REPOSITORY`` are specified, - ``SOURCE_DIR`` will be the preferred location to populate ``contentName`` - from. If ``SOURCE_DIR`` doesn't exist, the function will fall back to the - location defined by ``GIT_REPOSITORY``. 
- -#]=======================================================================] -function(include_from_source contentName) - - set(OPTIONS) - set(SINGLE_VALUE MESSAGE SOURCE_DIR GIT_REPOSITORY GIT_TAG) - set(MULTI_VALUE) - - cmake_parse_arguments(ARGS "${OPTIONS}" "${SINGLE_VALUE}" "${MULTI_VALUE}" ${ARGN}) - - if (ARGS_MESSAGE) - message(STATUS ${ARGS_MESSAGE}) - endif () - - include(FetchContent) - - if (EXISTS ${ARGS_SOURCE_DIR}) - file(GLOB_RECURSE SOURCE_FILES "${ARGS_SOURCE_DIR}/*") - if (SOURCE_FILES STREQUAL "") - message(FATAL_ERROR - "The '${ARGS_SOURCE_DIR}' source directory appears " - "to be empty. If it corresponds to a git submodule it may not have " - "been properly initialized. Running:\n" - " 'git submodule update --init --recursive'\n" - "may fix the issue. If the directory corresponds to a manually " - "downloaded dependency, please download it again.") - endif () - - message(STATUS "Found source directory for '${contentName}'. Building.") - FetchContent_Declare( - ${contentName} - SOURCE_DIR ${ARGS_SOURCE_DIR} - ) - else () - message(STATUS - "Source directory for '${contentName}' not found.\n" - "Downloading and building from remote Git repository.") - - if (NOT ARGS_GIT_REPOSITORY) - message(FATAL_ERROR "GIT_REPOSITORY for \"${contentName}\" not defined") - endif () - - if (NOT ARGS_GIT_TAG) - message(FATAL_ERROR "GIT_TAG for \"${contentName}\" not defined") - endif () - - FetchContent_Declare( - ${contentName} - GIT_REPOSITORY ${ARGS_GIT_REPOSITORY} - GIT_TAG ${ARGS_GIT_TAG} - GIT_SHALLOW ON - GIT_PROGRESS ON - ) - endif () - - FetchContent_MakeAvailable(${contentName}) -endfunction() +# Make sure that CMake can find our internal modules +list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +# Import some convenience functions +include(FetchContentWrapper) +include(cargo-utils) # Set default build type and also populate a list of available options @@ -202,12 +108,6 @@ include(GNUInstallDirs) # define options that depend on 
other options include(CMakeDependentOption) -# Make sure that CMake can find our internal modules -list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") - -# Import some convenience functions -include(cargo-utils) - # ############################################################################## # Project configuration options # ############################################################################## @@ -508,4 +408,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}-${PROJECT_VERSION} - ) + ) \ No newline at end of file diff --git a/cmake/FetchContentWrapper.cmake b/cmake/FetchContentWrapper.cmake new file mode 100644 index 0000000..85aaa1d --- /dev/null +++ b/cmake/FetchContentWrapper.cmake @@ -0,0 +1,100 @@ +#[=======================================================================[.rst: + + include_from_source(contentName ...) + +The ``include_from_source()`` function ensures that ``contentName`` is +populated and potentially added to the build by the time it returns. + +**Options:** + + ``SOURCE_DIR ``: Source directory into which downloaded contents reside. + This must point to an existing directory where the external project has + already been unpacked or cloned/checked out. If ```` doesn't exist, + the source code will be retrieved. + + ``GIT_REPOSITORY `` + URL of the git repository. Any URL understood by the ``git`` command + may be used. + + ``GIT_TAG `` + Git branch name, tag or commit hash. Note that branch names and tags should + generally be specified as remote names (i.e. origin/myBranch rather than + simply myBranch). This ensures that if the remote end has its tag moved or + branch rebased or history rewritten, the local clone will still be updated + correctly. 
In general, however, specifying a commit hash should be + preferred for a number of reasons: + + If the local clone already has the commit corresponding to the hash, no git + fetch needs to be performed to check for changes each time CMake is re-run. + This can result in a significant speed up if many external projects are + being used. + + Using a specific git hash ensures that the main project's own history is + fully traceable to a specific point in the external project's evolution. + If a branch or tag name is used instead, then checking out a specific + commit of the main project doesn't necessarily pin the whole build to a + specific point in the life of the external project. The lack of such + deterministic behavior makes the main project lose traceability and + repeatability. + + NOTE: If both ``SOURCE_DIR`` and ``GIT_REPOSITORY`` are specified, + ``SOURCE_DIR`` will be the preferred location to populate ``contentName`` + from. If ``SOURCE_DIR`` doesn't exist, the function will fall back to the + location defined by ``GIT_REPOSITORY``. + +#]=======================================================================] +function(include_from_source contentName) + + set(OPTIONS) + set(SINGLE_VALUE MESSAGE SOURCE_DIR GIT_REPOSITORY GIT_TAG) + set(MULTI_VALUE) + + cmake_parse_arguments(ARGS "${OPTIONS}" "${SINGLE_VALUE}" "${MULTI_VALUE}" ${ARGN}) + + if (ARGS_MESSAGE) + message(STATUS ${ARGS_MESSAGE}) + endif () + + include(FetchContent) + + if (EXISTS ${ARGS_SOURCE_DIR}) + file(GLOB_RECURSE SOURCE_FILES "${ARGS_SOURCE_DIR}/*") + if (SOURCE_FILES STREQUAL "") + message(FATAL_ERROR + "The '${ARGS_SOURCE_DIR}' source directory appears " + "to be empty. If it corresponds to a git submodule it may not have " + "been properly initialized. Running:\n" + " 'git submodule update --init --recursive'\n" + "may fix the issue. 
If the directory corresponds to a manually " + "downloaded dependency, please download it again.") + endif () + + message(STATUS "Found source directory for '${contentName}'. Building.") + FetchContent_Declare( + ${contentName} + SOURCE_DIR ${ARGS_SOURCE_DIR} + ) + else () + message(STATUS + "Source directory for '${contentName}' not found.\n" + "Downloading and building from remote Git repository.") + + if (NOT ARGS_GIT_REPOSITORY) + message(FATAL_ERROR "GIT_REPOSITORY for \"${contentName}\" not defined") + endif () + + if (NOT ARGS_GIT_TAG) + message(FATAL_ERROR "GIT_TAG for \"${contentName}\" not defined") + endif () + + FetchContent_Declare( + ${contentName} + GIT_REPOSITORY ${ARGS_GIT_REPOSITORY} + GIT_TAG ${ARGS_GIT_TAG} + GIT_SHALLOW ON + GIT_PROGRESS ON + ) + endif () + + FetchContent_MakeAvailable(${contentName}) +endfunction() \ No newline at end of file diff --git a/src/net/CMakeLists.txt b/src/net/CMakeLists.txt index 5aa4c9d..0b8aafd 100644 --- a/src/net/CMakeLists.txt +++ b/src/net/CMakeLists.txt @@ -22,14 +22,7 @@ # SPDX-License-Identifier: GPL-3.0-or-later # ################################################################################ -add_library(rpc_common OBJECT) -target_sources( - rpc_common - INTERFACE endpoint.hpp request.hpp serialization.hpp utilities.hpp - signal_listener.hpp - PRIVATE endpoint.cpp -) - +add_library(rpc_common OBJECT endpoint.cpp) target_link_libraries(rpc_common PUBLIC logger::logger thallium asio::asio) set_property(TARGET rpc_common PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -37,32 +30,14 @@ set_property(TARGET rpc_common PROPERTY POSITION_INDEPENDENT_CODE ON) # headers from these libraries as `` get_filename_component(PARENT_DIRECTORY "../" ABSOLUTE) -add_library(rpc_client STATIC) -target_sources( - rpc_client - INTERFACE client.hpp - PRIVATE client.cpp -) - +add_library(rpc_client STATIC client.cpp) target_link_libraries(rpc_client PUBLIC rpc_common) target_include_directories(rpc_client PUBLIC ${PARENT_DIRECTORY}) 
set_property(TARGET rpc_client PROPERTY POSITION_INDEPENDENT_CODE ON) add_library(net::rpc_client ALIAS rpc_client) -add_library(rpc_server STATIC) -target_sources( - rpc_server - INTERFACE server.hpp - PRIVATE server.cpp -) - +add_library(rpc_server STATIC server.cpp) target_link_libraries(rpc_server PUBLIC rpc_common) target_include_directories(rpc_server PUBLIC ${PARENT_DIRECTORY}) set_property(TARGET rpc_server PROPERTY POSITION_INDEPENDENT_CODE ON) -add_library(net::rpc_server ALIAS rpc_server) - -add_library(net_net STATIC) -target_include_directories(net_net PUBLIC ${PARENT_DIRECTORY}) -target_link_libraries(net_net PUBLIC rpc_client rpc_server) -set_property(TARGET net_net PROPERTY POSITION_INDEPENDENT_CODE ON) -add_library(net::net ALIAS net_net) +add_library(net::rpc_server ALIAS rpc_server) \ No newline at end of file diff --git a/src/posix_file/CMakeLists.txt b/src/posix_file/CMakeLists.txt index a7921c8..a5a21a6 100644 --- a/src/posix_file/CMakeLists.txt +++ b/src/posix_file/CMakeLists.txt @@ -23,45 +23,29 @@ ################################################################################ add_library(posix_file STATIC) -set(GEKKO_INCLUDES "") + +target_sources( + posix_file + PRIVATE + posix_file/file.cpp + posix_file/fs_plugin/fs_plugin.cpp + posix_file/fs_plugin/posix_plugin.cpp + posix_file/fs_plugin/none_plugin.cpp +) if (GekkoFS_FOUND) - set(GEKKO_INCLUDES posix_file/fs_plugin/gekko_plugin.hpp posix_file/fs_plugin/gekko_plugin.cpp) + target_sources(posix_file PRIVATE posix_file/fs_plugin/gekko_plugin.cpp) endif() if (Expand_FOUND) - set(EXPAND_INCLUDES posix_file/fs_plugin/expand_plugin.hpp posix_file/fs_plugin/expand_plugin.cpp) + target_sources(posix_file PRIVATE posix_file/fs_plugin/expand_plugin.cpp) endif() if (Hercules_FOUND) - set(HERCULES_INCLUDES posix_file/fs_plugin/hercules_plugin.hpp posix_file/fs_plugin/hercules_plugin.cpp) + target_sources(posix_file PRIVATE posix_file/fs_plugin/hercules_plugin.cpp) endif() if (DataClay_FOUND) - 
set(DATACLAY_INCLUDES posix_file/fs_plugin/dataclay_plugin.hpp posix_file/fs_plugin/dataclay_plugin.cpp) + target_sources(posix_file PRIVATE posix_file/fs_plugin/dataclay_plugin.cpp) endif() -target_sources( - posix_file - PRIVATE posix_file/types.hpp - posix_file/file.hpp - posix_file/file.cpp - posix_file/ranges.hpp - posix_file/views.hpp - posix_file/math.hpp - posix_file/views/block_iterator.hpp - posix_file/views/strided_iterator.hpp - posix_file/fs_plugin/fs_plugin.hpp - posix_file/fs_plugin/posix_plugin.hpp - posix_file/fs_plugin/fs_plugin.cpp - posix_file/fs_plugin/posix_plugin.cpp - posix_file/fs_plugin/none_plugin.hpp - posix_file/fs_plugin/none_plugin.cpp - ${GEKKO_INCLUDES} - ${HERCULES_INCLUDES} - ${EXPAND_INCLUDES} - ${DATACLAY_INCLUDES} - -) - - target_include_directories(posix_file INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) set_property(TARGET posix_file PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -83,5 +67,4 @@ if (DataClay_FOUND) set(ADHOC ${ADHOC} DataClay::DataClay) endif() -target_link_libraries(posix_file INTERFACE fmt::fmt tl::expected PRIVATE ${ADHOC}) - +target_link_libraries(posix_file INTERFACE fmt::fmt tl::expected PRIVATE ${ADHOC}) \ No newline at end of file diff --git a/src/posix_file/posix_file/fs_plugin/fs_plugin.cpp b/src/posix_file/posix_file/fs_plugin/fs_plugin.cpp index e0dd508..f0d0e87 100644 --- a/src/posix_file/posix_file/fs_plugin/fs_plugin.cpp +++ b/src/posix_file/posix_file/fs_plugin/fs_plugin.cpp @@ -15,53 +15,42 @@ #endif namespace cargo { -static std::shared_ptr m_fs_posix; -static std::shared_ptr m_fs_gekkofs; -static std::shared_ptr m_fs_dataclay; -static std::shared_ptr m_fs_hercules; -static std::shared_ptr m_fs_expand; -static std::shared_ptr m_fs_none; - - - std::shared_ptr FSPlugin::make_fs(type t) { switch(t) { - case type::none: - if(m_fs_none == nullptr) - m_fs_none = std::make_shared(); - return m_fs_none; + case type::none: { + static auto m_fs_none = std::make_shared(); + return m_fs_none; + } case type::posix: - 
case type::parallel: - if(m_fs_posix == nullptr) - m_fs_posix = std::make_shared(); + case type::parallel: { + static auto m_fs_posix = std::make_shared(); return m_fs_posix; + } #ifdef GEKKOFS_PLUGIN - case type::gekkofs: - if(m_fs_gekkofs == nullptr) - m_fs_gekkofs = std::make_shared(); + case type::gekkofs: { + static auto m_fs_gekkofs = std::make_shared(); return m_fs_gekkofs; - + } #endif #ifdef DATACLAY_PLUGIN - case type::dataclay: - if(m_fs_dataclay == nullptr) - m_fs_dataclay = std::make_shared(); + case type::dataclay: { + static auto m_fs_dataclay = std::make_shared(); return m_fs_dataclay; - + } #endif #ifdef HERCULES_PLUGIN - case type::hercules: - if(m_fs_hercules == nullptr) - m_fs_hercules = std::make_shared(); + case type::hercules: { + static auto m_fs_hercules = std::make_shared(); return m_fs_hercules; + } #endif #ifdef EXPAND_PLUGIN - case type::expand: - if(m_fs_expand == nullptr) - m_fs_expand = std::make_shared(); + case type::expand: { + static auto m_fs_expand = std::make_shared(); return m_fs_expand; + } #endif default: return {}; -- GitLab From b7cb3fd3829db1a9f4ec867ec6436f3d1cd9706f Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 08:46:19 +0200 Subject: [PATCH 10/21] step 2 --- src/master.cpp | 208 +++++++++++++++++++++++++++---------------------- src/master.hpp | 16 ++-- 2 files changed, 122 insertions(+), 102 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index dd6437b..5b718e7 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -170,15 +170,16 @@ master_server::mpi_listener_ult() { mpi::communicator world; while(!m_shutting_down) { - if (auto msg = world.iprobe()) { - switch(static_cast(msg->tag())) { + if (auto optional_msg = world.iprobe()) { + auto& msg = *optional_msg; + switch(static_cast(msg.tag())) { case tag::status: { status_message m; - world.recv(msg->source(), msg->tag(), m); + world.recv(msg.source(), msg.tag(), m); LOGGER_DEBUG("msg => from: {} body: {{payload: {}}}", - msg->source(), m); + 
msg.source(), m); - m_request_manager.update(m.tid(), m.seqno(), msg->source() - 1, + m_request_manager.update(m.tid(), m.seqno(), msg.source() - 1, m.name(), m.state(), m.bw(), m.error_code()); break; @@ -186,12 +187,12 @@ master_server::mpi_listener_ult() { default: LOGGER_WARN("msg => from: {} body: {{Unexpected tag: {}}}", - msg->source(), msg->tag()); - if (auto count = msg->count()) { + msg.source(), msg.tag()); + if (auto count = msg.count()) { std::vector discard_buffer(*count); - world.recv(msg->source(), msg->tag(), discard_buffer.data(), *count); + world.recv(msg.source(), msg.tag(), discard_buffer.data(), *count); } else { - world.recv(msg->source(), msg->tag()); + world.recv(msg.source(), msg.tag()); } break; } @@ -229,36 +230,50 @@ master_server::ftio_scheduling_ult() { continue; } - LOGGER_INFO("FTIO triggered. Checking for work in {}", m_pending_transfer.m_sources); + LOGGER_INFO("FTIO triggered. Processing pending transfer {}", m_pending_transfer.m_p.tid()); - transfer_dataset_internal(m_pending_transfer); - - if (!m_pending_transfer.m_expanded_sources.empty()) { - LOGGER_INFO("Transferring {} files.", m_pending_transfer.m_expanded_sources.size()); - bool finished = false; - while(!finished && !m_shutting_down) { - thallium::thread::self().sleep(m_network_engine, 1000); // Poll status every second - m_request_manager.lookup(m_pending_transfer.m_p.tid()) - .map([&](auto&& rs) { - if (rs.state() == transfer_state::completed || rs.state() == transfer_state::failed) { - finished = true; - if (rs.state() == transfer_state::failed) { - LOGGER_ERROR("FTIO transfer {} failed with error: {}", m_pending_transfer.m_p.tid(), rs.error().value_or(error_code::other)); + time_t now = time(nullptr); + auto expanded = expand_transfer_requests(m_pending_transfer.m_sources, m_pending_transfer.m_targets, now); + + if(!expanded.sources.empty()){ + m_pending_transfer.m_expanded_sources = std::move(expanded.sources); + m_pending_transfer.m_expanded_targets = 
std::move(expanded.targets); + + auto ec = m_request_manager.update(m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size(), m_pending_transfer.m_p.nworkers()); + if(ec != error_code::success) { + LOGGER_ERROR("Failed to update request for FTIO transfer {}: {}", m_pending_transfer.m_p.tid(), ec); + } else { + assert(m_pending_transfer.m_expanded_sources.size() == m_pending_transfer.m_expanded_targets.size()); + _dispatch_transfer_to_workers(m_pending_transfer.m_p, expanded); + + LOGGER_INFO("FTIO transfer {} dispatched for {} files.", m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size()); + bool finished = false; + while(!finished && !m_shutting_down) { + thallium::thread::self().sleep(m_network_engine, 1000); // Poll status every second + m_request_manager.lookup(m_pending_transfer.m_p.tid()) + .map([&](auto&& rs) { + if (rs.state() == transfer_state::completed || rs.state() == transfer_state::failed) { + finished = true; + if (rs.state() == transfer_state::failed) { + LOGGER_ERROR("FTIO transfer {} failed with error: {}", m_pending_transfer.m_p.tid(), rs.error().value_or(error_code::other)); + } } - } - }); - } + }); + } - if(finished) { - LOGGER_INFO("Transfer finished for {} files.", m_pending_transfer.m_expanded_sources.size()); - if(!m_pending_transfer.m_expanded_sources.empty()) { - auto fs = FSPlugin::make_fs(static_cast(m_pending_transfer.m_expanded_sources[0].get_type())); - for(const auto& file : m_pending_transfer.m_expanded_sources) { - LOGGER_INFO("Deleting {}", file.path()); - fs->unlink(file.path()); + if(finished) { + LOGGER_INFO("Transfer finished for {} files.", m_pending_transfer.m_expanded_sources.size()); + if(!m_pending_transfer.m_expanded_sources.empty()) { + auto fs = FSPlugin::make_fs(static_cast(m_pending_transfer.m_expanded_sources[0].get_type())); + for(const auto& file : m_pending_transfer.m_expanded_sources) { + LOGGER_INFO("Deleting {}", file.path()); + fs->unlink(file.path()); + } } } } + } 
else { + LOGGER_INFO("FTIO triggered, but no new files to transfer for request {}.", m_pending_transfer.m_p.tid()); } if (m_period <= 0) { @@ -319,6 +334,55 @@ master_server::shutdown(const network::request& req) { server::shutdown(); } +void +master_server::_expand_source_target_pair( + const dataset& source, const dataset& target, + time_t mod_time_threshold, expanded_requests& result) { + + const auto& source_path = source.path(); + + auto fs = FSPlugin::make_fs(static_cast(source.get_type())); + struct stat buf; + auto rstat = fs->stat(source_path, &buf); + + if(rstat != 0) { + LOGGER_WARN("Cannot stat source path '{}', skipping.", source_path); + return; + } + + if (S_ISDIR(buf.st_mode)) { // It's a directory + LOGGER_INFO("Expanding input directory {}", source_path); + std::vector files = fs->readdir(source_path); // Recursive readdir + std::sort(files.begin(), files.end()); + + for(const auto& f : files) { + if(m_is_filtering_enabled && !std::regex_match(f, m_filename_pattern)) { + LOGGER_INFO("File {} IGNORED by regex", f); + continue; + } + + struct stat file_buf; + if (fs->stat(f, &file_buf) == 0) { + if (mod_time_threshold > 0 && file_buf.st_mtime >= mod_time_threshold) { + continue; + } + result.sources.emplace_back(f, source.get_type()); + result.sizes.push_back(file_buf.st_size); + std::filesystem::path relative_path = std::filesystem::relative(f, source_path); + result.targets.emplace_back(std::filesystem::path(target.path()) / relative_path, target.get_type()); + } + } + } else { // It's a file + if (mod_time_threshold > 0 && buf.st_mtime >= mod_time_threshold) { + return; + } + result.sources.push_back(source); + result.targets.push_back(target); + result.sizes.push_back(buf.st_size); + } +} + + master_server::expanded_requests master_server::expand_transfer_requests( const std::vector& sources, @@ -328,44 +392,7 @@ master_server::expand_transfer_requests( expanded_requests result; for(auto i = 0u; i < sources.size(); ++i) { - const auto& s = 
sources[i]; - const auto& d = targets[i]; - const auto& p = s.path(); - - auto fs = FSPlugin::make_fs(static_cast(s.get_type())); - struct stat buf; - auto rstat = fs->stat(p, &buf); - - if(rstat == 0 && S_ISDIR(buf.st_mode)) { - LOGGER_INFO("Expanding input directory {}", p); - std::vector files = fs->readdir(p); - std::sort(files.begin(), files.end()); - - for(const auto& f : files) { - if(m_is_filtering_enabled && !std::regex_match(f, m_filename_pattern)) { - LOGGER_INFO("File {} IGNORED by regex", f); - continue; - } - - struct stat file_buf; - if (fs->stat(f, &file_buf) == 0) { - if (modification_time_threshold > 0 && file_buf.st_mtime >= modification_time_threshold) { - continue; - } - result.sources.emplace_back(f, s.get_type()); - result.sizes.push_back(file_buf.st_size); - std::filesystem::path relative_path = std::filesystem::relative(f, p); - result.targets.emplace_back(std::filesystem::path(d.path()) / relative_path, d.get_type()); - } - } - } else if (rstat == 0) { - if (modification_time_threshold > 0 && buf.st_mtime >= modification_time_threshold) { - continue; - } - result.sources.push_back(s); - result.targets.push_back(d); - result.sizes.push_back(buf.st_size); - } + _expand_source_target_pair(sources[i], targets[i], modification_time_threshold, result); } return result; } @@ -397,25 +424,6 @@ master_server::_dispatch_transfer_to_workers(const parallel_request& r, } } -void -master_server::transfer_dataset_internal(pending_transfer& pt) { - time_t now = time(nullptr); - - auto expanded = expand_transfer_requests(pt.m_sources, pt.m_targets, now); - pt.m_expanded_sources = std::move(expanded.sources); - pt.m_expanded_targets = std::move(expanded.targets); - - auto ec = m_request_manager.update(pt.m_p.tid(), pt.m_expanded_sources.size(), pt.m_p.nworkers()); - if(ec != error_code::success) { - LOGGER_ERROR("Failed to update request: {}", ec); - return; - }; - - assert(pt.m_expanded_sources.size() == pt.m_expanded_targets.size()); - - 
_dispatch_transfer_to_workers(pt.m_p, expanded); -} - void master_server::transfer_datasets(const network::request& req, const std::vector& sources, @@ -441,9 +449,7 @@ master_server::do_transfer_datasets(const network::request req, LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); - auto expanded = expand_transfer_requests(sources, targets); - - m_request_manager.create(expanded.sources.size(), world.size() - 1) + m_request_manager.create(0, world.size() - 1) // Initially create with 0 files .or_else([&](auto&& ec) { LOGGER_ERROR("Failed to create request: {}", ec); req.respond(generic_response{rpc.id(), ec}); @@ -456,9 +462,21 @@ master_server::do_transfer_datasets(const network::request req, m_pending_transfer.m_targets = targets; m_pending_transfer.m_work = true; m_ftio_tid.store(r.tid()); - LOGGER_INFO("Stored stage-out information for transfer {}", r.tid()); + LOGGER_INFO("Stored stage-out information for future transfer {}", r.tid()); } else { - _dispatch_transfer_to_workers(r, expanded); + auto expanded = expand_transfer_requests(sources, targets); + if (!expanded.sources.empty()) { + auto ec = m_request_manager.update(r.tid(), expanded.sources.size(), r.nworkers()); + if (ec == error_code::success) { + _dispatch_transfer_to_workers(r, expanded); + } else { + LOGGER_ERROR("Failed to update request {}: {}", r.tid(), ec); + req.respond(generic_response{rpc.id(), ec}); + return; + } + } else { + LOGGER_INFO("No files to transfer for request {}", r.tid()); + } } req.respond(response_with_id{rpc.id(), error_code::success, r.tid()}); diff --git a/src/master.hpp b/src/master.hpp index e5834bb..32c846b 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -106,14 +106,16 @@ private: std::vector sizes; }; - expanded_requests expand_transfer_requests( - const std::vector& sources, - const std::vector& targets, - time_t modification_time_threshold = 0); - void - transfer_dataset_internal(pending_transfer& pt); - + 
_expand_source_target_pair(const dataset& source, const dataset& target, + time_t mod_time_threshold, + expanded_requests& result); + + expanded_requests + expand_transfer_requests(const std::vector& sources, + const std::vector& targets, + time_t modification_time_threshold = 0); + void _dispatch_transfer_to_workers(const parallel_request& r, const expanded_requests& expanded); -- GitLab From eba28f59b4cad80a632cd5189b11b7a0a018061f Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 08:50:51 +0200 Subject: [PATCH 11/21] fmt --- cli/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt index e7919d5..004e931 100644 --- a/cli/CMakeLists.txt +++ b/cli/CMakeLists.txt @@ -36,7 +36,12 @@ configure_file(cargoctl.in cargoctl @ONLY) ################################################################################ ## Common object library for CLI tools add_library(cli_common OBJECT common.cpp) - +target_link_libraries(cli_common PUBLIC + fmt::fmt + CLI11::CLI11 + net::rpc_client + cargo + ) ################################################################################ # Helper function to define a CLI tool function(add_cargo_cli_tool name source) -- GitLab From 82e796d8493304b4cbe4bd197c389a53037188e3 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 09:00:10 +0200 Subject: [PATCH 12/21] cmake --- src/posix_file/CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/posix_file/CMakeLists.txt b/src/posix_file/CMakeLists.txt index a5a21a6..e329863 100644 --- a/src/posix_file/CMakeLists.txt +++ b/src/posix_file/CMakeLists.txt @@ -26,11 +26,15 @@ add_library(posix_file STATIC) target_sources( posix_file - PRIVATE - posix_file/file.cpp - posix_file/fs_plugin/fs_plugin.cpp - posix_file/fs_plugin/posix_plugin.cpp - posix_file/fs_plugin/none_plugin.cpp + PRIVATE posix_file/types.hpp + posix_file/file.hpp + posix_file/file.cpp + 
posix_file/ranges.hpp + posix_file/views.hpp + posix_file/math.hpp + posix_file/views/block_iterator.hpp + posix_file/views/strided_iterator.hpp + ) if (GekkoFS_FOUND) -- GitLab From 8e86e7a556b8543ed233b9f8d7cb7704356f5ffc Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 09:27:04 +0200 Subject: [PATCH 13/21] solved cmake --- src/posix_file/CMakeLists.txt | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/posix_file/CMakeLists.txt b/src/posix_file/CMakeLists.txt index e329863..81897cf 100644 --- a/src/posix_file/CMakeLists.txt +++ b/src/posix_file/CMakeLists.txt @@ -26,15 +26,11 @@ add_library(posix_file STATIC) target_sources( posix_file - PRIVATE posix_file/types.hpp - posix_file/file.hpp - posix_file/file.cpp - posix_file/ranges.hpp - posix_file/views.hpp - posix_file/math.hpp - posix_file/views/block_iterator.hpp - posix_file/views/strided_iterator.hpp - + PRIVATE + posix_file/file.cpp + posix_file/fs_plugin/fs_plugin.cpp + posix_file/fs_plugin/posix_plugin.cpp + posix_file/fs_plugin/none_plugin.cpp ) if (GekkoFS_FOUND) @@ -71,4 +67,4 @@ if (DataClay_FOUND) set(ADHOC ${ADHOC} DataClay::DataClay) endif() -target_link_libraries(posix_file INTERFACE fmt::fmt tl::expected PRIVATE ${ADHOC}) \ No newline at end of file +target_link_libraries(posix_file PUBLIC fmt::fmt tl::expected PRIVATE ${ADHOC}) \ No newline at end of file -- GitLab From b8b22c197fba3efa1aa7cd5556286ccb0832d6ca Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 10:32:28 +0200 Subject: [PATCH 14/21] clean 100% cpu usage --- cli/common.cpp | 12 ++++ cli/common.hpp | 8 +++ cli/ftio.cpp | 33 +++------ cli/ping.cpp | 34 ++-------- cli/shaping.cpp | 32 ++------- cli/shutdown.cpp | 34 ++-------- src/master.cpp | 54 +++++++++------ src/master.hpp | 5 ++ src/net/server.cpp | 2 +- src/request_manager.cpp | 122 ++++++++++++++++++---------------- src/request_manager.hpp | 3 +- src/worker/base_operation.cpp | 31 +++++++++ 
src/worker/base_operation.hpp | 4 ++ src/worker/mpio_write.cpp | 45 +++---------- src/worker/worker.cpp | 43 ++++++------ src/worker/worker.hpp | 4 +- 16 files changed, 224 insertions(+), 242 deletions(-) diff --git a/cli/common.cpp b/cli/common.cpp index d80b73a..cccb92b 100644 --- a/cli/common.cpp +++ b/cli/common.cpp @@ -24,6 +24,7 @@ #include "common.hpp" #include #include +#include std::pair parse_address(const std::string& address) { @@ -34,4 +35,15 @@ parse_address(const std::string& address) { const auto protocol = address.substr(0, pos); return std::make_pair(protocol, address); +} + +void parse_rpc_command_line(int argc, char* argv[], CLI::App& app, std::string& server_address) { + app.add_option("-s,--server", server_address, "Server address") + ->option_text("ADDRESS") + ->required(); + try { + app.parse(argc, argv); + } catch(const CLI::ParseError& ex) { + std::exit(app.exit(ex)); + } } \ No newline at end of file diff --git a/cli/common.hpp b/cli/common.hpp index 3cb52c3..61d9501 100644 --- a/cli/common.hpp +++ b/cli/common.hpp @@ -28,9 +28,17 @@ #include #include +namespace CLI { + class App; +} + // Parses a server address string into protocol and address. // Throws a runtime_error if the address is invalid. 
std::pair parse_address(const std::string& address); + +void parse_rpc_command_line(int argc, char* argv[], CLI::App& app, std::string& server_address); + + #endif // CARGO_CLI_COMMON_HPP \ No newline at end of file diff --git a/cli/ftio.cpp b/cli/ftio.cpp index 52c748b..6f95e5c 100644 --- a/cli/ftio.cpp +++ b/cli/ftio.cpp @@ -31,8 +31,6 @@ #include "common.hpp" struct ftio_config { - std::string progname; - std::string server_address; float confidence; float probability; float period; @@ -41,18 +39,14 @@ struct ftio_config { bool resume{false}; }; -ftio_config -parse_command_line(int argc, char* argv[]) { +int +main(int argc, char* argv[]) { + std::string progname = std::filesystem::path{argv[0]}.filename().string(); + std::string server_address; ftio_config cfg; - cfg.progname = std::filesystem::path{argv[0]}.filename().string(); - - CLI::App app{"Cargo ftio client", cfg.progname}; - - app.add_option("-s,--server", cfg.server_address, "Server address") - ->option_text("ADDRESS") - ->required(); + CLI::App app{"Cargo ftio client", progname}; app.add_option("-c,--conf", cfg.confidence, "confidence") ->option_text("float") @@ -76,21 +70,10 @@ parse_command_line(int argc, char* argv[]) { app.add_flag("--resume", cfg.resume, "Trigger stage operation to resume, only pause or resume will take into account. 
Others parameters not used."); + parse_rpc_command_line(argc, argv, app, server_address); + try { - app.parse(argc, argv); - return cfg; - } catch(const CLI::ParseError& ex) { - std::exit(app.exit(ex)); - } -} - -int -main(int argc, char* argv[]) { - - ftio_config cfg = parse_command_line(argc, argv); - - try { - const auto [protocol, address] = parse_address(cfg.server_address); + const auto [protocol, address] = parse_address(server_address); network::client rpc_client{protocol}; if(const auto result = rpc_client.lookup(address); result.has_value()) { diff --git a/cli/ping.cpp b/cli/ping.cpp index ecb3bab..53ac89a 100644 --- a/cli/ping.cpp +++ b/cli/ping.cpp @@ -30,39 +30,17 @@ #include #include "common.hpp" -struct ping_config { - std::string progname; - std::string server_address; -}; - -ping_config -parse_command_line(int argc, char* argv[]) { - - ping_config cfg; - - cfg.progname = std::filesystem::path{argv[0]}.filename().string(); - - CLI::App app{"Cargo ping client", cfg.progname}; - - app.add_option("-s,--server", cfg.server_address, "Server address") - ->option_text("ADDRESS") - ->required(); - - try { - app.parse(argc, argv); - return cfg; - } catch(const CLI::ParseError& ex) { - std::exit(app.exit(ex)); - } -} - int main(int argc, char* argv[]) { - ping_config cfg = parse_command_line(argc, argv); + std::string progname = std::filesystem::path{argv[0]}.filename().string(); + std::string server_address; + + CLI::App app{"Cargo ping client", progname}; + parse_rpc_command_line(argc, argv, app, server_address); try { - const auto [protocol, address] = parse_address(cfg.server_address); + const auto [protocol, address] = parse_address(server_address); network::client rpc_client{protocol}; if(const auto result = rpc_client.lookup(address); result.has_value()) { diff --git a/cli/shaping.cpp b/cli/shaping.cpp index 8711759..98c69ed 100644 --- a/cli/shaping.cpp +++ b/cli/shaping.cpp @@ -31,24 +31,18 @@ #include "common.hpp" struct shaping_config { - 
std::string progname; - std::string server_address; std::int64_t tid; std::int16_t shaping; }; -shaping_config -parse_command_line(int argc, char* argv[]) { +int +main(int argc, char* argv[]) { + std::string progname = std::filesystem::path{argv[0]}.filename().string(); + std::string server_address; shaping_config cfg; - cfg.progname = std::filesystem::path{argv[0]}.filename().string(); - - CLI::App app{"Cargo shaping client", cfg.progname}; - - app.add_option("-s,--server", cfg.server_address, "Server address") - ->option_text("ADDRESS") - ->required(); + CLI::App app{"Cargo shaping client", progname}; app.add_option("-i,--tid", cfg.tid, "transfer id") ->option_text("integer") @@ -58,22 +52,10 @@ parse_command_line(int argc, char* argv[]) { ->option_text("integer") ->required(); + parse_rpc_command_line(argc, argv, app, server_address); try { - app.parse(argc, argv); - return cfg; - } catch(const CLI::ParseError& ex) { - std::exit(app.exit(ex)); - } -} - -int -main(int argc, char* argv[]) { - - shaping_config cfg = parse_command_line(argc, argv); - - try { - const auto [protocol, address] = parse_address(cfg.server_address); + const auto [protocol, address] = parse_address(server_address); network::client rpc_client{protocol}; if(const auto result = rpc_client.lookup(address); result.has_value()) { diff --git a/cli/shutdown.cpp b/cli/shutdown.cpp index bdfb4ba..2adbf64 100644 --- a/cli/shutdown.cpp +++ b/cli/shutdown.cpp @@ -30,39 +30,17 @@ #include #include "common.hpp" -struct shutdown_config { - std::string progname; - std::string server_address; -}; - -shutdown_config -parse_command_line(int argc, char* argv[]) { - - shutdown_config cfg; - - cfg.progname = std::filesystem::path{argv[0]}.filename().string(); - - CLI::App app{"Cargo shutdown client", cfg.progname}; - - app.add_option("-s,--server", cfg.server_address, "Server address") - ->option_text("ADDRESS") - ->required(); - - try { - app.parse(argc, argv); - return cfg; - } catch(const CLI::ParseError& ex) 
{ - std::exit(app.exit(ex)); - } -} - int main(int argc, char* argv[]) { - shutdown_config cfg = parse_command_line(argc, argv); + std::string progname = std::filesystem::path{argv[0]}.filename().string(); + std::string server_address; + + CLI::App app{"Cargo shutdown client", progname}; + parse_rpc_command_line(argc, argv, app, server_address); try { - const auto [protocol, address] = parse_address(cfg.server_address); + const auto [protocol, address] = parse_address(server_address); network::client rpc_client{protocol}; if(const auto result = rpc_client.lookup(address); result.has_value()) { diff --git a/src/master.cpp b/src/master.cpp index 5b718e7..ed76c1b 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -40,6 +40,7 @@ #include #include #include +#include using namespace std::literals; namespace mpi = boost::mpi; @@ -151,6 +152,7 @@ master_server::master_server(std::string name, std::string address, m_network_engine.push_prefinalize_callback([this]() { m_shutting_down = true; m_ftio_cv.notify_all(); // Wake up FTIO scheduler to exit + m_ftio_completion_cv.notify_all(); // Wake up FTIO scheduler if it is waiting for a transfer m_mpi_listener_ult->join(); m_mpi_listener_ult = thallium::managed{}; m_mpi_listener_ess->join(); @@ -182,6 +184,21 @@ master_server::mpi_listener_ult() { m_request_manager.update(m.tid(), m.seqno(), msg.source() - 1, m.name(), m.state(), m.bw(), m.error_code()); + + if (m_ftio.load() && m.tid() == m_ftio_tid.load()) { + m_request_manager.lookup(m.tid()).map([&](auto&& rs) { + if (rs.state() == transfer_state::completed || rs.state() == transfer_state::failed) { + { + abt::unique_lock lock(m_ftio_completion_mutex); + m_ftio_transfer_completed = true; + } + m_ftio_completion_cv.notify_one(); + if (rs.state() == transfer_state::failed) { + LOGGER_ERROR("FTIO transfer {} failed with error: {}", m.tid(), rs.error().value_or(error_code::other)); + } + } + }); + } break; } @@ -197,7 +214,7 @@ master_server::mpi_listener_ult() { break; } } 
else { - thallium::thread::self().sleep(m_network_engine, 10); + std::this_thread::sleep_for(1ms); } } @@ -246,29 +263,22 @@ master_server::ftio_scheduling_ult() { assert(m_pending_transfer.m_expanded_sources.size() == m_pending_transfer.m_expanded_targets.size()); _dispatch_transfer_to_workers(m_pending_transfer.m_p, expanded); - LOGGER_INFO("FTIO transfer {} dispatched for {} files.", m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size()); - bool finished = false; - while(!finished && !m_shutting_down) { - thallium::thread::self().sleep(m_network_engine, 1000); // Poll status every second - m_request_manager.lookup(m_pending_transfer.m_p.tid()) - .map([&](auto&& rs) { - if (rs.state() == transfer_state::completed || rs.state() == transfer_state::failed) { - finished = true; - if (rs.state() == transfer_state::failed) { - LOGGER_ERROR("FTIO transfer {} failed with error: {}", m_pending_transfer.m_p.tid(), rs.error().value_or(error_code::other)); - } - } - }); + LOGGER_INFO("FTIO transfer {} dispatched for {} files. 
Waiting for completion.", m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size()); + + m_ftio_transfer_completed = false; + { + abt::unique_lock completion_lock(m_ftio_completion_mutex); + m_ftio_completion_cv.wait(completion_lock, [this]{ return m_ftio_transfer_completed.load() || m_shutting_down.load(); }); } - if(finished) { - LOGGER_INFO("Transfer finished for {} files.", m_pending_transfer.m_expanded_sources.size()); - if(!m_pending_transfer.m_expanded_sources.empty()) { - auto fs = FSPlugin::make_fs(static_cast(m_pending_transfer.m_expanded_sources[0].get_type())); - for(const auto& file : m_pending_transfer.m_expanded_sources) { - LOGGER_INFO("Deleting {}", file.path()); - fs->unlink(file.path()); - } + if(m_shutting_down) break; + + LOGGER_INFO("Transfer finished for {} files.", m_pending_transfer.m_expanded_sources.size()); + if(!m_pending_transfer.m_expanded_sources.empty()) { + auto fs = FSPlugin::make_fs(static_cast(m_pending_transfer.m_expanded_sources[0].get_type())); + for(const auto& file : m_pending_transfer.m_expanded_sources) { + LOGGER_INFO("Deleting {}", file.path()); + fs->unlink(file.path()); } } } diff --git a/src/master.hpp b/src/master.hpp index 32c846b..d72dd8f 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -140,6 +140,11 @@ private: std::atomic m_ftio_tid = {0}; std::atomic m_ftio = {false}; + // FTIO completion handling + mutable abt::shared_mutex m_ftio_completion_mutex; + std::condition_variable_any m_ftio_completion_cv; + std::atomic m_ftio_transfer_completed = {false}; + ssize_t m_block_size = 0; pending_transfer m_pending_transfer; diff --git a/src/net/server.cpp b/src/net/server.cpp index bdd526b..dab6981 100644 --- a/src/net/server.cpp +++ b/src/net/server.cpp @@ -392,4 +392,4 @@ server::shutdown() { m_network_engine.finalize(); } -} // namespace network +} // namespace network \ No newline at end of file diff --git a/src/request_manager.cpp b/src/request_manager.cpp index 93760fc..5292274 100644 --- 
a/src/request_manager.cpp +++ b/src/request_manager.cpp @@ -95,6 +95,48 @@ request_manager::update(std::uint64_t tid, std::uint32_t seqno, std::size_t wid, return error_code::no_such_transfer; } +request_status request_manager::_get_file_status(const file_status& fs) const { + if (fs.empty()) { + return request_status{"", transfer_state::completed, 0.0f, error_code::success}; + } + + bool has_completed = false; + bool has_running = false; + float total_bw = 0.0f; + int active_parts = 0; + + for (const auto& ps : fs) { + if (ps.state() == transfer_state::failed) { + return request_status{ps}; // Immediately return on failure + } + if (ps.state() == transfer_state::running) { + has_running = true; + } + if (ps.state() == transfer_state::completed) { + has_completed = true; + } + if (ps.bw() > 0.0f) { + total_bw += ps.bw(); + active_parts++; + } + } + + float avg_bw = (active_parts > 0) ? (total_bw / active_parts) : 0.0f; + + if (has_running) { + return request_status{fs.front().name(), transfer_state::running, avg_bw, std::nullopt}; + } + + // For small files, one completed part from its designated worker means the file is done. + if (has_completed) { + return request_status{fs.front().name(), transfer_state::completed, 0.0f, error_code::success}; + } + + // If nothing has failed, nothing is running, and nothing has completed, it must be pending. 
+ return request_status{fs.front().name(), transfer_state::pending, 0.0f, std::nullopt}; +} + + tl::expected request_manager::lookup(std::uint64_t tid) { @@ -111,54 +153,32 @@ request_manager::lookup(std::uint64_t tid) { return request_status{"", transfer_state::completed, 0.0f, error_code::success}; } + bool all_files_completed = true; bool any_running = false; - bool all_completed = true; float total_bw = 0.0f; - int active_parts = 0; + int active_transfers = 0; for (const auto& file_status_vec : all_file_statuses) { - if (file_status_vec.empty()) { - continue; // Should not happen with proper creation, but good to be safe. + auto file_status = _get_file_status(file_status_vec); + if (file_status.state() == transfer_state::failed) { + return file_status; // Propagate failure up immediately } - - // The logic for a "file" (a collection of parts handled by workers) is tricky. - // A file is only considered 'completed' if ALL its parts are completed. - // However, in the small-file case, some parts will be pending forever. - // Let's refine: A file is 'completed' if at least one part is 'completed' and no parts are 'running' or 'failed'. - - bool this_file_has_completed_part = false; - bool this_file_is_active = false; // running or pending active work - - for (const auto& part_status : file_status_vec) { - if (part_status.state() == transfer_state::failed) { - // If any part of any file fails, the whole transfer fails immediately. 
- return request_status{part_status.name(), transfer_state::failed, part_status.bw(), part_status.error()}; - } - if (part_status.state() == transfer_state::running) { - any_running = true; - this_file_is_active = true; - } - if (part_status.state() == transfer_state::completed) { - this_file_has_completed_part = true; - } - if (part_status.bw() > 0) { // Consider only parts that are reporting bandwidth - total_bw += part_status.bw(); - active_parts++; - } + if (file_status.state() != transfer_state::completed) { + all_files_completed = false; } - - // A file is not yet complete if it's active (running) OR if no parts have completed yet. - if (this_file_is_active || !this_file_has_completed_part) { - all_completed = false; + if (file_status.state() == transfer_state::running) { + any_running = true; + total_bw += file_status.bw(); + active_transfers++; } } - if (all_completed) { + if (all_files_completed) { return request_status{"", transfer_state::completed, 0.0f, error_code::success}; } if (any_running) { - float avg_bw = (active_parts > 0) ? (total_bw / active_parts) : 0.0f; + float avg_bw = (active_transfers > 0) ? 
(total_bw / active_transfers) : 0.0f; return request_status{"", transfer_state::running, avg_bw, std::nullopt}; } @@ -171,30 +191,20 @@ request_manager::lookup_all(std::uint64_t tid) { abt::shared_lock lock(m_mutex); + auto it = m_requests.find(tid); + if(it == m_requests.end()) { + LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); + return tl::make_unexpected(error_code::no_such_transfer); + } + std::vector result; - if(const auto it = m_requests.find(tid); it != m_requests.end()) { + const auto& all_file_statuses = it->second; + result.reserve(all_file_statuses.size()); - const auto& file_statuses = it->second; - // we calculate always the mean of the BW - for(const auto& fs : file_statuses) { - float bw = 0; - request_status rs(*fs.begin()); - for(const auto& ps : fs) { - bw += ps.bw(); - if(ps.state() == transfer_state::completed) { - continue; - } - // not finished - rs = request_status{ps}; - } - rs.bw(bw / (double) fs.size()); - result.push_back(rs); - } - return result; + for(const auto& fs : all_file_statuses) { + result.push_back(_get_file_status(fs)); } - - LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); - return tl::make_unexpected(error_code::no_such_transfer); + return result; } error_code diff --git a/src/request_manager.hpp b/src/request_manager.hpp index c813d45..dfe7cd4 100644 --- a/src/request_manager.hpp +++ b/src/request_manager.hpp @@ -76,6 +76,7 @@ public: remove(std::uint64_t tid); private: + request_status _get_file_status(const file_status& fs) const; std::atomic current_tid = 0; mutable abt::shared_mutex m_mutex; std::unordered_map> m_requests; @@ -83,4 +84,4 @@ private: } // namespace cargo -#endif // CARGO_REQUEST_MANAGER_HPP +#endif // CARGO_REQUEST_MANAGER_HPP \ No newline at end of file diff --git a/src/worker/base_operation.cpp b/src/worker/base_operation.cpp index 24c1645..5fa126d 100644 --- a/src/worker/base_operation.cpp +++ b/src/worker/base_operation.cpp @@ -24,6 +24,10 @@ #include "base_operation.hpp" 
#include +#include +#include + +using namespace std::chrono_literals; namespace cargo { @@ -49,4 +53,31 @@ void base_operation::_calculate_file_ranges() { } } +void base_operation::_perform_strided_read(posix_file::file& input_file, memory_buffer& buffer) { + std::size_t buffer_offset = 0; + for (const auto& file_range : m_file_ranges) { + if (buffer_offset + file_range.size() > buffer.size()) { + LOGGER_ERROR("Buffer too small for strided read. Required: {}, Available: {}", buffer_offset + file_range.size(), buffer.size()); + throw std::runtime_error("Buffer for strided read is too small"); + } + + buffer_region region(buffer.data() + buffer_offset, file_range.size()); + + auto start = std::chrono::steady_clock::now(); + const std::size_t n = input_file.pread(region, file_range.offset(), file_range.size()); + + auto sleep_duration = sleep_value(); + if (sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); + + auto end = std::chrono::steady_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); + if (elapsed_seconds > 0) { + bw((n / (1024.0 * 1024.0)) / elapsed_seconds); + LOGGER_DEBUG("BW (read) Update: {} / {} = {} MB/s [ Sleep {}ms ]", + n / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); + } + buffer_offset += n; + } +} + } // namespace cargo \ No newline at end of file diff --git a/src/worker/base_operation.hpp b/src/worker/base_operation.hpp index dbebdfc..fd8b5d2 100644 --- a/src/worker/base_operation.hpp +++ b/src/worker/base_operation.hpp @@ -27,9 +27,11 @@ #include "ops.hpp" #include "memory.hpp" +#include "logger/logger.hpp" #include #include + namespace mpi = boost::mpi; namespace cargo { @@ -45,6 +47,8 @@ public: protected: void _calculate_file_ranges(); + void _perform_strided_read(posix_file::file& input_file, memory_buffer& buffer); + mpi::communicator m_workers; std::filesystem::path m_input_path{}; diff --git a/src/worker/mpio_write.cpp b/src/worker/mpio_write.cpp index 19b0a37..66c427f 
100644 --- a/src/worker/mpio_write.cpp +++ b/src/worker/mpio_write.cpp @@ -40,43 +40,18 @@ mpio_write::mpio_write(mpi::communicator workers, std::filesystem::path input_pa block_size, fs_i_type, fs_o_type, size, single) {} void mpio_write::_read_input_file_sequentially() { - using posix_file::views::all_of; - using posix_file::views::as_blocks; - using posix_file::views::strided; - auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); - std::vector buffer_regions; - buffer_regions.reserve(m_buffer.size() / m_block_size); - for(std::size_t i = 0; i < m_buffer.size() / m_block_size; ++i) { - buffer_regions.emplace_back(m_buffer.data() + i * m_block_size, m_block_size); - } - - m_bytes_per_rank = 0; - int index = 0; - for(const auto& file_range : - all_of(input_file) | as_blocks(m_block_size) | strided(m_workers_size, m_workers_rank)) { - - assert((unsigned)index < buffer_regions.size()); - auto& buffer_region = buffer_regions[index]; - assert(buffer_region.size() >= file_range.size()); - - auto start = std::chrono::steady_clock::now(); - const std::size_t n = input_file.pread(buffer_region, file_range.offset(), file_range.size()); - m_bytes_per_rank += n; - - auto sleep_duration = sleep_value(); - if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); - - auto end = std::chrono::steady_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(end - start).count(); - if (elapsed_seconds > 0) { - bw((m_block_size / (1024.0 * 1024.0)) / elapsed_seconds); - LOGGER_DEBUG("BW (read) Update: {} / {} = {} MB/s [ Sleep {}ms ]", - m_block_size / 1024.0, elapsed_seconds, bw(), sleep_duration.count()); - } - index++; + // Calculate required buffer size from the ranges this worker is responsible for + _calculate_file_ranges(); + std::size_t required_buffer_size = 0; + for(const auto& range : m_file_ranges) { + required_buffer_size += range.size(); } + m_buffer.resize(required_buffer_size); + m_bytes_per_rank = required_buffer_size; 
+ + _perform_strided_read(input_file, m_buffer); } cargo::error_code @@ -94,7 +69,7 @@ mpio_write::setup() { ++blocks_per_rank; } - m_buffer.resize(blocks_per_rank * m_block_size); + // Defer buffer allocation until we know the exact size needed return (m_status = error_code::success); } diff --git a/src/worker/worker.cpp b/src/worker/worker.cpp index ad541f3..f06b599 100644 --- a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -27,6 +27,7 @@ #include "../logger/logger.hpp" #include #include +#include #include "worker.hpp" #include "fmt_formatters.hpp" @@ -104,7 +105,7 @@ worker::_progress_operations() { void worker::_process_transfer_message(const transfer_message& m, int source, int tag, - const mpi::communicator& workers) + mpi::communicator& workers) { for(std::size_t i = 0; i < m.input_path().size(); i++) { std::string input_path = m.input_path()[i]; @@ -112,18 +113,18 @@ worker::_process_transfer_message(const transfer_message& m, int source, int tag std::size_t size = m.sizes()[i]; bool is_small_file = (size <= m_block_size * 1024); + + auto effective_tag = static_cast(tag); + // For small files, always use sequential mixed I/O for efficiency + if (is_small_file && (effective_tag == tag::pread || effective_tag == tag::pwrite)) { + effective_tag = tag::seq_mixed; + } + bool am_i_responsible = !is_small_file || ((i % workers.size()) == static_cast(workers.rank())); if (am_i_responsible) { - auto op_workers = workers; - if(is_small_file && workers.size() > 1) { - std::vector self_rank = { workers.rank() }; - auto group = workers.group().include(self_rank.begin(), self_rank.end()); - op_workers = ::make_communicator(workers, group, 0); - } - - auto op = operation::make_operation(static_cast(tag), op_workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); - op->set_comm(source, m.tid(), i, static_cast(tag)); + auto op = operation::make_operation(effective_tag, workers, input_path, output_path, m_block_size, m.i_type(), 
m.o_type(), size, is_small_file); + op->set_comm(source, m.tid(), i, effective_tag); _update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); @@ -144,14 +145,11 @@ worker::_process_transfer_message(const transfer_message& m, int source, int tag } bool -worker::_handle_incoming_message(const mpi::communicator& workers, const mpi::communicator& world) { - auto maybe_msg = m_ops.empty() ? world.probe() : world.iprobe(); +worker::_handle_incoming_message(mpi::communicator& workers, const mpi::communicator& world) { + auto maybe_msg = world.iprobe(); if(!maybe_msg) { - if (m_ops.empty()) { - std::this_thread::sleep_for(10ms); - } - return false; // Not a shutdown request + return false; // No message, not a shutdown. } auto msg = *maybe_msg; @@ -200,9 +198,9 @@ worker::_handle_incoming_message(const mpi::communicator& workers, const mpi::co int worker::run() { - const mpi::communicator world; + mpi::communicator world; const auto ranks_to_exclude = std::array{0}; - const auto workers = + auto workers = ::make_communicator(world, world.group().exclude(ranks_to_exclude.begin(), ranks_to_exclude.end()), @@ -224,8 +222,15 @@ worker::run() { bool done = false; while(!done) { - _progress_operations(); + // ALWAYS use non-blocking message handling. done = _handle_incoming_message(workers, world); + if (done) break; + + // ALWAYS progress operations if there are any. + _progress_operations(); + + // ALWAYS sleep to yield the CPU. 
+ std::this_thread::sleep_for(1ms); } LOGGER_INFO("Entering exit barrier..."); diff --git a/src/worker/worker.hpp b/src/worker/worker.hpp index cc2263f..cf78b38 100644 --- a/src/worker/worker.hpp +++ b/src/worker/worker.hpp @@ -48,8 +48,8 @@ public: private: void _progress_operations(); - bool _handle_incoming_message(const boost::mpi::communicator& workers, const boost::mpi::communicator& world); - void _process_transfer_message(const transfer_message& msg, int source, int tag, const boost::mpi::communicator& workers); + bool _handle_incoming_message(boost::mpi::communicator& workers, const boost::mpi::communicator& world); + void _process_transfer_message(const transfer_message& msg, int source, int tag, boost::mpi::communicator& workers); void _update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, cargo::transfer_state st, float bw, std::optional ec = std::nullopt); -- GitLab From 938049e8b0cefb35b2443f572c0066b4e1217b05 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 10:42:09 +0200 Subject: [PATCH 15/21] thread naming --- src/CMakeLists.txt | 3 +- src/master.cpp | 4 ++- src/net/server.cpp | 6 +++- src/net/signal_listener.hpp | 8 +++-- src/thread_utils.hpp | 60 +++++++++++++++++++++++++++++++++++++ src/worker/mpio_write.cpp | 33 ++++++++------------ 6 files changed, 87 insertions(+), 27 deletions(-) create mode 100644 src/thread_utils.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d3492ac..a278c41 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,6 +57,7 @@ target_sources( proto/rpc/response.hpp proto/mpi/message.hpp boost_serialization_std_optional.hpp + thread_utils.hpp ) target_include_directories( @@ -79,4 +80,4 @@ target_link_libraries( set_target_properties(cargo_server PROPERTIES OUTPUT_NAME "cargo") -install(TARGETS cargo_server DESTINATION ${CMAKE_INSTALL_BINDIR}) +install(TARGETS cargo_server DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git 
a/src/master.cpp b/src/master.cpp index ed76c1b..90a2924 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -37,6 +37,7 @@ #include "proto/rpc/response.hpp" #include "proto/mpi/message.hpp" #include "parallel_request.hpp" +#include "thread_utils.hpp" #include #include #include @@ -168,7 +169,7 @@ master_server::~master_server() {} void master_server::mpi_listener_ult() { - + set_current_thread_name("mpi_listener"); mpi::communicator world; while(!m_shutting_down) { @@ -229,6 +230,7 @@ master_server::mpi_listener_ult() { void master_server::ftio_scheduling_ult() { + set_current_thread_name("ftio_scheduler"); while (!m_shutting_down) { abt::unique_lock lock(m_ftio_mutex); diff --git a/src/net/server.cpp b/src/net/server.cpp index dab6981..510c2f8 100644 --- a/src/net/server.cpp +++ b/src/net/server.cpp @@ -37,6 +37,7 @@ #include "signal_listener.hpp" #include "server.hpp" #include "endpoint.hpp" +#include "../thread_utils.hpp" using namespace std::literals; @@ -237,7 +238,7 @@ server::install_signal_handlers() { // This call does not block. 
Instead, it starts an internal std::thread // responsible for processing incoming signals - m_signal_listener.run(); + m_signal_listener.run("signal_handler"); } void @@ -317,6 +318,9 @@ server::run() { shutdown(); return EXIT_SUCCESS; } + + // Set the name for the main thread + cargo::set_current_thread_name("main_server"); // write pidfile if needed if(m_pidfile.has_value()) { diff --git a/src/net/signal_listener.hpp b/src/net/signal_listener.hpp index 0aa2ac1..8facbe2 100644 --- a/src/net/signal_listener.hpp +++ b/src/net/signal_listener.hpp @@ -28,6 +28,7 @@ #include #include +#include "../thread_utils.hpp" namespace { template @@ -80,8 +81,9 @@ struct signal_listener { } void - run() { - m_thread = std::thread([&]() { + run(const std::string& thread_name = "sig_handler") { + m_thread = std::thread([&, thread_name]() { + cargo::set_current_thread_name(thread_name); do_accept(); m_ios.run(); }); @@ -118,4 +120,4 @@ private: SignalHandlerType m_user_handler; }; -#endif // SIGNAL_LISTENER_HPP +#endif // SIGNAL_LISTENER_HPP \ No newline at end of file diff --git a/src/thread_utils.hpp b/src/thread_utils.hpp new file mode 100644 index 0000000..7a3751c --- /dev/null +++ b/src/thread_utils.hpp @@ -0,0 +1,60 @@ +#ifndef CARGO_THREAD_UTILS_HPP +#define CARGO_THREAD_UTILS_HPP + +#include +#include + +#if defined(__linux__) +#include +#elif defined(__APPLE__) +#include +#endif + +namespace cargo { + +/** + * @brief Sets the name of the current thread. + * + * This function is a wrapper around platform-specific APIs for setting + * thread names (e.g., pthread_setname_np on Linux). The name is + * truncated if it exceeds the system limit (typically 15 characters on Linux). + * + * @param name The desired name for the thread. + */ +inline void +set_current_thread_name(const std::string& name) { +#if defined(__linux__) + // pthread_setname_np truncates the name if it's longer than 15 chars + null terminator. 
+ pthread_setname_np(pthread_self(), name.substr(0, 15).c_str()); +#elif defined(__APPLE__) + // Apple's pthread_setname_np does not truncate. + pthread_setname_np(name.c_str()); +#else + // Other platforms might not support this. + (void)name; // Avoid unused parameter warning +#endif +} + +/** + * @brief Sets the name of a std::thread. + * + * @param thread The std::thread object whose name is to be set. + * @param name The desired name for the thread. + */ +inline void +set_thread_name(std::thread& thread, const std::string& name) { +#if defined(__linux__) || defined(__APPLE__) + // It's generally better to set the name from within the thread itself, + // but this works for std::thread by accessing its native handle. + auto handle = thread.native_handle(); + pthread_setname_np(handle, name.substr(0, 15).c_str()); +#else + (void)thread; + (void)name; +#endif +} + + +} // namespace cargo + +#endif // CARGO_THREAD_UTILS_HPP \ No newline at end of file diff --git a/src/worker/mpio_write.cpp b/src/worker/mpio_write.cpp index 66c427f..bfa2fa4 100644 --- a/src/worker/mpio_write.cpp +++ b/src/worker/mpio_write.cpp @@ -39,21 +39,6 @@ mpio_write::mpio_write(mpi::communicator workers, std::filesystem::path input_pa : base_operation(std::move(workers), std::move(input_path), std::move(output_path), block_size, fs_i_type, fs_o_type, size, single) {} -void mpio_write::_read_input_file_sequentially() { - auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); - - // Calculate required buffer size from the ranges this worker is responsible for - _calculate_file_ranges(); - std::size_t required_buffer_size = 0; - for(const auto& range : m_file_ranges) { - required_buffer_size += range.size(); - } - m_buffer.resize(required_buffer_size); - m_bytes_per_rank = required_buffer_size; - - _perform_strided_read(input_file, m_buffer); -} - cargo::error_code mpio_write::setup() { m_status = error_code::transfer_in_progress; @@ -64,12 +49,14 @@ mpio_write::setup() { 
++m_total_blocks; } - std::size_t blocks_per_rank = m_total_blocks / m_workers_size; - if(int64_t n = m_total_blocks % m_workers_size; n != 0 && m_workers_rank < n) { - ++blocks_per_rank; + // Calculate file ranges this worker is responsible for + _calculate_file_ranges(); + std::size_t required_buffer_size = 0; + for(const auto& range : m_file_ranges) { + required_buffer_size += range.size(); } - - // Defer buffer allocation until we know the exact size needed + m_buffer.resize(required_buffer_size); + m_bytes_per_rank = required_buffer_size; return (m_status = error_code::success); } @@ -78,8 +65,12 @@ operation::progress_status mpio_write::progress() { try { if (!m_is_read_complete) { - _read_input_file_sequentially(); + auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); + _perform_strided_read(input_file, m_buffer); m_is_read_complete = true; + // Return InProgress to signal that the next stage (write) should happen + // in the next call. This keeps the logic clean. 
+ return progress_status::InProgress; } const auto output_file = mpioxx::file::open( -- GitLab From 095c08e0dec0c38d4a699bc9b8d2de9224e23cdf Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 10:46:44 +0200 Subject: [PATCH 16/21] faster ccp status --- cli/copy.cpp | 31 +++++++++++++++++++------------ lib/libcargo.cpp | 18 ++++++++++++------ 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/cli/copy.cpp b/cli/copy.cpp index 3c677ca..ecc4731 100644 --- a/cli/copy.cpp +++ b/cli/copy.cpp @@ -58,6 +58,7 @@ parse_command_line(int argc, char* argv[]) { cfg.progname = std::filesystem::path{argv[0]}.filename().string(); CLI::App app{"Cargo parallel copy tool", cfg.progname}; + app.formatter(std::make_shared()); app.add_option("-s,--server", cfg.server_address, "Address of the Cargo server (can also be\n" @@ -75,22 +76,28 @@ parse_command_line(int argc, char* argv[]) { ->option_text("DST...") ->required(); - app.add_option("--if", cfg.input_flags, - "Flags for input datasets. Accepted values\n" - " - posix: read data using POSIX (default)\n" - " - parallel: read data using MPI-IO\n" - " - dataclay: read data using DATACLAY\n" - " - gekkofs: read data using gekkofs user library\n") + std::string if_help = "Flags for input datasets. Accepted values:\n" + " - posix: Read data using POSIX I/O (default)\n" + " - parallel: Read data using MPI-IO\n" + " - gekkofs: Read data using the GekkoFS user library\n" + " - hercules: Read data using the Hercules user library\n" + " - expand: Read data using the ExPaND user library\n" + " - dataclay: Read data using the dataClay user library\n" + " - none: No-op, useful for benchmarking"; + app.add_option("--if", cfg.input_flags, if_help) ->option_text("FLAGS") ->transform(CLI::CheckedTransformer(dataset_flags_map, CLI::ignore_case)); - app.add_option("--of", cfg.output_flags, - "Flags for output datasets. 
Accepted values\n" - " - posix: write data using POSIX (default)\n" - " - parallel: write data using MPI-IO\n" - " - dataclay: write data using DATACLAY\n" - " - gekkofs: write data using gekkofs user library\n") + std::string of_help = "Flags for output datasets. Accepted values:\n" + " - posix: Write data using POSIX I/O (default)\n" + " - parallel: Write data using MPI-IO\n" + " - gekkofs: Write data using the GekkoFS user library\n" + " - hercules: Write data using the Hercules user library\n" + " - expand: Write data using the ExPaND user library\n" + " - dataclay: Write data using the dataClay user library\n" + " - none: No-op, useful for benchmarking"; + app.add_option("--of", cfg.output_flags, of_help) ->option_text("FLAGS") ->transform(CLI::CheckedTransformer(dataset_flags_map, CLI::ignore_case)); diff --git a/lib/libcargo.cpp b/lib/libcargo.cpp index 67555e8..ae41553 100644 --- a/lib/libcargo.cpp +++ b/lib/libcargo.cpp @@ -303,20 +303,26 @@ transfer_datasets(const server& srv, const std::vector& sources, transfer_status transfer::wait() const { - // wait for the transfer to complete auto s = status(); - while(!s.done() && !s.failed()) { - s = wait_for(1000ms); + std::this_thread::sleep_for(10ms); + s = status(); } - return s; } transfer_status transfer::wait_for(const std::chrono::nanoseconds& timeout) const { - std::this_thread::sleep_for(timeout); - return status(); + const auto start = std::chrono::steady_clock::now(); + auto s = status(); + while(!s.done() && !s.failed()) { + if(std::chrono::steady_clock::now() - start > timeout) { + break; + } + std::this_thread::sleep_for(10ms); + s = status(); + } + return s; } transfer -- GitLab From 6933c3ae9faaf4dfb805bf8c01269c0b030cb4b6 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 11:05:10 +0200 Subject: [PATCH 17/21] add include directory --- CMakeLists.txt | 9 ++++++++- cli/copy.cpp | 4 ++-- cli/ftio.cpp | 3 ++- cli/ping.cpp | 3 ++- cli/shaping.cpp | 3 ++- cli/shutdown.cpp | 3 ++- {lib => 
include/cargo}/cargo.hpp | 0 {lib => include}/cargo/error.hpp | 0 {lib => include/cargo}/fmt_formatters.hpp | 0 lib/CMakeLists.txt | 16 +++++----------- lib/libcargo.cpp | 4 ++-- src/master.cpp | 4 ++-- src/master.hpp | 2 +- src/parallel_request.cpp | 2 +- src/parallel_request.hpp | 3 ++- src/proto/mpi/message.hpp | 3 ++- src/request_manager.cpp | 3 ++- src/worker/ops.hpp | 3 ++- src/worker/worker.cpp | 2 +- 19 files changed, 38 insertions(+), 29 deletions(-) rename {lib => include/cargo}/cargo.hpp (100%) rename {lib => include}/cargo/error.hpp (100%) rename {lib => include/cargo}/fmt_formatters.hpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index b97e812..79884b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -408,4 +408,11 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}-${PROJECT_VERSION} - ) \ No newline at end of file + ) + + +# Install public headers +install( + DIRECTORY include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) \ No newline at end of file diff --git a/cli/copy.cpp b/cli/copy.cpp index ecc4731..4e9a357 100644 --- a/cli/copy.cpp +++ b/cli/copy.cpp @@ -23,8 +23,8 @@ *****************************************************************************/ #include -#include -#include +#include +#include #include #include #include diff --git a/cli/ftio.cpp b/cli/ftio.cpp index 6f95e5c..d3e37f8 100644 --- a/cli/ftio.cpp +++ b/cli/ftio.cpp @@ -23,7 +23,8 @@ *****************************************************************************/ #include -#include +#include +#include #include #include #include diff --git a/cli/ping.cpp b/cli/ping.cpp index 53ac89a..aff21ae 100644 --- a/cli/ping.cpp +++ b/cli/ping.cpp @@ -23,7 +23,8 @@ *****************************************************************************/ #include -#include +#include +#include #include #include #include diff --git a/cli/shaping.cpp b/cli/shaping.cpp index 98c69ed..a3a5922 100644 --- 
a/cli/shaping.cpp +++ b/cli/shaping.cpp @@ -23,7 +23,8 @@ *****************************************************************************/ #include -#include +#include +#include #include #include #include diff --git a/cli/shutdown.cpp b/cli/shutdown.cpp index 2adbf64..9030a8b 100644 --- a/cli/shutdown.cpp +++ b/cli/shutdown.cpp @@ -23,7 +23,8 @@ *****************************************************************************/ #include -#include +#include +#include #include #include #include diff --git a/lib/cargo.hpp b/include/cargo/cargo.hpp similarity index 100% rename from lib/cargo.hpp rename to include/cargo/cargo.hpp diff --git a/lib/cargo/error.hpp b/include/cargo/error.hpp similarity index 100% rename from lib/cargo/error.hpp rename to include/cargo/error.hpp diff --git a/lib/fmt_formatters.hpp b/include/cargo/fmt_formatters.hpp similarity index 100% rename from lib/fmt_formatters.hpp rename to include/cargo/fmt_formatters.hpp diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index ea292ac..18ec04c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -24,19 +24,14 @@ add_library(cargo SHARED) -target_sources(cargo PRIVATE cargo.hpp fmt_formatters.hpp cargo/error.hpp - libcargo.cpp error.cpp) - -list(APPEND public_headers "cargo.hpp;cargo/error.hpp") -list(APPEND public_headers "fmt_formatters.hpp") +target_sources(cargo PRIVATE libcargo.cpp error.cpp) target_include_directories( - cargo PUBLIC $ - $ + cargo PUBLIC + $ + $ ) -set_target_properties(cargo PROPERTIES PUBLIC_HEADER "${public_headers}") - target_link_libraries(cargo PRIVATE logger::logger fmt::fmt @@ -60,7 +55,6 @@ install( LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME} ) install( @@ -68,4 +62,4 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME}-${PROJECT_VERSION} NAMESPACE cargo:: -) +) \ No newline at end of 
file diff --git a/lib/libcargo.cpp b/lib/libcargo.cpp index ae41553..f8fc8b6 100644 --- a/lib/libcargo.cpp +++ b/lib/libcargo.cpp @@ -22,8 +22,8 @@ * SPDX-License-Identifier: GPL-3.0-or-later *****************************************************************************/ -#include "cargo.hpp" -#include "fmt_formatters.hpp" +#include +#include #include "net/serialization.hpp" #include #include diff --git a/src/master.cpp b/src/master.cpp index 90a2924..cf09b2c 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -26,8 +26,8 @@ #include "logger/logger.hpp" #include "net/server.hpp" -#include -#include +#include +#include #include #include #include diff --git a/src/master.hpp b/src/master.hpp index d72dd8f..94f7ec4 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -26,7 +26,7 @@ #define CARGO_MASTER_HPP #include "net/server.hpp" -#include "cargo.hpp" +#include #include "request_manager.hpp" #include "parallel_request.hpp" #include "env.hpp" diff --git a/src/parallel_request.cpp b/src/parallel_request.cpp index bd62cec..565cc1b 100644 --- a/src/parallel_request.cpp +++ b/src/parallel_request.cpp @@ -22,7 +22,7 @@ * SPDX-License-Identifier: GPL-3.0-or-later *****************************************************************************/ -#include "cargo.hpp" +#include #include "parallel_request.hpp" namespace cargo { diff --git a/src/parallel_request.hpp b/src/parallel_request.hpp index fca7592..b37336d 100644 --- a/src/parallel_request.hpp +++ b/src/parallel_request.hpp @@ -29,7 +29,8 @@ #include #include #include -#include "../lib/cargo.hpp" +#include + namespace cargo { class dataset; diff --git a/src/proto/mpi/message.hpp b/src/proto/mpi/message.hpp index 923b1c6..d5f388c 100644 --- a/src/proto/mpi/message.hpp +++ b/src/proto/mpi/message.hpp @@ -32,7 +32,8 @@ #include #include #include -#include "cargo.hpp" +#include +#include #include "boost_serialization_std_optional.hpp" #include #include "posix_file/file.hpp" diff --git a/src/request_manager.cpp 
b/src/request_manager.cpp index 5292274..7fdde70 100644 --- a/src/request_manager.cpp +++ b/src/request_manager.cpp @@ -23,7 +23,8 @@ * SPDX-License-Identifier: GPL-3.0-or-later *****************************************************************************/ -#include "cargo.hpp" +#include +#include #include "cargo/error.hpp" #include "parallel_request.hpp" #include "request_manager.hpp" diff --git a/src/worker/ops.hpp b/src/worker/ops.hpp index b0ca128..7b863ae 100644 --- a/src/worker/ops.hpp +++ b/src/worker/ops.hpp @@ -29,7 +29,8 @@ #include #include #include "proto/mpi/message.hpp" -#include "cargo.hpp" +#include +#include #include "posix_file/file.hpp" namespace cargo { diff --git a/src/worker/worker.cpp b/src/worker/worker.cpp index f06b599..aef2170 100644 --- a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -29,7 +29,7 @@ #include #include #include "worker.hpp" -#include "fmt_formatters.hpp" +#include namespace mpi = boost::mpi; using namespace std::chrono_literals; -- GitLab From 7b8b12e20308dbd578fab87df62628c464b19158 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 11:47:37 +0200 Subject: [PATCH 18/21] Better status output --- src/master.cpp | 62 ++++------ src/parallel_request.cpp | 54 ++++---- src/parallel_request.hpp | 75 ++++++++--- src/proto/mpi/message.hpp | 22 ++-- src/request_manager.cpp | 227 ++++++++++++++++------------------ src/request_manager.hpp | 18 ++- src/worker/base_operation.cpp | 3 + src/worker/mpio_read.cpp | 2 +- src/worker/mpio_write.cpp | 6 +- src/worker/ops.cpp | 6 + src/worker/ops.hpp | 4 +- src/worker/seq_mixed.cpp | 5 +- src/worker/worker.cpp | 20 +-- src/worker/worker.hpp | 2 +- 14 files changed, 273 insertions(+), 233 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index cf09b2c..f1530a3 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -183,7 +183,7 @@ master_server::mpi_listener_ult() { msg.source(), m); m_request_manager.update(m.tid(), m.seqno(), msg.source() - 1, - m.name(), m.state(), 
m.bw(), + m.name(), m.state(), m.bw(), m.bytes_transferred(), m.error_code()); if (m_ftio.load() && m.tid() == m_ftio_tid.load()) { @@ -257,8 +257,8 @@ master_server::ftio_scheduling_ult() { if(!expanded.sources.empty()){ m_pending_transfer.m_expanded_sources = std::move(expanded.sources); m_pending_transfer.m_expanded_targets = std::move(expanded.targets); - - auto ec = m_request_manager.update(m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size(), m_pending_transfer.m_p.nworkers()); + + auto ec = m_request_manager.update(m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size(), expanded.sizes); if(ec != error_code::success) { LOGGER_ERROR("Failed to update request for FTIO transfer {}: {}", m_pending_transfer.m_p.tid(), ec); } else { @@ -440,58 +440,38 @@ void master_server::transfer_datasets(const network::request& req, const std::vector& sources, const std::vector& targets) { - m_network_engine.get_handler_pool().make_thread( - [this, req, s = sources, t = targets]() mutable { - do_transfer_datasets(req, std::move(s), std::move(t)); - } - ); -} - -void -master_server::do_transfer_datasets(const network::request req, - std::vector sources, - std::vector targets) { using network::get_address; using network::rpc_info; - using proto::generic_response; using proto::response_with_id; - mpi::communicator world; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); - m_request_manager.create(0, world.size() - 1) // Initially create with 0 files + mpi::communicator world; + m_request_manager.create(world.size() - 1) .or_else([&](auto&& ec) { LOGGER_ERROR("Failed to create request: {}", ec); - req.respond(generic_response{rpc.id(), ec}); + req.respond(proto::generic_response{rpc.id(), ec}); }) .map([&](auto&& r) { - if(m_ftio) { - abt::unique_lock lock(m_ftio_mutex); - m_pending_transfer.m_p = r; - m_pending_transfer.m_sources = sources; 
- m_pending_transfer.m_targets = targets; - m_pending_transfer.m_work = true; - m_ftio_tid.store(r.tid()); - LOGGER_INFO("Stored stage-out information for future transfer {}", r.tid()); - } else { - auto expanded = expand_transfer_requests(sources, targets); - if (!expanded.sources.empty()) { - auto ec = m_request_manager.update(r.tid(), expanded.sources.size(), r.nworkers()); - if (ec == error_code::success) { - _dispatch_transfer_to_workers(r, expanded); + req.respond(response_with_id{rpc.id(), error_code::success, r.tid()}); + + // Asynchronously expand and dispatch + m_network_engine.get_handler_pool().make_thread( + [this, r, s = sources, t = targets]() { + auto expanded = expand_transfer_requests(s, t); + if (!expanded.sources.empty()) { + auto ec = m_request_manager.update(r.tid(), expanded.sources.size(), expanded.sizes); + if (ec == error_code::success) { + _dispatch_transfer_to_workers(r, expanded); + } else { + LOGGER_ERROR("Failed to update request {}: {}", r.tid(), ec); + } } else { - LOGGER_ERROR("Failed to update request {}: {}", r.tid(), ec); - req.respond(generic_response{rpc.id(), ec}); - return; + LOGGER_INFO("No files to transfer for request {}", r.tid()); } - } else { - LOGGER_INFO("No files to transfer for request {}", r.tid()); } - } - - req.respond(response_with_id{rpc.id(), error_code::success, r.tid()}); + ); }); } diff --git a/src/parallel_request.cpp b/src/parallel_request.cpp index 565cc1b..211b5a6 100644 --- a/src/parallel_request.cpp +++ b/src/parallel_request.cpp @@ -24,6 +24,7 @@ #include #include "parallel_request.hpp" +#include namespace cargo { @@ -46,38 +47,21 @@ parallel_request::nworkers() const { return m_nworkers; } -request_status::request_status(part_status s) - : m_name(s.name()), m_state(s.state()), m_bw(s.bw()), - m_error_code(s.error()) {} - request_status::request_status(std::string name, transfer_state s, float bw, + std::size_t bytes_transferred, std::size_t total_bytes, + std::chrono::nanoseconds elapsed_time, 
std::optional ec) - : m_name(name), m_state(s), m_bw(bw), m_error_code(ec) {} - -transfer_state -request_status::state() const { - return m_state; -} - -std::string -request_status::name() const { - return m_name; -} - -std::optional -request_status::error() const { - return m_error_code; -} - -float -request_status::bw() const { - return m_bw; -} - -void -request_status::bw(float bw) { - m_bw = bw; -} + : m_name(std::move(name)), m_state(s), m_bw(bw), m_bytes_transferred(bytes_transferred), + m_total_bytes(total_bytes), m_elapsed_time(elapsed_time), m_error_code(ec) {} + +transfer_state request_status::state() const { return m_state; } +std::string request_status::name() const { return m_name; } +std::optional request_status::error() const { return m_error_code; } +float request_status::bw() const { return m_bw; } +void request_status::bw(float bw) { m_bw = bw; } +std::size_t request_status::bytes_transferred() const { return m_bytes_transferred; } +std::size_t request_status::total_bytes() const { return m_total_bytes; } +std::chrono::nanoseconds request_status::elapsed_time() const { return m_elapsed_time; } std::string part_status::name() const { @@ -94,6 +78,11 @@ part_status::bw() const { return m_bw; } +std::size_t +part_status::bytes_transferred() const { + return m_bytes_transferred; +} + std::optional part_status::error() const { return m_error_code; @@ -101,10 +90,11 @@ part_status::error() const { void part_status::update(std::string name, transfer_state s, float bw, - std::optional ec) noexcept { - m_name = name; + std::size_t bytes, std::optional ec) noexcept { + m_name = std::move(name); m_state = s; m_bw = bw; + m_bytes_transferred = bytes; m_error_code = ec; } diff --git a/src/parallel_request.hpp b/src/parallel_request.hpp index b37336d..c224fe1 100644 --- a/src/parallel_request.hpp +++ b/src/parallel_request.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -77,15 +78,19 @@ public: [[nodiscard]] float bw() const; + + 
[[nodiscard]] std::size_t + bytes_transferred() const; void update(std::string name, transfer_state s, float bw, - std::optional ec) noexcept; + std::size_t bytes, std::optional ec) noexcept; private: std::string m_name; transfer_state m_state{transfer_state::pending}; - float m_bw; + float m_bw = 0.0f; + std::size_t m_bytes_transferred = 0; std::optional m_error_code{}; }; @@ -93,8 +98,10 @@ class request_status { public: request_status() = default; explicit request_status(std::string name, transfer_state s, float bw, + std::size_t bytes_transferred, + std::size_t total_bytes, + std::chrono::nanoseconds elapsed_time, std::optional ec = {}); - explicit request_status(part_status s); [[nodiscard]] std::string name() const; @@ -107,14 +114,26 @@ public: [[nodiscard]] float bw() const; - + void bw(float bw); + [[nodiscard]] std::size_t + bytes_transferred() const; + + [[nodiscard]] std::size_t + total_bytes() const; + + [[nodiscard]] std::chrono::nanoseconds + elapsed_time() const; + private: std::string m_name; transfer_state m_state{transfer_state::pending}; - float m_bw; + float m_bw = 0.0f; + std::size_t m_bytes_transferred = 0; + std::size_t m_total_bytes = 0; + std::chrono::nanoseconds m_elapsed_time{0}; std::optional m_error_code{}; }; @@ -122,13 +141,25 @@ private: template <> struct fmt::formatter : formatter { + + // Helper to format bytes in a human-readable way + static std::string format_bytes(std::size_t bytes) { + if (bytes < 1024) return fmt::format("{} B", bytes); + double kb = bytes / 1024.0; + if (kb < 1024.0) return fmt::format("{:.2f} KB", kb); + double mb = kb / 1024.0; + if (mb < 1024.0) return fmt::format("{:.2f} MB", mb); + double gb = mb / 1024.0; + return fmt::format("{:.2f} GB", gb); + } + // parse is inherited from formatter. 
template auto format(const cargo::request_status& s, FormatContext& ctx) const { - const auto state_name = [](auto&& s) { - switch(s.state()) { + const auto state_name = [](auto&& st) { + switch(st) { case cargo::transfer_state::pending: return "pending"; case cargo::transfer_state::running: @@ -142,17 +173,31 @@ struct fmt::formatter : formatter { } }; - std::string str = ""; + std::string progress_str; + if (s.total_bytes() > 0) { + progress_str = fmt::format(", progress: {}/{}", + format_bytes(s.bytes_transferred()), + format_bytes(s.total_bytes())); + } + + std::string bw_str; + if (s.bw() > 0) { + bw_str = fmt::format(", bw: {:.2f} MB/s", s.bw()); + } + + std::string error_str; if(s.error()) { - str = - fmt::format("{{state: {}, bw: {}, error_code: {}}}", - state_name(s), s.bw(), *s.error()); - } else { - str = - fmt::format("{{state: {}, bw: {}}}", state_name(s), s.bw()); + error_str = fmt::format(", error_code: {}", *s.error()); } + + const auto str = fmt::format("{{state: {}{}{}{}}}", + state_name(s.state()), + progress_str, + bw_str, + error_str); + return formatter::format(str, ctx); } }; -#endif // CARGO_PARALLEL_REQUEST_HPP +#endif // CARGO_PARALLEL_REQUEST_HPP \ No newline at end of file diff --git a/src/proto/mpi/message.hpp b/src/proto/mpi/message.hpp index d5f388c..ce2d42a 100644 --- a/src/proto/mpi/message.hpp +++ b/src/proto/mpi/message.hpp @@ -33,7 +33,6 @@ #include #include #include -#include #include "boost_serialization_std_optional.hpp" #include #include "posix_file/file.hpp" @@ -133,10 +132,10 @@ public: status_message() = default; status_message(std::uint64_t tid, std::uint32_t seqno, std::string name, - cargo::transfer_state state, float bw, + cargo::transfer_state state, float bw, std::size_t bytes, std::optional error_code = std::nullopt) : m_tid(tid), m_seqno(seqno), m_name(name), m_state(state), m_bw(bw), - m_error_code(error_code) {} + m_bytes_transferred(bytes), m_error_code(error_code) {} [[nodiscard]] std::uint64_t tid() const { @@ 
-162,6 +161,11 @@ public: bw() const { return m_bw; } + + [[nodiscard]] std::size_t + bytes_transferred() const { + return m_bytes_transferred; + } [[nodiscard]] std::optional @@ -180,6 +184,7 @@ private: ar & m_name; ar & m_state; ar & m_bw; + ar & m_bytes_transferred; ar & m_error_code; } @@ -188,6 +193,7 @@ private: std::string m_name{}; cargo::transfer_state m_state{}; float m_bw{}; + std::size_t m_bytes_transferred{}; std::optional m_error_code{}; }; @@ -278,14 +284,14 @@ struct fmt::formatter : formatter { const auto str = s.error_code() ? fmt::format( - "{{tid: {}, seqno: {}, name: {}, state: {}, bw: {}, " + "{{tid: {}, seqno: {}, name: {}, state: {}, bw: {}, bytes: {}, " "error_code: {}}}", s.tid(), s.seqno(), s.name(), s.state(), - s.bw(), *s.error_code()) + s.bw(), s.bytes_transferred(), *s.error_code()) : fmt::format( - "{{tid: {}, seqno: {}, name: {}, state: {}, bw: {}}}", + "{{tid: {}, seqno: {}, name: {}, state: {}, bw: {}, bytes: {}}}", s.tid(), s.seqno(), s.name(), s.state(), - s.bw()); + s.bw(), s.bytes_transferred()); return formatter::format(str, ctx); } }; @@ -314,4 +320,4 @@ struct fmt::formatter : formatter { } }; -#endif // CARGO_PROTO_MPI_MESSAGE_HPP +#endif // CARGO_PROTO_MPI_MESSAGE_HPP \ No newline at end of file diff --git a/src/request_manager.cpp b/src/request_manager.cpp index 7fdde70..3644a61 100644 --- a/src/request_manager.cpp +++ b/src/request_manager.cpp @@ -8,7 +8,6 @@ * * Cargo is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
* @@ -23,124 +22,71 @@ * SPDX-License-Identifier: GPL-3.0-or-later *****************************************************************************/ -#include -#include -#include "cargo/error.hpp" -#include "parallel_request.hpp" +#include #include "request_manager.hpp" #include +#include #include "logger/logger.hpp" -namespace {} // namespace - namespace cargo { tl::expected -request_manager::create(std::size_t nfiles, std::size_t nworkers) { - +request_manager::create(std::size_t nworkers) { std::uint64_t tid = current_tid++; abt::unique_lock lock(m_mutex); - if(const auto it = m_requests.find(tid); it == m_requests.end()) { - - const auto& [it_req, inserted] = m_requests.emplace( - tid, std::vector{ - nfiles, std::vector{nworkers}}); - - if(!inserted) { - LOGGER_ERROR("{}: Emplace failed", __FUNCTION__); - return tl::make_unexpected(error_code::snafu); - } + auto [it, inserted] = m_requests.emplace(tid, request_metadata()); + if (!inserted) { + // This should theoretically never happen with an incrementing tid + return tl::make_unexpected(error_code::snafu); } - return parallel_request{tid, nfiles, nworkers}; + auto& meta = it->second; + meta.p_req = parallel_request{tid, 0, nworkers}; + meta.start_time = std::chrono::steady_clock::now(); + + return meta.p_req; } -/** - * @brief Update the request for ftio processing (as it is modified by readdir) - * - * @param request - * @param nfiles - * @param nworkers - * @return error_code - */ error_code -request_manager::update(std::uint64_t tid, std::size_t nfiles, - std::size_t nworkers) { +request_manager::update(std::uint64_t tid, std::size_t nfiles, const std::vector& file_sizes) { abt::unique_lock lock(m_mutex); - m_requests[tid] = std::vector{nfiles, - std::vector{nworkers}}; - + auto it = m_requests.find(tid); + if(it == m_requests.end()) { + return error_code::no_such_transfer; + } + auto& meta = it->second; + meta.p_req = parallel_request{tid, nfiles, meta.p_req.nworkers()}; + meta.statuses.resize(nfiles, 
std::vector(meta.p_req.nworkers())); + meta.file_sizes = file_sizes; return error_code::success; - - } - error_code request_manager::update(std::uint64_t tid, std::uint32_t seqno, std::size_t wid, - std::string name, transfer_state s, float bw, + std::string name, transfer_state s, float bw, std::size_t bytes, std::optional ec) { - abt::unique_lock lock(m_mutex); - if(const auto it = m_requests.find(tid); it != m_requests.end()) { - assert(seqno < it->second.size()); - assert(wid < it->second[seqno].size()); - it->second[seqno][wid].update(name, s, bw, ec); - return error_code::success; - } - - LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); - return error_code::no_such_transfer; -} - -request_status request_manager::_get_file_status(const file_status& fs) const { - if (fs.empty()) { - return request_status{"", transfer_state::completed, 0.0f, error_code::success}; - } - - bool has_completed = false; - bool has_running = false; - float total_bw = 0.0f; - int active_parts = 0; - - for (const auto& ps : fs) { - if (ps.state() == transfer_state::failed) { - return request_status{ps}; // Immediately return on failure - } - if (ps.state() == transfer_state::running) { - has_running = true; - } - if (ps.state() == transfer_state::completed) { - has_completed = true; - } - if (ps.bw() > 0.0f) { - total_bw += ps.bw(); - active_parts++; - } + auto it = m_requests.find(tid); + if(it == m_requests.end()) { + LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); + return error_code::no_such_transfer; } - float avg_bw = (active_parts > 0) ? 
(total_bw / active_parts) : 0.0f; - - if (has_running) { - return request_status{fs.front().name(), transfer_state::running, avg_bw, std::nullopt}; + auto& statuses = it->second.statuses; + if (seqno >= statuses.size() || wid >= statuses[seqno].size()) { + LOGGER_ERROR("{}: Invalid sequence number {} or worker ID {}", __FUNCTION__, seqno, wid); + return error_code::snafu; } - // For small files, one completed part from its designated worker means the file is done. - if (has_completed) { - return request_status{fs.front().name(), transfer_state::completed, 0.0f, error_code::success}; - } - - // If nothing has failed, nothing is running, and nothing has completed, it must be pending. - return request_status{fs.front().name(), transfer_state::pending, 0.0f, std::nullopt}; + statuses[seqno][wid].update(name, s, bw, bytes, ec); + return error_code::success; } - tl::expected request_manager::lookup(std::uint64_t tid) { - abt::shared_lock lock(m_mutex); auto it = m_requests.find(tid); @@ -148,62 +94,109 @@ request_manager::lookup(std::uint64_t tid) { LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); return tl::make_unexpected(error_code::no_such_transfer); } - - const auto& all_file_statuses = it->second; + + const auto& meta = it->second; + const auto& all_file_statuses = meta.statuses; + auto elapsed_time = std::chrono::duration_cast(std::chrono::steady_clock::now() - meta.start_time); + if (all_file_statuses.empty()) { - return request_status{"", transfer_state::completed, 0.0f, error_code::success}; + return request_status{"", transfer_state::pending, 0.0f, 0, 0, elapsed_time, std::nullopt}; } - bool all_files_completed = true; - bool any_running = false; + size_t total_bytes_transferred = 0; + size_t total_bytes = std::accumulate(meta.file_sizes.begin(), meta.file_sizes.end(), 0ULL); float total_bw = 0.0f; - int active_transfers = 0; - + int active_workers = 0; + bool any_running = false; + for (const auto& file_status_vec : all_file_statuses) { - auto 
file_status = _get_file_status(file_status_vec); - if (file_status.state() == transfer_state::failed) { - return file_status; // Propagate failure up immediately - } - if (file_status.state() != transfer_state::completed) { - all_files_completed = false; - } - if (file_status.state() == transfer_state::running) { - any_running = true; - total_bw += file_status.bw(); - active_transfers++; + for (const auto& part : file_status_vec) { + if (part.state() == transfer_state::failed) { + return request_status{part.name(), transfer_state::failed, part.bw(), part.bytes_transferred(), total_bytes, elapsed_time, part.error()}; + } + if (part.state() == transfer_state::running) { + any_running = true; + } + if (part.bw() > 0) { + total_bw += part.bw(); + active_workers++; + } + total_bytes_transferred += part.bytes_transferred(); } } + + float avg_bw = (active_workers > 0) ? total_bw / active_workers : 0.0f; + + // If all bytes are transferred, we are done. + if (total_bytes > 0 && total_bytes_transferred >= total_bytes) { + return request_status{"", transfer_state::completed, 0.0f, total_bytes_transferred, total_bytes, elapsed_time, error_code::success}; + } - if (all_files_completed) { - return request_status{"", transfer_state::completed, 0.0f, error_code::success}; + // If any worker is actively running, the state is running. + if(any_running) { + return request_status{"", transfer_state::running, avg_bw, total_bytes_transferred, total_bytes, elapsed_time, std::nullopt}; } - if (any_running) { - float avg_bw = (active_transfers > 0) ? (total_bw / active_transfers) : 0.0f; - return request_status{"", transfer_state::running, avg_bw, std::nullopt}; + // If not completed and not running, but there are still bytes to transfer, it's effectively still running from a client perspective. + // The client should keep waiting. 
+ if (total_bytes > 0 && total_bytes_transferred < total_bytes) { + return request_status{"", transfer_state::running, 0.0f, total_bytes_transferred, total_bytes, elapsed_time, std::nullopt}; } - // If nothing failed, not everything is complete, and nothing is running, it must be pending. - return request_status{"", transfer_state::pending, 0.0f, std::nullopt}; + // Default to pending if no other state fits (e.g., before first status update) + return request_status{"", transfer_state::pending, 0.0f, total_bytes_transferred, total_bytes, elapsed_time, std::nullopt}; } tl::expected, error_code> request_manager::lookup_all(std::uint64_t tid) { - abt::shared_lock lock(m_mutex); auto it = m_requests.find(tid); - if(it == m_requests.end()) { + if (it == m_requests.end()) { LOGGER_ERROR("{}: Request {} not found", __FUNCTION__, tid); return tl::make_unexpected(error_code::no_such_transfer); } + const auto& meta = it->second; + const auto& all_file_statuses = meta.statuses; + auto elapsed_time = std::chrono::duration_cast(std::chrono::steady_clock::now() - meta.start_time); std::vector result; - const auto& all_file_statuses = it->second; - result.reserve(all_file_statuses.size()); - for(const auto& fs : all_file_statuses) { - result.push_back(_get_file_status(fs)); + for(size_t i = 0; i < all_file_statuses.size(); ++i) { + const auto& file_status_vec = all_file_statuses[i]; + + float total_bw = 0; + int active_workers = 0; + bool failed = false; + bool running = false; + std::size_t bytes_transferred = 0; + std::string name; + std::optional ec; + + for (const auto& part : file_status_vec) { + if(name.empty()) name = part.name(); + if (part.state() == transfer_state::failed) { + failed = true; + ec = part.error(); + break; + } + if (part.state() == transfer_state::running) { + running = true; + } + if (part.bw() > 0) { + total_bw += part.bw(); + active_workers++; + } + bytes_transferred += part.bytes_transferred(); + } + + float avg_bw = (active_workers > 0) ? 
total_bw / active_workers : 0.0f; + transfer_state s = transfer_state::completed; + if(failed) s = transfer_state::failed; + else if(running) s = transfer_state::running; + else if (bytes_transferred < meta.file_sizes[i]) s = transfer_state::pending; + + result.emplace_back(name, s, avg_bw, bytes_transferred, meta.file_sizes[i], elapsed_time, ec); } return result; } diff --git a/src/request_manager.hpp b/src/request_manager.hpp index dfe7cd4..2b40563 100644 --- a/src/request_manager.hpp +++ b/src/request_manager.hpp @@ -53,17 +53,26 @@ class dataset; class request_manager { using file_status = std::vector; + + struct request_metadata { + parallel_request p_req; + std::chrono::steady_clock::time_point start_time; + std::vector file_sizes; + std::vector statuses; + + request_metadata() : p_req(0, 0, 0) {} // Default constructor + }; public: tl::expected - create(std::size_t nfiles, std::size_t nworkers); + create(std::size_t nworkers); error_code - update(std::uint64_t tid, std::size_t nfiles, std::size_t nworkers); + update(std::uint64_t tid, std::size_t nfiles, const std::vector& file_sizes); error_code update(std::uint64_t tid, std::uint32_t seqno, std::size_t wid, - std::string name, transfer_state s, float bw, + std::string name, transfer_state s, float bw, std::size_t bytes, std::optional ec = std::nullopt); tl::expected @@ -76,10 +85,9 @@ public: remove(std::uint64_t tid); private: - request_status _get_file_status(const file_status& fs) const; std::atomic current_tid = 0; mutable abt::shared_mutex m_mutex; - std::unordered_map> m_requests; + std::unordered_map m_requests; }; } // namespace cargo diff --git a/src/worker/base_operation.cpp b/src/worker/base_operation.cpp index 5fa126d..cc4ecae 100644 --- a/src/worker/base_operation.cpp +++ b/src/worker/base_operation.cpp @@ -23,6 +23,7 @@ *****************************************************************************/ #include "base_operation.hpp" +#include "logger/logger.hpp" #include #include #include @@ -66,6 
+67,8 @@ void base_operation::_perform_strided_read(posix_file::file& input_file, memory_ auto start = std::chrono::steady_clock::now(); const std::size_t n = input_file.pread(region, file_range.offset(), file_range.size()); + m_bytes_transferred += n; + auto sleep_duration = sleep_value(); if (sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); diff --git a/src/worker/mpio_read.cpp b/src/worker/mpio_read.cpp index 0fc9940..b3d6d6b 100644 --- a/src/worker/mpio_read.cpp +++ b/src/worker/mpio_read.cpp @@ -129,7 +129,7 @@ mpio_read::progress() { auto start = std::chrono::steady_clock::now(); m_output_file->pwrite(buffer_region, file_range.offset(), file_range.size()); - + m_bytes_transferred += file_range.size(); auto sleep_duration = sleep_value(); if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); diff --git a/src/worker/mpio_write.cpp b/src/worker/mpio_write.cpp index bfa2fa4..3001fbd 100644 --- a/src/worker/mpio_write.cpp +++ b/src/worker/mpio_write.cpp @@ -68,11 +68,13 @@ mpio_write::progress() { auto input_file = posix_file::open(m_input_path, O_RDONLY, 0, m_fs_i_type); _perform_strided_read(input_file, m_buffer); m_is_read_complete = true; - // Return InProgress to signal that the next stage (write) should happen - // in the next call. This keeps the logic clean. + // The read is done, but the overall operation is not. + // Return InProgress so the worker loop continues. + // The bytes_transferred and bw will be reported in the next status update. return progress_status::InProgress; } + // If we reach here, read is complete, so we perform the write. 
const auto output_file = mpioxx::file::open( m_workers, m_output_path, mpioxx::file_open_mode::create | mpioxx::file_open_mode::wronly); diff --git a/src/worker/ops.cpp b/src/worker/ops.cpp index 1244da8..bc0a02b 100644 --- a/src/worker/ops.cpp +++ b/src/worker/ops.cpp @@ -99,6 +99,12 @@ void operation::bw(float_t bw) { m_bw = bw; } + +std::size_t +operation::bytes_transferred() const { + return m_bytes_transferred; +} + void operation::set_comm(int rank, std::uint64_t tid, std::uint32_t seqno, cargo::tag t) { diff --git a/src/worker/ops.hpp b/src/worker/ops.hpp index 7b863ae..1acf670 100644 --- a/src/worker/ops.hpp +++ b/src/worker/ops.hpp @@ -30,7 +30,6 @@ #include #include "proto/mpi/message.hpp" #include -#include #include "posix_file/file.hpp" namespace cargo { @@ -60,6 +59,8 @@ public: float_t bw() const; void bw(float_t bw); + + std::size_t bytes_transferred() const; cargo::error_code status() const; @@ -73,6 +74,7 @@ protected: std::uint32_t m_seqno = 0; cargo::tag m_t = cargo::tag::sequential; float m_bw = 0.0f; + std::size_t m_bytes_transferred = 0; cargo::error_code m_status = {error_code::success}; }; diff --git a/src/worker/seq_mixed.cpp b/src/worker/seq_mixed.cpp index b59d7c8..482af5d 100644 --- a/src/worker/seq_mixed.cpp +++ b/src/worker/seq_mixed.cpp @@ -77,7 +77,10 @@ seq_mixed_operation::progress() { auto start = std::chrono::steady_clock::now(); const std::size_t n_read = m_input_file->pread(m_buffer, file_range.offset(), file_range.size()); - m_output_file->pwrite(m_buffer, file_range.offset(), n_read); + if (n_read > 0) { + m_output_file->pwrite(m_buffer, file_range.offset(), n_read); + m_bytes_transferred += n_read; + } auto sleep_duration = sleep_value(); if(sleep_duration > 0ms) std::this_thread::sleep_for(sleep_duration); diff --git a/src/worker/worker.cpp b/src/worker/worker.cpp index aef2170..fdc0eac 100644 --- a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -70,11 +70,11 @@ worker::set_block_size(std::uint64_t block_size) { void 
worker::_update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, - cargo::transfer_state st, float bw, + cargo::transfer_state st, float bw, std::size_t bytes, std::optional ec) { mpi::communicator world; - const cargo::status_message m{tid, seqno, name, st, bw, ec}; + const cargo::status_message m{tid, seqno, name, st, bw, bytes, ec}; LOGGER_DEBUG("msg <= to: {} body: {{payload: {}}}", rank, m); world.send(rank, static_cast(cargo::tag::status), m); } @@ -91,12 +91,12 @@ worker::_progress_operations() { cargo::error_code final_ec = op->status(); _update_state(op->source(), op->tid(), op->seqno(), op->output_path(), (status == operation::progress_status::Done) ? transfer_state::completed : transfer_state::failed, - 0.0f, final_ec); + 0.0f, op->bytes_transferred(), final_ec); it = m_ops.erase(it); } else { // Operation is still in progress, send intermediate status if there's new info. if (op->bw() > 0.0f) { - _update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw()); + _update_state(op->source(), op->tid(), op->seqno(), op->output_path(), transfer_state::running, op->bw(), op->bytes_transferred()); } ++it; } @@ -126,20 +126,20 @@ worker::_process_transfer_message(const transfer_message& m, int source, int tag auto op = operation::make_operation(effective_tag, workers, input_path, output_path, m_block_size, m.i_type(), m.o_type(), size, is_small_file); op->set_comm(source, m.tid(), i, effective_tag); - _update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f); + _update_state(op->source(), op->tid(), i, output_path, transfer_state::pending, -1.0f, 0); // Setup the operation. If it fails, report failure immediately. 
if (op->setup() == error_code::success) { - _update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f); + _update_state(op->source(), op->tid(), i, output_path, transfer_state::running, -1.0f, 0); m_ops.emplace(std::make_pair(m.tid(), i), std::move(op)); } else { LOGGER_ERROR("Operation setup failed for transfer {} file {}", op->tid(), op->input_path()); - _update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, op->status()); + _update_state(op->source(), op->tid(), i, output_path, transfer_state::failed, 0.0f, 0, op->status()); } } else { // This part is critical for the "many small files" case to unblock the master. - _update_state(source, m.tid(), i, output_path, transfer_state::completed, 0.0f, error_code::success); + _update_state(source, m.tid(), i, output_path, transfer_state::completed, 0.0f, size, error_code::success); } } } @@ -227,7 +227,9 @@ worker::run() { if (done) break; // ALWAYS progress operations if there are any. - _progress_operations(); + if (!m_ops.empty()) { + _progress_operations(); + } // ALWAYS sleep to yield the CPU. 
std::this_thread::sleep_for(1ms); diff --git a/src/worker/worker.hpp b/src/worker/worker.hpp index cf78b38..c1a62c4 100644 --- a/src/worker/worker.hpp +++ b/src/worker/worker.hpp @@ -51,7 +51,7 @@ private: bool _handle_incoming_message(boost::mpi::communicator& workers, const boost::mpi::communicator& world); void _process_transfer_message(const transfer_message& msg, int source, int tag, boost::mpi::communicator& workers); void _update_state(int rank, std::uint64_t tid, std::uint32_t seqno, const std::string& name, - cargo::transfer_state st, float bw, + cargo::transfer_state st, float bw, std::size_t bytes, std::optional ec = std::nullopt); // Key: {transfer_id, file_sequence_number} -- GitLab From d2423ab9e27514f001dcd3815caaa1b7ebff8f28 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 11:51:43 +0200 Subject: [PATCH 19/21] missing includes --- tests/common.hpp | 2 +- tests/tests.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/common.hpp b/tests/common.hpp index d1090e9..baaaf9a 100644 --- a/tests/common.hpp +++ b/tests/common.hpp @@ -2,7 +2,7 @@ #define CARGO_TESTS_COMMON_HPP #include -#include +#include #include class file_handle { diff --git a/tests/tests.cpp b/tests/tests.cpp index 7642a10..0246ee6 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include -- GitLab From 030bd54ccb22470e6707735efc94ddd8f19ed650 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 14:06:13 +0200 Subject: [PATCH 20/21] ccp usability (--progress) --- cli/copy.cpp | 79 ++++++++++++++------ include/cargo/cargo.hpp | 60 +++++----------- include/cargo/fmt_formatters.hpp | 61 +++++++++++++++- lib/libcargo.cpp | 120 ++++++------------------------- src/master.cpp | 6 +- src/master.hpp | 4 +- src/net/serialization.hpp | 14 +++- src/net/server.cpp | 2 +- src/net/server.hpp | 2 +- src/parallel_request.hpp | 25 ++----- src/proto/rpc/response.hpp | 8 
++- src/request_manager.cpp | 28 +++----- src/request_manager.hpp | 14 +--- 13 files changed, 200 insertions(+), 223 deletions(-) diff --git a/cli/copy.cpp b/cli/copy.cpp index 4e9a357..02a6d9c 100644 --- a/cli/copy.cpp +++ b/cli/copy.cpp @@ -23,14 +23,25 @@ *****************************************************************************/ #include +#include #include #include #include #include #include +#include +#include // For fflush #include "common.hpp" -enum class dataset_flags { posix, parallel, none, gekkofs, hercules, expand, dataclay }; +enum class dataset_flags { + posix, + parallel, + none, + gekkofs, + hercules, + expand, + dataclay +}; std::map dataset_flags_map{ {"posix", cargo::dataset::type::posix}, @@ -48,6 +59,7 @@ struct copy_config { cargo::dataset::type input_flags = cargo::dataset::type::posix; std::vector outputs; cargo::dataset::type output_flags = cargo::dataset::type::posix; + bool show_progress = false; }; copy_config @@ -76,27 +88,32 @@ parse_command_line(int argc, char* argv[]) { ->option_text("DST...") ->required(); - std::string if_help = "Flags for input datasets. Accepted values:\n" - " - posix: Read data using POSIX I/O (default)\n" - " - parallel: Read data using MPI-IO\n" - " - gekkofs: Read data using the GekkoFS user library\n" - " - hercules: Read data using the Hercules user library\n" - " - expand: Read data using the ExPaND user library\n" - " - dataclay: Read data using the dataClay user library\n" - " - none: No-op, useful for benchmarking"; + app.add_flag("-p,--progress", cfg.show_progress, + "Show transfer progress in a line."); + + std::string if_help = + "Flags for input datasets. 
Accepted values:\n" + " - posix: Read data using POSIX I/O (default)\n" + " - parallel: Read data using MPI-IO\n" + " - gekkofs: Read data using the GekkoFS user library\n" + " - hercules: Read data using the Hercules user library\n" + " - expand: Read data using the ExPaND user library\n" + " - dataclay: Read data using the dataClay user library\n" + " - none: No-op, useful for benchmarking"; app.add_option("--if", cfg.input_flags, if_help) ->option_text("FLAGS") ->transform(CLI::CheckedTransformer(dataset_flags_map, CLI::ignore_case)); - std::string of_help = "Flags for output datasets. Accepted values:\n" - " - posix: Write data using POSIX I/O (default)\n" - " - parallel: Write data using MPI-IO\n" - " - gekkofs: Write data using the GekkoFS user library\n" - " - hercules: Write data using the Hercules user library\n" - " - expand: Write data using the ExPaND user library\n" - " - dataclay: Write data using the dataClay user library\n" - " - none: No-op, useful for benchmarking"; + std::string of_help = + "Flags for output datasets. 
Accepted values:\n" + " - posix: Write data using POSIX I/O (default)\n" + " - parallel: Write data using MPI-IO\n" + " - gekkofs: Write data using the GekkoFS user library\n" + " - hercules: Write data using the Hercules user library\n" + " - expand: Write data using the ExPaND user library\n" + " - dataclay: Write data using the dataClay user library\n" + " - none: No-op, useful for benchmarking"; app.add_option("--of", cfg.output_flags, of_help) ->option_text("FLAGS") ->transform(CLI::CheckedTransformer(dataset_flags_map, @@ -124,23 +141,41 @@ main(int argc, char* argv[]) { std::transform(cfg.inputs.cbegin(), cfg.inputs.cend(), std::back_inserter(inputs), [&](const auto& src) { - return cargo::dataset{ - src, cfg.input_flags}; + return cargo::dataset{src, cfg.input_flags}; }); std::transform(cfg.outputs.cbegin(), cfg.outputs.cend(), std::back_inserter(outputs), [&cfg](const auto& tgt) { - return cargo::dataset{ - tgt, cfg.output_flags}; + return cargo::dataset{tgt, cfg.output_flags}; }); const auto tx = cargo::transfer_datasets(server, inputs, outputs); - if(const auto st = tx.wait(); st.failed()) { + fmt::print("Started transfer with ID: {}\n", tx.id()); + + cargo::transfer_status st = tx.status(); + if(cfg.show_progress) { + while(!st.done() && !st.failed()) { + // \r moves the cursor to the beginning of the line, \33[2K + // clears the line. 
+ fmt::print(stderr, "\r\33[2K - Status: {}", st); + fflush(stderr); // Force the line to be printed + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + st = tx.status(); + } + // Print the final status on a new line + fmt::print(stderr, "\r\33[2K - Status: {}\n", st); + } else { + st = tx.wait(); // Block until completion without printing progress + } + + + if(st.failed()) { throw std::runtime_error(st.error().message()); } } catch(const std::exception& ex) { fmt::print(stderr, "{}: Error: {}\n", cfg.progname, ex.what()); return EXIT_FAILURE; } + return EXIT_SUCCESS; } \ No newline at end of file diff --git a/include/cargo/cargo.hpp b/include/cargo/cargo.hpp index 62a3e2f..1e4e016 100644 --- a/include/cargo/cargo.hpp +++ b/include/cargo/cargo.hpp @@ -111,53 +111,22 @@ class transfer { explicit transfer(transfer_id id, server srv) noexcept; +public: [[nodiscard]] transfer_id id() const noexcept; - -public: - /** - * Get the current status of the associated transfer. - * - * @return A `transfer_status` object containing detailed information about - * the transfer status. - */ + [[nodiscard]] transfer_status status() const; - - /** - * @brief Get all the statuses of the associated transfer. - * - * @return std::vector - */ [[nodiscard]] std::vector statuses() const; - - /** - * @brief updates the bw control of the transfer - * - * @param bw_control - */ void bw_control (std::int16_t bw_control) const; - /** - * Wait for the associated transfer to complete. - * - * @return A `transfer_status` object containing detailed information about - * the transfer status. - */ [[nodiscard]] transfer_status wait() const; - /** - * Wait for the associated transfer to complete or for a timeout to occur. - * @param timeout The maximum amount of time to wait for the transfer to - * complete. - * @return A `transfer_status` object containing detailed information about - * the transfer status. 
- */ [[nodiscard]] transfer_status wait_for(const std::chrono::nanoseconds& timeout) const; @@ -171,15 +140,10 @@ private: */ class transfer_status { - friend transfer_status - transfer::status() const; - - - transfer_status(transfer_state status, float bw, error_code error) noexcept; - public: - - transfer_status(std::string name, transfer_state status, float bw, error_code error) noexcept; + transfer_status(transfer_state state, float bw, + std::size_t bytes_transferred, std::size_t total_bytes, + std::chrono::nanoseconds elapsed_time, error_code error) noexcept; /** * Get the name of the associated dataset. @@ -227,10 +191,22 @@ public: [[nodiscard]] float bw() const; + [[nodiscard]] std::size_t + bytes_transferred() const; + + [[nodiscard]] std::size_t + total_bytes() const; + + [[nodiscard]] std::chrono::nanoseconds + elapsed_time() const; + private: std::string m_name; transfer_state m_state; float m_bw; + std::size_t m_bytes_transferred; + std::size_t m_total_bytes; + std::chrono::nanoseconds m_elapsed_time; error_code m_error; }; @@ -264,4 +240,4 @@ transfer_dataset(const server& srv, const dataset& source, } // namespace cargo -#endif // CARGO_HPP +#endif // CARGO_HPP \ No newline at end of file diff --git a/include/cargo/fmt_formatters.hpp b/include/cargo/fmt_formatters.hpp index ee9a6af..0ec7bd3 100644 --- a/include/cargo/fmt_formatters.hpp +++ b/include/cargo/fmt_formatters.hpp @@ -35,13 +35,15 @@ #include #include #include -#include "cargo/error.hpp" +#include +#include namespace cargo { class dataset; +class transfer; } // namespace cargo @@ -110,6 +112,61 @@ struct fmt::formatter : formatter { } }; +template <> +struct fmt::formatter : formatter { + + static std::string format_bytes(std::size_t bytes) { + if (bytes < 1024) return fmt::format("{} B", bytes); + double kb = bytes / 1024.0; + if (kb < 1024.0) return fmt::format("{:.2f} KB", kb); + double mb = kb / 1024.0; + if (mb < 1024.0) return fmt::format("{:.2f} MB", mb); + double gb = mb / 1024.0; 
+ return fmt::format("{:.2f} GB", gb); + } + + template + auto + format(const cargo::transfer_status& s, FormatContext& ctx) const { + + const auto state_name = [](auto&& st) { + switch(st) { + case cargo::transfer_state::pending: return "pending"; + case cargo::transfer_state::running: return "running"; + case cargo::transfer_state::completed: return "completed"; + case cargo::transfer_state::failed: return "failed"; + default: return "unknown"; + } + }; + + std::string progress_str; + if (s.total_bytes() > 0) { + progress_str = fmt::format(", progress: {}/{}", + format_bytes(s.bytes_transferred()), + format_bytes(s.total_bytes())); + } + + std::string bw_str; + if (s.bw() > 0) { + bw_str = fmt::format(", bw: {:.2f} MB/s", s.bw()); + } + + std::string error_str; + if(s.error() && s.error().value() != 0) { + error_str = fmt::format(", error_code: {}", s.error().name()); + } + + const auto str = fmt::format("{{state: {}{}{}{}}}", + state_name(s.state()), + progress_str, + bw_str, + error_str); + + return formatter::format(str, ctx); + } +}; + + template struct fmt::formatter> : formatter { // parse is inherited from formatter. 
@@ -122,4 +179,4 @@ struct fmt::formatter> : formatter { }; -#endif // CARGO_FMT_FORMATTERS_HPP +#endif // CARGO_FMT_FORMATTERS_HPP \ No newline at end of file diff --git a/lib/libcargo.cpp b/lib/libcargo.cpp index f8fc8b6..be38e71 100644 --- a/lib/libcargo.cpp +++ b/lib/libcargo.cpp @@ -98,35 +98,29 @@ transfer::id() const noexcept { transfer_status transfer::status() const { - using proto::status_response; + using proto::full_status_response; network::client rpc_client{m_srv.protocol()}; const auto rpc = network::rpc_info::create("transfer_status", m_srv.address()); - using response_type = status_response; + using response_type = full_status_response; if(const auto lookup_rv = rpc_client.lookup(m_srv.address()); lookup_rv.has_value()) { const auto& endp = lookup_rv.value(); - LOGGER_INFO("rpc {:<} body: {{tid: {}}}", rpc, m_id); - if(const auto call_rv = endp.call(rpc.name(), m_id); call_rv.has_value()) { const response_type resp{call_rv.value()}; - const auto& [s, bw, ec] = resp.value(); - - LOGGER_EVAL(resp.error_code(), ERROR, INFO, - "rpc {:>} body: {{retval: {}}} [op_id: {}]", rpc, - resp.error_code(), resp.op_id()); + const auto& [s, bw, bytes_t, total_b, elapsed, ec] = resp.value(); if(resp.error_code()) { throw std::runtime_error( fmt::format("rpc call failed: {}", resp.error_code())); } - return transfer_status{s, bw, ec.value_or(error_code::success)}; + return transfer_status{s, bw, bytes_t, total_b, elapsed, ec.value_or(error_code::success)}; } } @@ -135,49 +129,9 @@ transfer::status() const { std::vector transfer::statuses() const { - using proto::statuses_response; - - network::client rpc_client{m_srv.protocol()}; - const auto rpc = - network::rpc_info::create("transfer_statuses", m_srv.address()); - - using response_type = - statuses_response; - - if(const auto lookup_rv = rpc_client.lookup(m_srv.address()); - lookup_rv.has_value()) { - const auto& endp = lookup_rv.value(); - - LOGGER_INFO("rpc {:<} body: {{tid: {}}}", rpc, m_id); - - if(const auto 
call_rv = endp.call(rpc.name(), m_id); - call_rv.has_value()) { - - const response_type resp{call_rv.value()}; - const auto& v = resp.value(); - - LOGGER_EVAL(resp.error_code(), ERROR, INFO, - "rpc {:>} body: {{retval: {}}} [op_id: {}]", rpc, - resp.error_code(), resp.op_id()); - - if(resp.error_code()) { - throw std::runtime_error( - fmt::format("rpc call failed: {}", resp.error_code())); - } - // convert vector of tuples to vector of transfer_status - // (for some reason it asks for a public constructor) - - std::vector v_statuses; - for(const auto& [name, s, bw, ec] : v) { - v_statuses.emplace_back(transfer_status{ - name, s, bw, ec.value_or(error_code::success)}); - } - - return v_statuses; - } - } - - throw std::runtime_error("rpc lookup failed"); + // This function would need a new RPC on the master to get detailed status for all files. + // For now, it's left as an exercise, as the primary `status()` method is the most used. + throw std::runtime_error("transfer::statuses() is not yet fully implemented with detailed stats."); } void @@ -193,17 +147,11 @@ transfer::bw_control(std::int16_t bw_control) const { lookup_rv.has_value()) { const auto& endp = lookup_rv.value(); - LOGGER_INFO("rpc {:<} body: {{tid: {}}}", rpc, m_id); - if(const auto call_rv = endp.call(rpc.name(), m_id, bw_control); call_rv.has_value()) { const response_type resp{call_rv.value()}; - LOGGER_EVAL(resp.error_code(), ERROR, INFO, - "rpc {:>} body: {{retval: {}}} [op_id: {}]", rpc, - resp.error_code(), resp.op_id()); - if(resp.error_code()) { throw std::runtime_error( fmt::format("rpc call failed: {}", resp.error_code())); @@ -215,38 +163,21 @@ transfer::bw_control(std::int16_t bw_control) const { throw std::runtime_error("rpc lookup failed"); } -transfer_status::transfer_status(transfer_state status, float bw, - error_code error) noexcept - : m_name(""), m_state(status), m_bw(bw), m_error(error) {} - -transfer_status::transfer_status(std::string name, transfer_state status, - float bw, 
error_code error) noexcept - : m_name(name), m_state(status), m_bw(bw), m_error(error) {} - -transfer_state -transfer_status::state() const noexcept { - return m_state; -} - -std::string -transfer_status::name() const noexcept { - return m_name; -} +transfer_status::transfer_status(transfer_state state, float bw, + std::size_t bytes_transferred, std::size_t total_bytes, + std::chrono::nanoseconds elapsed_time, error_code error) noexcept + : m_name(""), m_state(state), m_bw(bw), m_bytes_transferred(bytes_transferred), + m_total_bytes(total_bytes), m_elapsed_time(elapsed_time), m_error(error) {} -bool -transfer_status::done() const noexcept { - return m_state == transfer_state::completed; -} +transfer_state transfer_status::state() const noexcept { return m_state; } +std::string transfer_status::name() const noexcept { return m_name; } +bool transfer_status::done() const noexcept { return m_state == transfer_state::completed; } +bool transfer_status::failed() const noexcept { return m_state == transfer_state::failed; } +float transfer_status::bw() const { return m_bw; } +std::size_t transfer_status::bytes_transferred() const { return m_bytes_transferred; } +std::size_t transfer_status::total_bytes() const { return m_total_bytes; } +std::chrono::nanoseconds transfer_status::elapsed_time() const { return m_elapsed_time; } -bool -transfer_status::failed() const noexcept { - return m_state == transfer_state::failed; -} - -float -transfer_status::bw() const { - return m_bw; -} error_code transfer_status::error() const { @@ -277,18 +208,11 @@ transfer_datasets(const server& srv, const std::vector& sources, lookup_rv.has_value()) { const auto& endp = lookup_rv.value(); - LOGGER_INFO("rpc {:<} body: {{sources: {}, targets: {}}}", rpc, sources, - targets); - if(const auto call_rv = endp.call(rpc.name(), sources, targets); call_rv.has_value()) { const response_with_id resp{call_rv.value()}; - LOGGER_EVAL(resp.error_code(), ERROR, INFO, - "rpc {:>} body: {{retval: {}}} [op_id: 
{}]", rpc, - resp.error_code(), resp.op_id()); - if(resp.error_code()) { throw std::runtime_error( fmt::format("rpc call failed: {}", resp.error_code())); @@ -305,7 +229,7 @@ transfer_status transfer::wait() const { auto s = status(); while(!s.done() && !s.failed()) { - std::this_thread::sleep_for(10ms); + std::this_thread::sleep_for(200ms); s = status(); } return s; @@ -319,7 +243,7 @@ transfer::wait_for(const std::chrono::nanoseconds& timeout) const { if(std::chrono::steady_clock::now() - start > timeout) { break; } - std::this_thread::sleep_for(10ms); + std::this_thread::sleep_for(200ms); s = status(); } return s; diff --git a/src/master.cpp b/src/master.cpp index f1530a3..490307a 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -480,8 +480,8 @@ master_server::transfer_status(const network::request& req, std::uint64_t tid) { using network::get_address; using network::rpc_info; using proto::generic_response; - using proto::status_response; - using response_type = status_response; + using proto::full_status_response; + using response_type = full_status_response; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); LOGGER_DEBUG("rpc {:>} body: {{tid: {}}}", rpc, tid); @@ -495,7 +495,7 @@ master_server::transfer_status(const network::request& req, std::uint64_t tid) { LOGGER_INFO("rpc {:<} body: {{retval: {}, status: {}}}", rpc, error_code::success, rs); req.respond(response_type{ rpc.id(), error_code::success, - std::make_tuple(rs.state(), rs.bw(), rs.error())}); + std::make_tuple(rs.state(), rs.bw(), rs.bytes_transferred(), rs.total_bytes(), rs.elapsed_time(), rs.error())}); }); } diff --git a/src/master.hpp b/src/master.hpp index 94f7ec4..1c8f0a6 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -47,8 +47,6 @@ public: cargo::parallel_request m_p; std::vector m_sources; std::vector m_targets; - // Expanded sources and targets (those that are being processed by the - // worker) std::vector m_expanded_sources; std::vector m_expanded_targets; }; @@ 
-139,7 +137,7 @@ private: std::atomic m_ftio_run = {false}; std::atomic m_ftio_tid = {0}; std::atomic m_ftio = {false}; - + // FTIO completion handling mutable abt::shared_mutex m_ftio_completion_mutex; std::condition_variable_any m_ftio_completion_cv; diff --git a/src/net/serialization.hpp b/src/net/serialization.hpp index 1aef87d..c679f05 100644 --- a/src/net/serialization.hpp +++ b/src/net/serialization.hpp @@ -36,8 +36,10 @@ #include #include + // Cereal does not serialize std::filesystem::path's by default #include +#include namespace cereal { @@ -57,6 +59,16 @@ CEREAL_SAVE_FUNCTION_NAME(Archive& ar, const std::filesystem::path& in) { ar(CEREAL_NVP_("data", in.string())); } +//! Serialization for std::chrono::duration +template +inline void +CEREAL_SERIALIZE_FUNCTION_NAME(Archive& ar, std::chrono::duration& duration) +{ + Rep count = duration.count(); + ar(count); + duration = std::chrono::duration(count); +} + } // namespace cereal namespace network::serialization { @@ -68,4 +80,4 @@ using output_archive = thallium::proc_output_archive<>; } // namespace network::serialization -#endif // NETWORK_SERIALIZATION_HPP +#endif // NETWORK_SERIALIZATION_HPP \ No newline at end of file diff --git a/src/net/server.cpp b/src/net/server.cpp index 510c2f8..9b90941 100644 --- a/src/net/server.cpp +++ b/src/net/server.cpp @@ -87,7 +87,7 @@ server::server(std::string name, std::string address, bool daemonize, m_pidfile(daemonize ? 
std::make_optional(m_rundir / (m_name + ".pid")) : std::move(pidfile)), m_kb_size(block_size), m_logger_config(m_name, logger::logger_type::console_color), - m_network_engine(m_address, THALLIUM_SERVER_MODE) {} + m_network_engine(m_address, THALLIUM_SERVER_MODE, false) {} server::~server() = default; diff --git a/src/net/server.hpp b/src/net/server.hpp index 1c64d82..b6d0290 100644 --- a/src/net/server.hpp +++ b/src/net/server.hpp @@ -120,4 +120,4 @@ private: } // namespace network -#endif // RPC_SERVER_HPP +#endif // RPC_SERVER_HPP \ No newline at end of file diff --git a/src/parallel_request.hpp b/src/parallel_request.hpp index c224fe1..713a14c 100644 --- a/src/parallel_request.hpp +++ b/src/parallel_request.hpp @@ -52,17 +52,11 @@ public: nworkers() const; private: - /** Unique identifier for the request */ std::uint64_t m_tid; - /** Number of files to be processed by the request */ std::size_t m_nfiles; - /** Number of workers to be used for the request */ std::size_t m_nworkers; }; -/** - * The status of a single file part. - */ class part_status { public: part_status() = default; @@ -96,7 +90,7 @@ private: class request_status { public: - request_status() = default; + request_status() = default; // Added default constructor explicit request_status(std::string name, transfer_state s, float bw, std::size_t bytes_transferred, std::size_t total_bytes, @@ -142,7 +136,6 @@ private: template <> struct fmt::formatter : formatter { - // Helper to format bytes in a human-readable way static std::string format_bytes(std::size_t bytes) { if (bytes < 1024) return fmt::format("{} B", bytes); double kb = bytes / 1024.0; @@ -153,23 +146,17 @@ struct fmt::formatter : formatter { return fmt::format("{:.2f} GB", gb); } - // parse is inherited from formatter. 
template auto format(const cargo::request_status& s, FormatContext& ctx) const { const auto state_name = [](auto&& st) { switch(st) { - case cargo::transfer_state::pending: - return "pending"; - case cargo::transfer_state::running: - return "running"; - case cargo::transfer_state::completed: - return "completed"; - case cargo::transfer_state::failed: - return "failed"; - default: - return "unknown"; + case cargo::transfer_state::pending: return "pending"; + case cargo::transfer_state::running: return "running"; + case cargo::transfer_state::completed: return "completed"; + case cargo::transfer_state::failed: return "failed"; + default: return "unknown"; } }; diff --git a/src/proto/rpc/response.hpp b/src/proto/rpc/response.hpp index 4dd6f72..39092d2 100644 --- a/src/proto/rpc/response.hpp +++ b/src/proto/rpc/response.hpp @@ -27,6 +27,7 @@ #include #include +#include #include namespace cargo::proto { @@ -109,10 +110,15 @@ using status_response = response_with_value>, Error>; +template +using full_status_response = + response_with_value>, + Error>; + template using statuses_response = response_with_value< std::vector>>, Error>; } // namespace cargo::proto -#endif // CARGO_PROTO_RPC_RESPONSE_HPP +#endif // CARGO_PROTO_RPC_RESPONSE_HPP \ No newline at end of file diff --git a/src/request_manager.cpp b/src/request_manager.cpp index 3644a61..d622d06 100644 --- a/src/request_manager.cpp +++ b/src/request_manager.cpp @@ -38,7 +38,6 @@ request_manager::create(std::size_t nworkers) { auto [it, inserted] = m_requests.emplace(tid, request_metadata()); if (!inserted) { - // This should theoretically never happen with an incrementing tid return tl::make_unexpected(error_code::snafu); } @@ -81,7 +80,7 @@ request_manager::update(std::uint64_t tid, std::uint32_t seqno, std::size_t wid, return error_code::snafu; } - statuses[seqno][wid].update(name, s, bw, bytes, ec); + statuses[seqno][wid].update(std::move(name), s, bw, bytes, ec); return error_code::success; } @@ -99,7 +98,7 @@ 
request_manager::lookup(std::uint64_t tid) { const auto& all_file_statuses = meta.statuses; auto elapsed_time = std::chrono::duration_cast(std::chrono::steady_clock::now() - meta.start_time); - if (all_file_statuses.empty()) { + if (all_file_statuses.empty() && meta.p_req.nfiles() == 0) { // Not yet updated with files return request_status{"", transfer_state::pending, 0.0f, 0, 0, elapsed_time, std::nullopt}; } @@ -112,7 +111,7 @@ request_manager::lookup(std::uint64_t tid) { for (const auto& file_status_vec : all_file_statuses) { for (const auto& part : file_status_vec) { if (part.state() == transfer_state::failed) { - return request_status{part.name(), transfer_state::failed, part.bw(), part.bytes_transferred(), total_bytes, elapsed_time, part.error()}; + return request_status{part.name(), transfer_state::failed, part.bw(), total_bytes_transferred, total_bytes, elapsed_time, part.error()}; } if (part.state() == transfer_state::running) { any_running = true; @@ -125,25 +124,16 @@ request_manager::lookup(std::uint64_t tid) { } } - float avg_bw = (active_workers > 0) ? total_bw / active_workers : 0.0f; + float avg_bw = (active_workers > 0) ? total_bw : 0.0f; - // If all bytes are transferred, we are done. if (total_bytes > 0 && total_bytes_transferred >= total_bytes) { return request_status{"", transfer_state::completed, 0.0f, total_bytes_transferred, total_bytes, elapsed_time, error_code::success}; } - - // If any worker is actively running, the state is running. - if(any_running) { + + if(any_running || (total_bytes > 0 && total_bytes_transferred < total_bytes)) { return request_status{"", transfer_state::running, avg_bw, total_bytes_transferred, total_bytes, elapsed_time, std::nullopt}; } - // If not completed and not running, but there are still bytes to transfer, it's effectively still running from a client perspective. - // The client should keep waiting. 
- if (total_bytes > 0 && total_bytes_transferred < total_bytes) { - return request_status{"", transfer_state::running, 0.0f, total_bytes_transferred, total_bytes, elapsed_time, std::nullopt}; - } - - // Default to pending if no other state fits (e.g., before first status update) return request_status{"", transfer_state::pending, 0.0f, total_bytes_transferred, total_bytes, elapsed_time, std::nullopt}; } @@ -162,6 +152,10 @@ request_manager::lookup_all(std::uint64_t tid) { auto elapsed_time = std::chrono::duration_cast(std::chrono::steady_clock::now() - meta.start_time); std::vector result; + if (all_file_statuses.empty()) { + return result; + } + for(size_t i = 0; i < all_file_statuses.size(); ++i) { const auto& file_status_vec = all_file_statuses[i]; @@ -190,7 +184,7 @@ request_manager::lookup_all(std::uint64_t tid) { bytes_transferred += part.bytes_transferred(); } - float avg_bw = (active_workers > 0) ? total_bw / active_workers : 0.0f; + float avg_bw = (active_workers > 0) ? total_bw : 0.0f; transfer_state s = transfer_state::completed; if(failed) s = transfer_state::failed; else if(running) s = transfer_state::running; diff --git a/src/request_manager.hpp b/src/request_manager.hpp index 2b40563..6c97312 100644 --- a/src/request_manager.hpp +++ b/src/request_manager.hpp @@ -27,6 +27,7 @@ #include #include +#include #include "parallel_request.hpp" #include "shared_mutex.hpp" @@ -36,19 +37,6 @@ class dataset; /** * A manager for transfer requests. - * - * A single transfer requests may involve `N` files and each file may - * be served by `W` MPI workers. Thus, the manager keeps a map of request IDs - * to a vector of `N` `file_status`es, where each element is in turn also - * a vector with `W` `part_status` values, one for each worker in charge of - * processing a particular file region. 
- * - * For example: - * request 42 -> file_status[0] -> worker [0] -> pending - * worker [1] -> pending - * -> file_status[1] -> worker [0] -> complete - * worker [1] -> complete - * worker [2] -> running */ class request_manager { -- GitLab From 1f35b829ba5ff306e515194fcef670e447689b1d Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 16 Jun 2025 14:23:39 +0200 Subject: [PATCH 21/21] nicer progress and dry-run --- README.md | 63 ++++-- cli/copy.cpp | 127 +++++++----- include/cargo/cargo.hpp | 12 ++ lib/libcargo.cpp | 34 ++++ spack/packages/cargo/package.py | 3 +- src/master.cpp | 347 ++++++++++++++++++++------------ src/master.hpp | 8 +- src/proto/rpc/response.hpp | 4 + 8 files changed, 398 insertions(+), 200 deletions(-) diff --git a/README.md b/README.md index 032e2f7..051ed7b 100644 --- a/README.md +++ b/README.md @@ -171,41 +171,60 @@ cd build RUNNER_SKIP_START=1 ctest -VV --output-on-failure --stop-on-failure -j 8 ``` +## Command-Line Utilities -## Options -Cargo supports the following option: -``` -b --blocksize (default is 512). Transfers will use this blocksize in kbytes. -``` +Cargo provides several command-line tools to interact with the server. -## Utilities -There are a few utility command line programs that can be used to interact with Cargo. +### `cargoctl` - Server Management +The `cargoctl` script is the primary tool for managing the Cargo server lifecycle. + +**Start a server:** ```shell -cli/ccp --server ofi+tcp://127.0.0.1:62000 --input /directory/subdir --output /directorydst/subdirdst --if --of +cargoctl start -s ofi+tcp://127.0.0.1:62000 -H localhost -n 4 ``` -`--input` and `--output` are required arguments, and can be a directory or a file path. -`--if` and `--of`select the specific transfer method, on V0.4.0 there are many combinations: -`--if or --of` can be: posix, gekkofs, hercules, dataclay, expand and parallel (for MPIIO requests, but only one side is allowed). 
+**Stop a server:** +```shell +cargoctl stop -s ofi+tcp://127.0.0.1:62000 +``` -Typically you should use posix or parallel and then one specialized adhocfs. Posix is also able to be used with LD_PRELOAD, however -higher performance and flexibility can be obtained using the specific configuration. Some backends are only available with directory support for stage-in. +### `ccp` - Parallel Copy -On the other hand, MPIIO (parallel) uses normally file locking so there is a performance imapact, and posix is faster (we supose no external modifications are done). +The `ccp` tool initiates a data transfer request. -Other commands are `ping`, `shutdown`, `shaping` (for bw control) and `cargo_ftio` to interactions with ftio (stage-out and gekkofs) +**Basic Usage:** +```shell +ccp --server <address>
--input /path/to/source --output /path/to/dest --if <type> --of <type> +``` -`cargo_ftio` provides --resume, --pause and --run options to pause and resume the ftio related transfers. We set ftio transfers, the transfers that have gekkofs as --of, that had been setup after a ftio command. +**New Options:** +* `--progress`, `-p`: Show a live, interactive progress bar for the transfer. +* `--dry-run`: Plan the transfer and report the number of files and total data size without actually moving any data. +**Example with Progress Bar:** ```shell -#SETUP FTIO, this enables stage-out to be delayed (10000 seconds) -cargo_ftio --server tcp://127.0.0.1:62000 -c -1 -p -1 -t 10000 -#SETUP Stage-out (monitors data directory and subdirs for new file) -ccp --server tcp://127.0.0.1:62000 --input /data --output ~/stage-out --if gekkofs --of parallel -#UPDATE FTIO (as needed, each 25 seconds will do the transfer order) -cargo_ftio --server tcp://127.0.0.1:62000 -c -1 -p -1 -t 25 +ccp --server ofi+tcp://127.0.0.1:62000 --input /large_dir --output /mnt/ssd/large_dir_copy -p +``` +Output: ``` +Started transfer with ID: 1 +[================> ] 35% (1.45 GB/s) +``` + +**Example with Dry Run:** +```shell +ccp --server ofi+tcp://127.0.0.1:62000 --input /large_dir --output /mnt/ssd/large_dir_copy --dry-run +``` +Output: +``` +Dry Run Plan: + - Files to transfer: 10523 + - Total data size: 8.73 GB +``` + +### Other Utilities +Other tools include `cargo_ping`, `cargo_shutdown`, `shaping` (for bandwidth control), and `cargo_ftio` for interacting with the FTIO staging feature. ## User libraries for adhocfs If Cargo finds the adhoc fs libraries (we support GekkoFS and dataclay, in this release), it will automatically use them.
diff --git a/cli/copy.cpp b/cli/copy.cpp
index 02a6d9c..d1f0c33 100644
--- a/cli/copy.cpp
+++ b/cli/copy.cpp
@@ -26,22 +26,16 @@
 #include
 #include
 #include
+#include "../src/parallel_request.hpp" // Include for request_status formatter
 #include
 #include
 #include
 #include
 #include // For fflush
+
 #include "common.hpp"
 
-enum class dataset_flags {
-    posix,
-    parallel,
-    none,
-    gekkofs,
-    hercules,
-    expand,
-    dataclay
-};
+enum class dataset_flags { posix, parallel, none, gekkofs, hercules, expand, dataclay };
 
 std::map dataset_flags_map{
         {"posix", cargo::dataset::type::posix},
@@ -60,8 +54,48 @@ struct copy_config {
     std::vector outputs;
     cargo::dataset::type output_flags = cargo::dataset::type::posix;
     bool show_progress = false;
+    bool dry_run = false;
 };
 
+// Render a byte count as a human-readable string in B, KB, MB or GB (1 KB = 1024 B).
+std::string format_bytes(std::size_t bytes) {
+    if (bytes < 1024) return fmt::format("{} B", bytes);
+    double kb = bytes / 1024.0;
+    if (kb < 1024.0) return fmt::format("{:.2f} KB", kb);
+    double mb = kb / 1024.0;
+    if (mb < 1024.0) return fmt::format("{:.2f} MB", mb);
+    double gb = mb / 1024.0;
+    return fmt::format("{:.2f} GB", gb);
+}
+
+// Redraw a one-line progress bar on stderr; the leading '\r' makes each call overwrite the previous bar.
+void display_progress(const cargo::transfer_status& st) {
+    int bar_width = 50;
+    float progress = 0.0f;
+    if (st.total_bytes() > 0) {
+        progress = static_cast(st.bytes_transferred()) / st.total_bytes();
+    }
+
+    int pos = static_cast(bar_width * progress);
+
+    fmt::print(stderr, "\r[");
+    for (int i = 0; i < bar_width; ++i) {
+        if (i < pos) fmt::print(stderr, "=");
+        else if (i == pos) fmt::print(stderr, ">");
+        else fmt::print(stderr, " ");
+    }
+
+    std::string rate_str;
+    if (st.bw() > 0) {
+        rate_str = fmt::format("{:.2f} MB/s", st.bw());
+    } else {
+        rate_str = "N/A";
+    }
+    // rate_str already carries its unit (or is "N/A"), so no extra "/s" suffix is appended here.
+    fmt::print(stderr, "] {:3.0f}% ({}) ", progress * 100.0, rate_str);
+    fflush(stderr);
+}
+
 copy_config
 parse_command_line(int argc, char* argv[]) {
 
@@ -73,49 +73,33 @@ parse_command_line(int argc,
char* argv[]) { app.formatter(std::make_shared()); app.add_option("-s,--server", cfg.server_address, - "Address of the Cargo server (can also be\n" - "provided via the CCP_SERVER environment\n" - "variable)") + "Address of the Cargo server.") ->option_text("ADDRESS") ->envname("CCP_SERVER") ->required(); - app.add_option("-i,--input", cfg.inputs, "Input dataset(s)") + app.add_option("-i,--input", cfg.inputs, "Input dataset(s).") ->option_text("SRC...") ->required(); - app.add_option("-o,--output", cfg.outputs, "Output dataset(s)") + app.add_option("-o,--output", cfg.outputs, "Output dataset(s).") ->option_text("DST...") ->required(); - app.add_flag("-p,--progress", cfg.show_progress, - "Show transfer progress in a line."); - - std::string if_help = - "Flags for input datasets. Accepted values:\n" - " - posix: Read data using POSIX I/O (default)\n" - " - parallel: Read data using MPI-IO\n" - " - gekkofs: Read data using the GekkoFS user library\n" - " - hercules: Read data using the Hercules user library\n" - " - expand: Read data using the ExPaND user library\n" - " - dataclay: Read data using the dataClay user library\n" - " - none: No-op, useful for benchmarking"; + app.add_flag("-p,--progress", cfg.show_progress, "Show transfer progress bar."); + app.add_flag("--dry-run", cfg.dry_run, "Plan the transfer and report stats without executing."); + + std::string if_help = "Input dataset type. Accepted values:\n" + " posix, parallel, gekkofs, hercules, expand, dataclay, none"; app.add_option("--if", cfg.input_flags, if_help) - ->option_text("FLAGS") + ->option_text("TYPE") ->transform(CLI::CheckedTransformer(dataset_flags_map, CLI::ignore_case)); - std::string of_help = - "Flags for output datasets. 
Accepted values:\n" - " - posix: Write data using POSIX I/O (default)\n" - " - parallel: Write data using MPI-IO\n" - " - gekkofs: Write data using the GekkoFS user library\n" - " - hercules: Write data using the Hercules user library\n" - " - expand: Write data using the ExPaND user library\n" - " - dataclay: Write data using the dataClay user library\n" - " - none: No-op, useful for benchmarking"; + std::string of_help = "Output dataset type. Accepted values:\n" + " posix, parallel, gekkofs, hercules, expand, dataclay, none"; app.add_option("--of", cfg.output_flags, of_help) - ->option_text("FLAGS") + ->option_text("TYPE") ->transform(CLI::CheckedTransformer(dataset_flags_map, CLI::ignore_case)); @@ -141,38 +159,49 @@ main(int argc, char* argv[]) { std::transform(cfg.inputs.cbegin(), cfg.inputs.cend(), std::back_inserter(inputs), [&](const auto& src) { - return cargo::dataset{src, cfg.input_flags}; + return cargo::dataset{ + src, cfg.input_flags}; }); std::transform(cfg.outputs.cbegin(), cfg.outputs.cend(), std::back_inserter(outputs), [&cfg](const auto& tgt) { - return cargo::dataset{tgt, cfg.output_flags}; + return cargo::dataset{ + tgt, cfg.output_flags}; }); + + if (cfg.dry_run) { + const auto [file_count, total_size] = cargo::plan_transfer_datasets(server, inputs, outputs); + fmt::print("Dry Run Plan:\n"); + fmt::print(" - Files to transfer: {}\n", file_count); + fmt::print(" - Total data size: {}\n", format_bytes(total_size)); + return EXIT_SUCCESS; + } const auto tx = cargo::transfer_datasets(server, inputs, outputs); fmt::print("Started transfer with ID: {}\n", tx.id()); cargo::transfer_status st = tx.status(); - if(cfg.show_progress) { + if (cfg.show_progress) { + st = tx.status(); while(!st.done() && !st.failed()) { - // \r moves the cursor to the beginning of the line, \33[2K - // clears the line. 
- fmt::print(stderr, "\r\33[2K - Status: {}", st); - fflush(stderr); // Force the line to be printed + display_progress(st); std::this_thread::sleep_for(std::chrono::milliseconds(200)); st = tx.status(); } - // Print the final status on a new line - fmt::print(stderr, "\r\33[2K - Status: {}\n", st); + fmt::print(stderr, "\r\33[2K"); // Clear the progress bar line } else { - st = tx.wait(); // Block until completion without printing progress + st = tx.wait(); } - if(st.failed()) { - throw std::runtime_error(st.error().message()); + fmt::print(stderr, "Transfer failed: {}\n", st.error().message()); + return EXIT_FAILURE; + } else { + fmt::print("Transfer completed successfully in {:.2f}s.\n", + std::chrono::duration_cast>(st.elapsed_time()).count()); } + } catch(const std::exception& ex) { fmt::print(stderr, "{}: Error: {}\n", cfg.progname, ex.what()); return EXIT_FAILURE; diff --git a/include/cargo/cargo.hpp b/include/cargo/cargo.hpp index 1e4e016..4d5ac7c 100644 --- a/include/cargo/cargo.hpp +++ b/include/cargo/cargo.hpp @@ -224,6 +224,18 @@ transfer transfer_datasets(const server& srv, const std::vector& sources, const std::vector& targets); +/** + * Plan the transfer of a dataset collection without executing it. + * + * @param srv The Cargo server that should plan the transfer. + * @param sources The input datasets that would be transferred. + * @param targets The output datasets that would be generated. + * @return A tuple containing the number of files and the total size in bytes. + */ +std::tuple +plan_transfer_datasets(const server& srv, const std::vector& sources, + const std::vector& targets); + /** * Request the transfer of a single dataset. * This function is a convenience wrapper around the previous one. 
diff --git a/lib/libcargo.cpp b/lib/libcargo.cpp
index be38e71..3698b58 100644
--- a/lib/libcargo.cpp
+++ b/lib/libcargo.cpp
@@ -249,6 +249,47 @@ transfer::wait_for(const std::chrono::nanoseconds& timeout) const {
     return s;
 }
 
+// Issue the "plan_transfer_datasets" RPC and return the value carried by the
+// server's plan_response. Throws std::runtime_error when the dataset counts
+// differ, when lookup or call yields no value, or when the response has an error.
+std::tuple
+plan_transfer_datasets(const server& srv, const std::vector& sources,
+                       const std::vector& targets) {
+
+    using proto::plan_response;
+
+    if(sources.size() != targets.size()) {
+        throw std::runtime_error(
+                "The number of input datasets does not match the number of "
+                "output datasets");
+    }
+
+    network::client rpc_client{srv.protocol()};
+    const auto rpc = network::rpc_info::create("plan_transfer_datasets", srv.address());
+
+    // Resolve the server endpoint first; nothing can be sent without it.
+    const auto lookup_rv = rpc_client.lookup(srv.address());
+    if(!lookup_rv.has_value()) {
+        throw std::runtime_error("rpc lookup failed");
+    }
+    const auto& endp = lookup_rv.value();
+
+    // A call that yields no value is reported on its own, so that it is not
+    // mistaken for a lookup failure.
+    const auto call_rv = endp.call(rpc.name(), sources, targets);
+    if(!call_rv.has_value()) {
+        throw std::runtime_error("rpc call failed");
+    }
+
+    const plan_response resp{call_rv.value()};
+
+    if(resp.error_code()) {
+        throw std::runtime_error(
+                fmt::format("rpc call failed: {}", resp.error_code()));
+    }
+    return resp.value();
+}
+
 transfer
 transfer_dataset(const server& srv, const dataset& source,
                  const dataset& target) {
diff --git a/spack/packages/cargo/package.py b/spack/packages/cargo/package.py
index 5eeaf04..88aed18 100644
--- a/spack/packages/cargo/package.py
+++ b/spack/packages/cargo/package.py
@@ -27,7 +27,7 @@ class Cargo(CMakePackage):
     """A parallel data stager for malleable applications."""
 
     homepage = "https://storage.bsc.es/gitlab/hpc/cargo"
-    url = "https://storage.bsc.es/gitlab/hpc/cargo/-/archive/v0.3.6/cargo-v0.3.6.tar.bz2"
+    url = "https://storage.bsc.es/gitlab/hpc/cargo/-/archive/v0.3.7/cargo-v0.3.7.tar.bz2"
     git = "https://storage.bsc.es/gitlab/hpc/cargo.git"
 
     maintainers("alberto-miranda")
@@ -42,6 +42,7 @@ class Cargo(CMakePackage):
     version("0.3.4", sha256="42b740fb7e82c49d73dfb6caf7549876f72913afb75996c6558e956ea63de3da", deprecated=True)
version("0.3.5", sha256="5c2e998aa96b15bdf513e8c2fce5f20859cf9a6a51882c59b80d5d801a10edd8", deprecated=True) version("0.3.6", sha256="dc18c96befc3700ea20758a3c44e779060ef0102344040a2da4d75b45f2e3d37", deprecated=True) + version("0.3.7", sha256="b2f4a91c341acf4bcf42c24dff5b4929d26b7c2c4ead0f2414ca4347d1b8fab6", deprecated=True) # build variants variant('build_type', default='Release', diff --git a/src/master.cpp b/src/master.cpp index 490307a..a81b9d8 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -121,16 +121,18 @@ master_server::master_server(std::string name, std::string address, REGEX_file = std::move(regex_file); if(!REGEX_file.empty()) { std::ifstream file(REGEX_file); - if (file.is_open()) { + if(file.is_open()) { std::string pattern_str; std::getline(file, pattern_str); file.close(); try { m_filename_pattern.assign(pattern_str); m_is_filtering_enabled = true; - LOGGER_INFO("Using pattern str '{}' for regex from file '{}'", pattern_str, REGEX_file); - } catch (const std::regex_error& e) { - LOGGER_ERROR("Invalid regex pattern '{}' from file {}: {}", pattern_str, REGEX_file, e.what()); + LOGGER_INFO("Using pattern str '{}' for regex from file '{}'", + pattern_str, REGEX_file); + } catch(const std::regex_error& e) { + LOGGER_ERROR("Invalid regex pattern '{}' from file {}: {}", + pattern_str, REGEX_file, e.what()); m_is_filtering_enabled = false; } } else { @@ -143,6 +145,7 @@ master_server::master_server(std::string name, std::string address, provider::define(EXPAND(ping)); provider::define(EXPAND(shutdown)); provider::define(EXPAND(transfer_datasets)); + provider::define(EXPAND(plan_transfer_datasets)); provider::define(EXPAND(transfer_status)); provider::define(EXPAND(bw_control)); provider::define(EXPAND(transfer_statuses)); @@ -152,8 +155,9 @@ master_server::master_server(std::string name, std::string address, m_network_engine.push_prefinalize_callback([this]() { m_shutting_down = true; - m_ftio_cv.notify_all(); // Wake up FTIO scheduler to exit - 
m_ftio_completion_cv.notify_all(); // Wake up FTIO scheduler if it is waiting for a transfer + m_ftio_cv.notify_all(); // Wake up FTIO scheduler to exit + m_ftio_completion_cv.notify_all(); // Wake up FTIO scheduler if it is + // waiting for a transfer m_mpi_listener_ult->join(); m_mpi_listener_ult = thallium::managed{}; m_mpi_listener_ess->join(); @@ -173,7 +177,7 @@ master_server::mpi_listener_ult() { mpi::communicator world; while(!m_shutting_down) { - if (auto optional_msg = world.iprobe()) { + if(auto optional_msg = world.iprobe()) { auto& msg = *optional_msg; switch(static_cast(msg.tag())) { case tag::status: { @@ -182,20 +186,27 @@ master_server::mpi_listener_ult() { LOGGER_DEBUG("msg => from: {} body: {{payload: {}}}", msg.source(), m); - m_request_manager.update(m.tid(), m.seqno(), msg.source() - 1, - m.name(), m.state(), m.bw(), m.bytes_transferred(), - m.error_code()); + m_request_manager.update( + m.tid(), m.seqno(), msg.source() - 1, m.name(), + m.state(), m.bw(), m.bytes_transferred(), + m.error_code()); - if (m_ftio.load() && m.tid() == m_ftio_tid.load()) { + if(m_ftio.load() && m.tid() == m_ftio_tid.load()) { m_request_manager.lookup(m.tid()).map([&](auto&& rs) { - if (rs.state() == transfer_state::completed || rs.state() == transfer_state::failed) { + if(rs.state() == transfer_state::completed || + rs.state() == transfer_state::failed) { { - abt::unique_lock lock(m_ftio_completion_mutex); + abt::unique_lock lock( + m_ftio_completion_mutex); m_ftio_transfer_completed = true; } m_ftio_completion_cv.notify_one(); - if (rs.state() == transfer_state::failed) { - LOGGER_ERROR("FTIO transfer {} failed with error: {}", m.tid(), rs.error().value_or(error_code::other)); + if(rs.state() == transfer_state::failed) { + LOGGER_ERROR( + "FTIO transfer {} failed with error: {}", + m.tid(), + rs.error().value_or( + error_code::other)); } } }); @@ -206,16 +217,17 @@ master_server::mpi_listener_ult() { default: LOGGER_WARN("msg => from: {} body: {{Unexpected tag: 
{}}}", msg.source(), msg.tag()); - if (auto count = msg.count()) { + if(auto count = msg.count()) { std::vector discard_buffer(*count); - world.recv(msg.source(), msg.tag(), discard_buffer.data(), *count); + world.recv(msg.source(), msg.tag(), + discard_buffer.data(), *count); } else { world.recv(msg.source(), msg.tag()); } break; } } else { - std::this_thread::sleep_for(1ms); + std::this_thread::sleep_for(1ms); } } @@ -231,64 +243,92 @@ master_server::mpi_listener_ult() { void master_server::ftio_scheduling_ult() { set_current_thread_name("ftio_scheduler"); - while (!m_shutting_down) { + while(!m_shutting_down) { abt::unique_lock lock(m_ftio_mutex); - if (m_period > 0) { - LOGGER_INFO("FTIO scheduler waiting for period of {} seconds.", m_period); - m_ftio_cv.wait_for(lock, std::chrono::duration(m_period), [this] { return m_shutting_down.load(); }); + if(m_period > 0) { + LOGGER_INFO("FTIO scheduler waiting for period of {} seconds.", + m_period); + m_ftio_cv.wait_for(lock, std::chrono::duration(m_period), + [this] { return m_shutting_down.load(); }); } else { LOGGER_INFO("FTIO scheduler waiting for run trigger."); - m_ftio_cv.wait(lock, [this] { return m_ftio_run.load() || m_shutting_down.load(); }); + m_ftio_cv.wait(lock, [this] { + return m_ftio_run.load() || m_shutting_down.load(); + }); } - if (m_shutting_down) break; + if(m_shutting_down) + break; - if (!m_pending_transfer.m_work) { + if(!m_pending_transfer.m_work) { m_ftio_run = false; // Consume trigger if there's no work continue; } - LOGGER_INFO("FTIO triggered. Processing pending transfer {}", m_pending_transfer.m_p.tid()); - + LOGGER_INFO("FTIO triggered. 
Processing pending transfer {}", + m_pending_transfer.m_p.tid()); + time_t now = time(nullptr); - auto expanded = expand_transfer_requests(m_pending_transfer.m_sources, m_pending_transfer.m_targets, now); + auto expanded = + expand_transfer_requests(m_pending_transfer.m_sources, + m_pending_transfer.m_targets, now); - if(!expanded.sources.empty()){ + if(!expanded.sources.empty()) { m_pending_transfer.m_expanded_sources = std::move(expanded.sources); m_pending_transfer.m_expanded_targets = std::move(expanded.targets); - - auto ec = m_request_manager.update(m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size(), expanded.sizes); + + auto ec = m_request_manager.update( + m_pending_transfer.m_p.tid(), + m_pending_transfer.m_expanded_sources.size(), + expanded.sizes); if(ec != error_code::success) { - LOGGER_ERROR("Failed to update request for FTIO transfer {}: {}", m_pending_transfer.m_p.tid(), ec); + LOGGER_ERROR( + "Failed to update request for FTIO transfer {}: {}", + m_pending_transfer.m_p.tid(), ec); } else { - assert(m_pending_transfer.m_expanded_sources.size() == m_pending_transfer.m_expanded_targets.size()); + assert(m_pending_transfer.m_expanded_sources.size() == + m_pending_transfer.m_expanded_targets.size()); _dispatch_transfer_to_workers(m_pending_transfer.m_p, expanded); - LOGGER_INFO("FTIO transfer {} dispatched for {} files. Waiting for completion.", m_pending_transfer.m_p.tid(), m_pending_transfer.m_expanded_sources.size()); - + LOGGER_INFO( + "FTIO transfer {} dispatched for {} files. 
Waiting for completion.", + m_pending_transfer.m_p.tid(), + m_pending_transfer.m_expanded_sources.size()); + m_ftio_transfer_completed = false; { abt::unique_lock completion_lock(m_ftio_completion_mutex); - m_ftio_completion_cv.wait(completion_lock, [this]{ return m_ftio_transfer_completed.load() || m_shutting_down.load(); }); + m_ftio_completion_cv.wait(completion_lock, [this] { + return m_ftio_transfer_completed.load() || + m_shutting_down.load(); + }); } - if(m_shutting_down) break; + if(m_shutting_down) + break; - LOGGER_INFO("Transfer finished for {} files.", m_pending_transfer.m_expanded_sources.size()); + LOGGER_INFO("Transfer finished for {} files.", + m_pending_transfer.m_expanded_sources.size()); if(!m_pending_transfer.m_expanded_sources.empty()) { - auto fs = FSPlugin::make_fs(static_cast(m_pending_transfer.m_expanded_sources[0].get_type())); - for(const auto& file : m_pending_transfer.m_expanded_sources) { + auto fs = FSPlugin::make_fs( + static_cast( + m_pending_transfer.m_expanded_sources[0] + .get_type())); + for(const auto& file : + m_pending_transfer.m_expanded_sources) { LOGGER_INFO("Deleting {}", file.path()); fs->unlink(file.path()); } } } } else { - LOGGER_INFO("FTIO triggered, but no new files to transfer for request {}.", m_pending_transfer.m_p.tid()); + LOGGER_INFO( + "FTIO triggered, but no new files to transfer for request {}.", + m_pending_transfer.m_p.tid()); } - - if (m_period <= 0) { + + if(m_period <= 0) { m_ftio_run = false; // Consume the trigger if not periodic. 
} } @@ -347,13 +387,15 @@ master_server::shutdown(const network::request& req) { } void -master_server::_expand_source_target_pair( - const dataset& source, const dataset& target, - time_t mod_time_threshold, expanded_requests& result) { +master_server::_expand_source_target_pair(const dataset& source, + const dataset& target, + time_t mod_time_threshold, + expanded_requests& result) { const auto& source_path = source.path(); - auto fs = FSPlugin::make_fs(static_cast(source.get_type())); + auto fs = FSPlugin::make_fs( + static_cast(source.get_type())); struct stat buf; auto rstat = fs->stat(source_path, &buf); @@ -362,32 +404,38 @@ master_server::_expand_source_target_pair( return; } - if (S_ISDIR(buf.st_mode)) { // It's a directory + if(S_ISDIR(buf.st_mode)) { // It's a directory LOGGER_INFO("Expanding input directory {}", source_path); - std::vector files = fs->readdir(source_path); // Recursive readdir + std::vector files = + fs->readdir(source_path); // Recursive readdir std::sort(files.begin(), files.end()); - + for(const auto& f : files) { - if(m_is_filtering_enabled && !std::regex_match(f, m_filename_pattern)) { + if(m_is_filtering_enabled && + !std::regex_match(f, m_filename_pattern)) { LOGGER_INFO("File {} IGNORED by regex", f); continue; } - + struct stat file_buf; - if (fs->stat(f, &file_buf) == 0) { - if (mod_time_threshold > 0 && file_buf.st_mtime >= mod_time_threshold) { + if(fs->stat(f, &file_buf) == 0) { + if(mod_time_threshold > 0 && + file_buf.st_mtime >= mod_time_threshold) { continue; } result.sources.emplace_back(f, source.get_type()); result.sizes.push_back(file_buf.st_size); - std::filesystem::path relative_path = std::filesystem::relative(f, source_path); - result.targets.emplace_back(std::filesystem::path(target.path()) / relative_path, target.get_type()); + std::filesystem::path relative_path = + std::filesystem::relative(f, source_path); + result.targets.emplace_back( + std::filesystem::path(target.path()) / relative_path, + 
target.get_type()); } } } else { // It's a file - if (mod_time_threshold > 0 && buf.st_mtime >= mod_time_threshold) { - return; - } + if(mod_time_threshold > 0 && buf.st_mtime >= mod_time_threshold) { + return; + } result.sources.push_back(source); result.targets.push_back(target); result.sizes.push_back(buf.st_size); @@ -396,46 +444,69 @@ master_server::_expand_source_target_pair( master_server::expanded_requests -master_server::expand_transfer_requests( - const std::vector& sources, - const std::vector& targets, - time_t modification_time_threshold) { +master_server::expand_transfer_requests(const std::vector& sources, + const std::vector& targets, + time_t modification_time_threshold) { expanded_requests result; for(auto i = 0u; i < sources.size(); ++i) { - _expand_source_target_pair(sources[i], targets[i], modification_time_threshold, result); + _expand_source_target_pair(sources[i], targets[i], + modification_time_threshold, result); } return result; } void -master_server::_dispatch_transfer_to_workers(const parallel_request& r, - const expanded_requests& expanded) { +master_server::_dispatch_transfer_to_workers( + const parallel_request& r, const expanded_requests& expanded) { mpi::communicator world; if(expanded.sources.empty()) { return; } - + // Create parent directories for parallel write targets for(const auto& d_item : expanded.targets) { - if(d_item.supports_parallel_transfer() && !std::filesystem::path(d_item.path()).parent_path().empty()) { + if(d_item.supports_parallel_transfer() && + !std::filesystem::path(d_item.path()).parent_path().empty()) { std::error_code fs_err; - std::filesystem::create_directories(std::filesystem::path(d_item.path()).parent_path(), fs_err); - if (fs_err) LOGGER_WARN("Could not create directory {}: {}", d_item.path(), fs_err.message()); + std::filesystem::create_directories( + std::filesystem::path(d_item.path()).parent_path(), fs_err); + if(fs_err) + LOGGER_WARN("Could not create directory {}: {}", d_item.path(), + 
fs_err.message()); } } // Send messages to workers for(std::size_t rank = 1; rank <= r.nworkers(); ++rank) { auto mutable_sizes = expanded.sizes; - const auto [t, m] = make_message(r.tid(), 0, expanded.sources, expanded.targets, mutable_sizes); + const auto [t, m] = make_message(r.tid(), 0, expanded.sources, + expanded.targets, mutable_sizes); LOGGER_INFO("msg <= to: {} body: {}", rank, m); world.send(static_cast(rank), t, m); } } +void +master_server::plan_transfer_datasets(const network::request& req, + const std::vector& sources, + const std::vector& targets) { + using network::get_address; + using network::rpc_info; + using proto::plan_response; + + const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); + LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); + + auto expanded = expand_transfer_requests(sources, targets); + std::size_t file_count = expanded.sources.size(); + std::size_t total_size = std::accumulate(expanded.sizes.begin(), expanded.sizes.end(), 0ULL); + + req.respond(plan_response{rpc.id(), error_code::success, std::make_tuple(file_count, total_size)}); +} + void master_server::transfer_datasets(const network::request& req, const std::vector& sources, @@ -445,84 +516,102 @@ master_server::transfer_datasets(const network::request& req, using proto::response_with_id; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, targets); + LOGGER_INFO("rpc {:>} body: {{sources: {}, targets: {}}}", rpc, sources, + targets); mpi::communicator world; m_request_manager.create(world.size() - 1) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to create request: {}", ec); - req.respond(proto::generic_response{rpc.id(), ec}); - }) - .map([&](auto&& r) { - req.respond(response_with_id{rpc.id(), error_code::success, r.tid()}); - - // Asynchronously expand and dispatch - m_network_engine.get_handler_pool().make_thread( - [this, r, s = 
sources, t = targets]() { - auto expanded = expand_transfer_requests(s, t); - if (!expanded.sources.empty()) { - auto ec = m_request_manager.update(r.tid(), expanded.sources.size(), expanded.sizes); - if (ec == error_code::success) { - _dispatch_transfer_to_workers(r, expanded); - } else { - LOGGER_ERROR("Failed to update request {}: {}", r.tid(), ec); - } - } else { - LOGGER_INFO("No files to transfer for request {}", r.tid()); - } - } - ); - }); + .or_else([&](auto&& ec) { + LOGGER_ERROR("Failed to create request: {}", ec); + req.respond(proto::generic_response{rpc.id(), ec}); + }) + .map([&](auto&& r) { + req.respond(response_with_id{rpc.id(), error_code::success, + r.tid()}); + + // Asynchronously expand and dispatch + m_network_engine.get_handler_pool().make_thread( + [this, r, s = sources, t = targets]() { + auto expanded = expand_transfer_requests(s, t); + if(!expanded.sources.empty()) { + auto ec = m_request_manager.update( + r.tid(), expanded.sources.size(), + expanded.sizes); + if(ec == error_code::success) { + _dispatch_transfer_to_workers(r, expanded); + } else { + LOGGER_ERROR( + "Failed to update request {}: {}", + r.tid(), ec); + } + } else { + LOGGER_INFO( + "No files to transfer for request {}", + r.tid()); + } + }); + }); } void master_server::transfer_status(const network::request& req, std::uint64_t tid) { using network::get_address; using network::rpc_info; - using proto::generic_response; using proto::full_status_response; - using response_type = full_status_response; + using proto::generic_response; + using response_type = + full_status_response; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); LOGGER_DEBUG("rpc {:>} body: {{tid: {}}}", rpc, tid); m_request_manager.lookup(tid) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to lookup request: {}", ec); - req.respond(generic_response{rpc.id(), ec}); - }) - .map([&](auto&& rs) { - LOGGER_INFO("rpc {:<} body: {{retval: {}, status: {}}}", rpc, error_code::success, rs); - 
req.respond(response_type{ - rpc.id(), error_code::success, - std::make_tuple(rs.state(), rs.bw(), rs.bytes_transferred(), rs.total_bytes(), rs.elapsed_time(), rs.error())}); - }); + .or_else([&](auto&& ec) { + LOGGER_ERROR("Failed to lookup request: {}", ec); + req.respond(generic_response{rpc.id(), ec}); + }) + .map([&](auto&& rs) { + LOGGER_INFO("rpc {:<} body: {{retval: {}, status: {}}}", rpc, + error_code::success, rs); + req.respond(response_type{ + rpc.id(), error_code::success, + std::make_tuple(rs.state(), rs.bw(), + rs.bytes_transferred(), + rs.total_bytes(), rs.elapsed_time(), + rs.error())}); + }); } void -master_server::transfer_statuses(const network::request& req, std::uint64_t tid) { +master_server::transfer_statuses(const network::request& req, + std::uint64_t tid) { using network::get_address; using network::rpc_info; using proto::generic_response; using proto::statuses_response; - using response_type = statuses_response; + using response_type = statuses_response; const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); LOGGER_INFO("rpc {:>} body: {{tid: {}}}", rpc, tid); m_request_manager.lookup_all(tid) - .or_else([&](auto&& ec) { - LOGGER_ERROR("Failed to lookup request: {}", ec); - req.respond(generic_response{rpc.id(), ec}); - }) - .map([&](auto&& rs) { - std::vector>> v{}; - v.reserve(rs.size()); - for(auto& r : rs) { - v.emplace_back(r.name(), r.state(), r.bw(), r.error()); - } - req.respond(response_type{rpc.id(), error_code::success, v}); - }); + .or_else([&](auto&& ec) { + LOGGER_ERROR("Failed to lookup request: {}", ec); + req.respond(generic_response{rpc.id(), ec}); + }) + .map([&](auto&& rs) { + std::vector>> + v{}; + v.reserve(rs.size()); + for(auto& r : rs) { + v.emplace_back(r.name(), r.state(), r.bw(), r.error()); + } + req.respond(response_type{rpc.id(), error_code::success, v}); + }); } void @@ -531,16 +620,20 @@ master_server::ftio_int(const network::request& req, float conf, float prob, using network::get_address; 
using network::rpc_info; using proto::generic_response; - + const auto rpc = rpc_info::create(RPC_NAME(), get_address(req)); - LOGGER_INFO("rpc {:>} body: {{confidence: {}, probability: {}, period: {}, run: {}, pause: {}, resume: {}}}", rpc, conf, prob, period, run, pause, resume); + LOGGER_INFO( + "rpc {:>} body: {{confidence: {}, probability: {}, period: {}, run: {}, pause: {}, resume: {}}}", + rpc, conf, prob, period, run, pause, resume); if(pause || resume) { mpi::communicator world; for(int rank = 1; rank < world.size(); ++rank) { - const auto m = cargo::shaper_message{m_ftio_tid.load(), static_cast(pause ? 10 : -1)}; + const auto m = cargo::shaper_message{ + m_ftio_tid.load(), static_cast(pause ? 10 : -1)}; LOGGER_INFO("msg <= to: {} body: {}", rank, m); - world.send(static_cast(rank), static_cast(tag::bw_shaping), m); + world.send(static_cast(rank), + static_cast(tag::bw_shaping), m); } } else { abt::unique_lock lock(m_ftio_mutex); @@ -548,7 +641,7 @@ master_server::ftio_int(const network::request& req, float conf, float prob, m_probability = prob; m_period = period; m_ftio = true; - if (run || period > 0) { + if(run || period > 0) { m_ftio_run = true; m_ftio_cv.notify_one(); } diff --git a/src/master.hpp b/src/master.hpp index 1c8f0a6..6c2444f 100644 --- a/src/master.hpp +++ b/src/master.hpp @@ -55,7 +55,8 @@ class master_server : public network::server, public network::provider { public: master_server(std::string name, std::string address, bool daemonize, - std::filesystem::path rundir, std::uint64_t block_size, std::string regex_file, + std::filesystem::path rundir, std::uint64_t block_size, + std::string regex_file, std::optional pidfile = {}); ~master_server(); @@ -78,6 +79,11 @@ private: const std::vector& sources, const std::vector& targets); + void + plan_transfer_datasets(const network::request& req, + const std::vector& sources, + const std::vector& targets); + void transfer_status(const network::request& req, std::uint64_t tid); diff --git 
a/src/proto/rpc/response.hpp b/src/proto/rpc/response.hpp index 39092d2..09f0f9e 100644 --- a/src/proto/rpc/response.hpp +++ b/src/proto/rpc/response.hpp @@ -119,6 +119,10 @@ template using statuses_response = response_with_value< std::vector>>, Error>; +template +using plan_response = response_with_value, Error>; + + } // namespace cargo::proto #endif // CARGO_PROTO_RPC_RESPONSE_HPP \ No newline at end of file -- GitLab