From 75b3ca895c5b8fc428a7d03a86538dcba4285f59 Mon Sep 17 00:00:00 2001 From: Alberto Miranda Date: Thu, 5 Oct 2023 16:07:46 +0200 Subject: [PATCH] Fix race condition when shutting down --- src/master.cpp | 12 ++++++++++-- src/worker/worker.cpp | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/master.cpp b/src/master.cpp index 18953e1..4b106eb 100644 --- a/src/master.cpp +++ b/src/master.cpp @@ -93,7 +93,7 @@ master_server::master_server(std::string name, std::string address, // The push_prefinalize_callback() and push_finalize_callback() functions // serve this purpose. The former is called before Mercury is finalized, // while the latter is called in between that and Argobots finalization. - m_network_engine.push_finalize_callback([this]() { + m_network_engine.push_prefinalize_callback([this]() { m_mpi_listener_ult->join(); m_mpi_listener_ult = thallium::managed<thallium::thread>{}; m_mpi_listener_ess->join(); @@ -136,11 +136,18 @@ master_server::mpi_listener_ult() { } } + LOGGER_INFO("Shutting down. Notifying workers..."); + // shutting down, notify all workers for(int rank = 1; rank < world.size(); ++rank) { LOGGER_INFO("msg <= to: {} body: {{shutdown}}", rank); world.send(static_cast<int>(rank), static_cast<int>(tag::shutdown)); } + + LOGGER_INFO("Entering exit barrier..."); + world.barrier(); + + LOGGER_INFO("Exit"); } #define RPC_NAME() (__FUNCTION__) @@ -162,7 +169,8 @@ master_server::ping(const network::request& req) { req.respond(resp); } -void master_server::shutdown(const network::request& req) { +void +master_server::shutdown(const network::request& req) { using network::get_address; using network::rpc_info; using proto::generic_response; diff --git a/src/worker/worker.cpp b/src/worker/worker.cpp index 167891f..484f3b0 100644 --- a/src/worker/worker.cpp +++ b/src/worker/worker.cpp @@ -152,6 +152,10 @@ worker::run() { } } + LOGGER_INFO("Entering exit barrier..."); + world.barrier(); + LOGGER_INFO("Exit"); + return 0; } -- GitLab