From dc9aef7940e72532208b00e786e803db0e6d3838 Mon Sep 17 00:00:00 2001 From: rnou Date: Tue, 13 Sep 2022 09:55:15 +0200 Subject: [PATCH 01/20] deploy_adhoc_storage adaptation --- src/common/net/proto/rpc_types.h | 7 +++++-- src/lib/admire.cpp | 25 ++----------------------- src/lib/admire.hpp | 2 +- src/lib/c_wrapper.cpp | 5 ++++- src/scord/rpc_handlers.cpp | 4 ++-- 5 files changed, 14 insertions(+), 29 deletions(-) diff --git a/src/common/net/proto/rpc_types.h b/src/common/net/proto/rpc_types.h index 6eaf0a8e..f0ac288b 100644 --- a/src/common/net/proto/rpc_types.h +++ b/src/common/net/proto/rpc_types.h @@ -174,6 +174,9 @@ typedef struct adm_pfs_context { const char* c_mount; } adm_pfs_context; +hg_return_t +hg_proc_ADM_pfs_context_t(hg_proc_t proc, void* data); + // clang-format off MERCURY_GEN_STRUCT_PROC( adm_pfs_context, // NOLINT @@ -371,9 +374,9 @@ MERCURY_GEN_PROC( ); /// ADM_deploy_adhoc_storage -MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_in_t, ((int32_t) (reqs))) +MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_in_t, ((ADM_storage_t) (reqs))) -MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_out_t, ((int32_t) (ret))) +MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_out_t, ((int32_t) (retval))) /// ADM_register_pfs_storage MERCURY_GEN_PROC(ADM_register_pfs_storage_in_t, ((int32_t) (reqs))) diff --git a/src/lib/admire.cpp b/src/lib/admire.cpp index 98bc455a..b91e5e79 100644 --- a/src/lib/admire.cpp +++ b/src/lib/admire.cpp @@ -284,29 +284,8 @@ remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage) { } ADM_return_t -deploy_adhoc_storage(const server& srv, ADM_storage_t adhoc_storage) { - - (void) srv; - (void) adhoc_storage; - - scord::network::rpc_client rpc_client{srv.protocol(), rpc_registration_cb}; - - auto endp = rpc_client.lookup(srv.address()); - - LOGGER_INFO("ADM_deploy_adhoc_storage(...)"); - - ADM_deploy_adhoc_storage_in_t in{}; - ADM_deploy_adhoc_storage_out_t out; - - const auto rpc = endp.call("ADM_deploy_adhoc_storage", &in, &out); - - if(out.ret < 0) { - LOGGER_ERROR("ADM_deploy_adhoc_storage() = {}", out.ret); - return static_cast(out.ret); - } - - LOGGER_INFO("ADM_deploy_adhoc_storage() = {}", ADM_SUCCESS); - return ADM_SUCCESS; +deploy_adhoc_storage(const server& srv, const admire::adhoc_storage& adhoc_storage) { + return detail::deploy_adhoc_storage(srv, adhoc_storage); } ADM_return_t diff --git a/src/lib/admire.hpp b/src/lib/admire.hpp index 4e690b18..aa78cfa2 100644 --- a/src/lib/admire.hpp +++ b/src/lib/admire.hpp @@ -70,7 +70,7 @@ void remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); ADM_return_t -deploy_adhoc_storage(const server& srv, ADM_storage_t adhoc_storage); +deploy_adhoc_storage(const server& srv, const adhoc_storage& reqs); ADM_return_t register_pfs_storage(const server& srv, ADM_pfs_context_t ctx, diff --git a/src/lib/c_wrapper.cpp b/src/lib/c_wrapper.cpp index 5cfd5eca..7b20e616 100644 --- a/src/lib/c_wrapper.cpp +++ b/src/lib/c_wrapper.cpp @@ -124,8 +124,11 @@ ADM_return_t ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { const admire::server srv{server}; + auto storage = admire::adhoc_storage{ + static_cast(adhoc_storage->s_type), + adhoc_storage->s_id, adhoc_storage->s_adhoc_ctx}; - return admire::deploy_adhoc_storage(srv, adhoc_storage); + return admire::deploy_adhoc_storage(srv, storage); } ADM_return_t diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 9ac3c96b..fe2b4642 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -439,11 +439,11 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { ret = margo_get_input(h, &in); assert(ret == HG_SUCCESS); - out.ret = -1; + out.retval = -1; LOGGER_INFO("ADM_deploy_adhoc_storage()"); - out.ret = 0; + out.retval = 0; ret = margo_respond(h, &out); assert(ret == HG_SUCCESS); -- GitLab From 03a981c5e621e9490fabc4bd7801d957620bc6f0 Mon Sep 17 00:00:00 2001 From: rnou Date: Tue, 13 Sep 2022 10:10:45 +0200 Subject: [PATCH 02/20] Updated deploy test --- examples/cxx/ADM_deploy_adhoc_storage.cpp | 6 +++++- src/common/net/proto/rpc_types.h | 2 +- src/scord/rpc_handlers.cpp | 16 +++++++++++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index c23cffd3..32f23afc 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -38,7 +38,11 @@ main(int argc, char* argv[]) { admire::server server{"tcp", argv[1]}; - ADM_storage_t adhoc_storage{}; + admire::adhoc_storage adhoc_storage( + admire::storage::type::gekkofs, "foobar", + admire::adhoc_storage::execution_mode::separate_new, + admire::adhoc_storage::access_type::read_write, 42, 100, false); + ADM_return_t ret = ADM_SUCCESS; try { diff --git a/src/common/net/proto/rpc_types.h b/src/common/net/proto/rpc_types.h index f0ac288b..e5551fe0 100644 --- a/src/common/net/proto/rpc_types.h +++ b/src/common/net/proto/rpc_types.h @@ -374,7 +374,7 @@ MERCURY_GEN_PROC( ); /// ADM_deploy_adhoc_storage -MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_in_t, ((ADM_storage_t) (reqs))) +MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_in_t, ((ADM_storage_t) (adhoc_storage))) MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_out_t, ((int32_t) (retval))) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index fe2b4642..f54f7785 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -437,14 +437,28 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { [[maybe_unused]] margo_instance_id mid = margo_hg_handle_get_instance(h); ret = margo_get_input(h, &in); + + auto adhoc_storage = in.adhoc_storage; + assert(ret == HG_SUCCESS); out.retval = -1; LOGGER_INFO("ADM_deploy_adhoc_storage()"); - out.retval = 0; + /* Look inside adhoc_storage and launch gkfs script */ + + if (adhoc_storage->s_type == ADM_STORAGE_GEKKOFS) + { + /* Extract Job ID -> SLURM_JOB_ID */ + /* Extract paths */ + + /* Launch script */ + + out.retval = 0; + } + ret = margo_respond(h, &out); assert(ret == HG_SUCCESS); -- GitLab From 5633e72b1d7e35194f6f93eb6ff25fcb589b3aee Mon Sep 17 00:00:00 2001 From: rnou Date: Tue, 13 Sep 2022 12:07:20 +0200 Subject: [PATCH 03/20] Execute external script --- src/scord/rpc_handlers.cpp | 42 +++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index f54f7785..984ba1d1 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -31,6 +31,9 @@ #include "job_manager.hpp" #include "adhoc_storage_manager.hpp" +// Process running +#include + struct remote_procedure { static std::uint64_t new_id() { @@ -448,17 +451,46 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { /* Look inside adhoc_storage and launch gkfs script */ - if (adhoc_storage->s_type == ADM_STORAGE_GEKKOFS) - { + if (adhoc_storage->s_type == ADM_STORAGE_GEKKOFS) { /* Extract Job ID -> SLURM_JOB_ID */ + const std::string job_id = "SLURM_JOB_ID=42"; /* Extract paths */ + const std::string mountpoint = "-m /tmp/mnt"; + const std::string rootdir = "-r /tmp/root"; /* Launch script */ - - out.retval = 0; + pid_t pid = fork(); + switch (pid) { + case 0: { + std::vector args; + args.push_back("/usr/bin/echo"); + args.push_back(mountpoint.c_str()); + args.push_back(rootdir.c_str()); + args.push_back(NULL); + std::vector env; + env.push_back(job_id.c_str()); + env.push_back(NULL); + + execve("/usr/bin/echo", const_cast(args.data()), + const_cast(env.data())); + LOGGER_INFO("ADM_deploy_adhoc_storage() script didn't execute"); + exit(0); + break; + } + case -1: { + out.retval = -1; + LOGGER_ERROR("ADM_deploy_adhoc_storage() didn't fork"); + break; + } + default: { + LOGGER_INFO("ADM_deploy_adhoc_storage() executed"); + out.retval = 0; + break; + } + } } - + ret = margo_respond(h, &out); assert(ret == HG_SUCCESS); -- GitLab From 683ddad6403305b2a9c973d5bde2645450a7fb95 Mon Sep 17 00:00:00 2001 From: rnou Date: Thu, 15 Sep 2022 08:36:45 +0200 Subject: [PATCH 04/20] Adding number of nodes from context --- src/scord/rpc_handlers.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 984ba1d1..67a6eb08 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -458,21 +458,24 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { /* Extract paths */ const std::string mountpoint = "-m /tmp/mnt"; const std::string rootdir = "-r /tmp/root"; + /* Number of nodes */ + const std::string nodes = "-n "+std::to_string(adhoc_storage->s_adhoc_ctx->c_nodes); /* Launch script */ pid_t pid = fork(); switch (pid) { case 0: { std::vector args; - args.push_back("/usr/bin/echo"); + args.push_back("gkfs"); args.push_back(mountpoint.c_str()); args.push_back(rootdir.c_str()); + args.push_back(nodes.c_str()); args.push_back(NULL); std::vector env; env.push_back(job_id.c_str()); env.push_back(NULL); - execve("/usr/bin/echo", const_cast(args.data()), + execve("gkfs", const_cast(args.data()), const_cast(env.data())); LOGGER_INFO("ADM_deploy_adhoc_storage() script didn't execute"); exit(0); -- GitLab From c862b31e763b88563a9961597f59c320c84cd140 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 14 Oct 2022 12:49:24 +0200 Subject: [PATCH 05/20] API changes --- examples/cxx/ADM_deploy_adhoc_storage.cpp | 7 +++++-- src/lib/c_wrapper.cpp | 1 + src/scord/rpc_handlers.cpp | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index 32f23afc..e694ae98 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -37,11 +37,14 @@ main(int argc, char* argv[]) { } admire::server server{"tcp", argv[1]}; + std::vector nodes{"node1","node2"}; + admire::adhoc_storage::resources res(nodes); admire::adhoc_storage adhoc_storage( - admire::storage::type::gekkofs, "foobar", + admire::storage::type::gekkofs, "foobar", 1, admire::adhoc_storage::execution_mode::separate_new, - admire::adhoc_storage::access_type::read_write, 42, 100, false); + admire::adhoc_storage::access_type::read_write, + res, 100, false); ADM_return_t ret = ADM_SUCCESS; diff --git a/src/lib/c_wrapper.cpp b/src/lib/c_wrapper.cpp index 7b20e616..fd40849c 100644 --- a/src/lib/c_wrapper.cpp +++ b/src/lib/c_wrapper.cpp @@ -126,6 +126,7 @@ ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { const admire::server srv{server}; auto storage = admire::adhoc_storage{ static_cast(adhoc_storage->s_type), + adhoc_storage->s_name, adhoc_storage->s_id, adhoc_storage->s_adhoc_ctx}; return admire::deploy_adhoc_storage(srv, storage); diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 67a6eb08..ee5e050e 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -459,7 +459,7 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { const std::string mountpoint = "-m /tmp/mnt"; const std::string rootdir = "-r /tmp/root"; /* Number of nodes */ - const std::string nodes = "-n "+std::to_string(adhoc_storage->s_adhoc_ctx->c_nodes); + const std::string nodes = "-n "+std::to_string(adhoc_storage->s_adhoc_ctx->c_resources->r_nodes->l_length); /* Launch script */ pid_t pid = fork(); -- GitLab From 99cfd9bef1b08f0aacab55f3f3fac66f8f4e9e60 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 14 Oct 2022 13:04:54 +0200 Subject: [PATCH 06/20] resource node in deploy --- examples/cxx/ADM_deploy_adhoc_storage.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index e694ae98..bad78b1f 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -37,7 +37,9 @@ main(int argc, char* argv[]) { } admire::server server{"tcp", argv[1]}; - std::vector nodes{"node1","node2"}; + admire::node n1{"node1"}; + admire::node n2("node2"); + std::vector nodes{n1,n2}; admire::adhoc_storage::resources res(nodes); admire::adhoc_storage adhoc_storage( -- GitLab From 11e2f42fc7f60c49d36889f37581bb02d8f2b0bd Mon Sep 17 00:00:00 2001 From: rnou Date: Tue, 18 Oct 2022 13:29:26 +0200 Subject: [PATCH 07/20] Updated gkfs launch --- src/scord/rpc_handlers.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index ee5e050e..912c909f 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -455,11 +455,13 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { /* Extract Job ID -> SLURM_JOB_ID */ const std::string job_id = "SLURM_JOB_ID=42"; - /* Extract paths */ - const std::string mountpoint = "-m /tmp/mnt"; - const std::string rootdir = "-r /tmp/root"; /* Number of nodes */ - const std::string nodes = "-n "+std::to_string(adhoc_storage->s_adhoc_ctx->c_resources->r_nodes->l_length); + int nnodes = adhoc_storage->s_adhoc_ctx->c_resources->r_nodes->l_length; + const std::string nodes = "-n "+std::to_string(nnodes); + + /* Walltime */ + int twalltime = adhoc_storage->s_adhoc_ctx->c_walltime; + const std::string walltime = std::to_string(twalltime); /* Launch script */ pid_t pid = fork(); @@ -467,9 +469,14 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { case 0: { std::vector args; args.push_back("gkfs"); - args.push_back(mountpoint.c_str()); - args.push_back(rootdir.c_str()); + args.push_back("-c"); + args.push_back("gkfs.conf"); + args.push_back("-n"); args.push_back(nodes.c_str()); + //args.push_back("-w"); + //args.push_back(walltime.c_str()); + args.push_back("--srun"); + args.push_back("start"); args.push_back(NULL); std::vector env; env.push_back(job_id.c_str()); -- GitLab From 32ae60aeecc8d828abc29ea8806e4150fa704264 Mon Sep 17 00:00:00 2001 From: rnou Date: Tue, 18 Oct 2022 13:45:23 +0200 Subject: [PATCH 08/20] Use C++ interface --- src/scord/rpc_handlers.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 912c909f..69df54f9 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -442,6 +442,7 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { ret = margo_get_input(h, &in); auto adhoc_storage = in.adhoc_storage; + const admire::adhoc_storage::ctx ctx(adhoc_storage->s_adhoc_ctx); assert(ret == HG_SUCCESS); @@ -456,11 +457,11 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { const std::string job_id = "SLURM_JOB_ID=42"; /* Number of nodes */ - int nnodes = adhoc_storage->s_adhoc_ctx->c_resources->r_nodes->l_length; + int nnodes = ctx.resources().nodes().size(); const std::string nodes = "-n "+std::to_string(nnodes); /* Walltime */ - int twalltime = adhoc_storage->s_adhoc_ctx->c_walltime; + int twalltime = ctx.walltime(); const std::string walltime = std::to_string(twalltime); /* Launch script */ -- GitLab From 38548b2feeb989ef505c2178963664f04b77f157 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 20 Oct 2022 13:53:53 +0200 Subject: [PATCH 09/20] Execute gkfs with execvpe, search in the path --- src/scord/rpc_handlers.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 69df54f9..6fb216e3 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -453,9 +453,6 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { /* Look inside adhoc_storage and launch gkfs script */ if (adhoc_storage->s_type == ADM_STORAGE_GEKKOFS) { - /* Extract Job ID -> SLURM_JOB_ID */ - const std::string job_id = "SLURM_JOB_ID=42"; - /* Number of nodes */ int nnodes = ctx.resources().nodes().size(); const std::string nodes = "-n "+std::to_string(nnodes); @@ -470,12 +467,12 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { case 0: { std::vector args; args.push_back("gkfs"); - args.push_back("-c"); - args.push_back("gkfs.conf"); + // args.push_back("-c"); + // args.push_back("gkfs.conf"); args.push_back("-n"); args.push_back(nodes.c_str()); - //args.push_back("-w"); - //args.push_back(walltime.c_str()); + // args.push_back("-w"); + // args.push_back(walltime.c_str()); args.push_back("--srun"); args.push_back("start"); args.push_back(NULL); @@ -483,7 +480,7 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { env.push_back(job_id.c_str()); env.push_back(NULL); - execve("gkfs", const_cast(args.data()), + execvpe("gkfs", const_cast(args.data()), const_cast(env.data())); LOGGER_INFO("ADM_deploy_adhoc_storage() script didn't execute"); exit(0); -- GitLab From 1cfbd55dec895cfd4d13c474c873e41149b89b45 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 20 Oct 2022 13:56:23 +0200 Subject: [PATCH 10/20] Remove job id from deploy, update test --- examples/cxx/ADM_deploy_adhoc_storage.cpp | 4 +--- src/scord/rpc_handlers.cpp | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index bad78b1f..91701ad0 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -37,9 +37,7 @@ main(int argc, char* argv[]) { } admire::server server{"tcp", argv[1]}; - admire::node n1{"node1"}; - admire::node n2("node2"); - std::vector nodes{n1,n2}; + std::vector nodes{admire::node{"node1"},admire::node{"node2"}}; admire::adhoc_storage::resources res(nodes); admire::adhoc_storage adhoc_storage( diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 6fb216e3..c868fd24 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -477,7 +477,6 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { args.push_back("start"); args.push_back(NULL); std::vector env; - env.push_back(job_id.c_str()); env.push_back(NULL); execvpe("gkfs", const_cast(args.data()), -- GitLab From 6904180376611738be9a82716d997c428a610980 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 3 Nov 2022 13:36:38 +0100 Subject: [PATCH 11/20] missing deploy interface --- src/lib/detail/impl.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib/detail/impl.hpp b/src/lib/detail/impl.hpp index 03bf2252..51aa88e9 100644 --- a/src/lib/detail/impl.hpp +++ b/src/lib/detail/impl.hpp @@ -65,6 +65,10 @@ update_adhoc_storage(const server& srv, admire::error_code remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); +admire::error_code +deploy_adhoc_storage(const server& srv, + const adhoc_storage& adhoc_storage); + } // namespace admire::detail #endif // SCORD_ADMIRE_IMPL_HPP -- GitLab From f53bf79f90409dae478ed818615ba5c11bad133c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 3 Nov 2022 13:51:39 +0100 Subject: [PATCH 12/20] deploy impl --- src/lib/detail/impl.cpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/lib/detail/impl.cpp b/src/lib/detail/impl.cpp index 6e5bd922..5c6ebd67 100644 --- a/src/lib/detail/impl.cpp +++ b/src/lib/detail/impl.cpp @@ -357,6 +357,43 @@ register_adhoc_storage(const server& srv, const std::string& name, return rpc_adhoc_storage; } +admire::error_code +deploy_adhoc_storage(const server& srv, + const adhoc_storage& adhoc_storage) { + + scord::network::rpc_client rpc_client{srv.protocol(), rpc_registration_cb}; + + const auto rpc_id = ::api::remote_procedure::new_id(); + auto endp = rpc_client.lookup(srv.address()); + + LOGGER_INFO("rpc id: {} name: {} from: {} => " + "body: {{adhoc_stx: {}}}", + rpc_id, std::quoted("ADM_"s + __FUNCTION__), + std::quoted(rpc_client.self_address()), adhoc_storage); + + const auto rpc_stx = api::convert(adhoc_storage); + + ADM_deploy_adhoc_storage_in_t in{rpc_stx.get()}; + ADM_deploy_adhoc_storage_out_t out; + + const auto rpc = endp.call("ADM_deploy_adhoc_storage", &in, &out); + + if(const auto rv = admire::error_code{out.retval}; !rv) { + LOGGER_ERROR("rpc id: {} name: {} from: {} <= " + "body: {{retval: {}}}", + rpc_id, std::quoted("ADM_"s + __FUNCTION__), + std::quoted(rpc_client.self_address()), rv); + return rv; + } + + LOGGER_INFO("rpc id: {} name: {} from: {} <= " + "body: {{retval: {}}}]", + rpc_id, std::quoted("ADM_"s + __FUNCTION__), + admire::error_code::success); + + return admire::error_code::success; +} + tl::expected transfer_datasets(const server& srv, const job& job, const std::vector& sources, -- GitLab From 30ed3913811de4fa94b00e6164b18851ab72477b Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 3 Nov 2022 15:54:29 +0100 Subject: [PATCH 13/20] Add RPC-ID --- src/scord/rpc_handlers.cpp | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index c868fd24..6d073c6b 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -440,15 +440,22 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { [[maybe_unused]] margo_instance_id mid = margo_hg_handle_get_instance(h); ret = margo_get_input(h, &in); + assert(ret == HG_SUCCESS); + auto adhoc_storage = in.adhoc_storage; const admire::adhoc_storage::ctx ctx(adhoc_storage->s_adhoc_ctx); + + const auto rpc_id = remote_procedure::new_id(); + LOGGER_INFO("rpc id: {} name: {} from: {} => " + "body: {{adhoc_ctx: {}}}", + rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), + ctx); - assert(ret == HG_SUCCESS); - - out.retval = -1; + admire::error_code ec; + ec = admire::error_code::other; - LOGGER_INFO("ADM_deploy_adhoc_storage()"); + out.retval = ec; /* Look inside adhoc_storage and launch gkfs script */ @@ -486,17 +493,25 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { break; } case -1: { - out.retval = -1; - LOGGER_ERROR("ADM_deploy_adhoc_storage() didn't fork"); + ec = ec.other; + LOGGER_ERROR("rpc id: {} name: {} to: {} <= " + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), + ec); break; } default: { - LOGGER_INFO("ADM_deploy_adhoc_storage() executed"); - out.retval = 0; + ec = ec.success; + LOGGER_INFO("rpc id: {} name: {} to: {} <= " + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), + ec); + break; } } } + out.retval = ec; ret = margo_respond(h, &out); assert(ret == HG_SUCCESS); -- GitLab From d3e171ad9738ab24f27c3a6d690f71e61ff4f160 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 3 Nov 2022 16:57:16 +0100 Subject: [PATCH 14/20] remove GekkoFS daemon from testing --- examples/c/ADM_deploy_adhoc_storage.c | 2 +- examples/cxx/ADM_register_adhoc_storage.cpp | 2 +- src/scord/rpc_handlers.cpp | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/c/ADM_deploy_adhoc_storage.c b/examples/c/ADM_deploy_adhoc_storage.c index 8f476ff2..94a504fe 100644 --- a/examples/c/ADM_deploy_adhoc_storage.c +++ b/examples/c/ADM_deploy_adhoc_storage.c @@ -96,7 +96,7 @@ main(int argc, char* argv[]) { } // 2. Register the adhoc storage - if(ADM_register_adhoc_storage(server, adhoc_name, ADM_STORAGE_GEKKOFS, + if(ADM_register_adhoc_storage(server, adhoc_name, ADM_STORAGE_DATACLAY, adhoc_ctx, &adhoc_storage) != ADM_SUCCESS) { fprintf(stderr, "ADM_register_adhoc_storage() failed: %s\n", ADM_strerror(ret)); diff --git a/examples/cxx/ADM_register_adhoc_storage.cpp b/examples/cxx/ADM_register_adhoc_storage.cpp index f877d7f2..b18c19b6 100644 --- a/examples/cxx/ADM_register_adhoc_storage.cpp +++ b/examples/cxx/ADM_register_adhoc_storage.cpp @@ -55,7 +55,7 @@ main(int argc, char* argv[]) { try { const auto adhoc_storage = admire::register_adhoc_storage( - server, name, admire::storage::type::gekkofs, + server, name, admire::storage::type::dataclay, adhoc_storage_ctx); fmt::print(stdout, diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 6d073c6b..9d5927a9 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -448,12 +448,12 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { const auto rpc_id = remote_procedure::new_id(); LOGGER_INFO("rpc id: {} name: {} from: {} => " - "body: {{adhoc_ctx: {}}}", + "body: {{adhoc_storage: {}}}", rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), - ctx); + admire::adhoc_storage(adhoc_storage)); admire::error_code ec; - ec = admire::error_code::other; + ec = admire::error_code::success; out.retval = ec; @@ -502,17 +502,17 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { } default: { ec = ec.success; - LOGGER_INFO("rpc id: {} name: {} to: {} <= " - "body: {{retval: {}}}", - rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), - ec); - break; } } } + out.retval = ec; - + LOGGER_INFO("rpc id: {} name: {} to: {} <= " + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), + ec); + ret = margo_respond(h, &out); assert(ret == HG_SUCCESS); -- GitLab From b73f0ac11aa4c8315aee91cb5f36a85ab7de5c77 Mon Sep 17 00:00:00 2001 From: rnou Date: Tue, 15 Nov 2022 08:59:58 +0100 Subject: [PATCH 15/20] GekkoFS -> Dataclay for tests, to avoid daemon launch --- examples/cxx/ADM_deploy_adhoc_storage.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index 91701ad0..86b0cf42 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -41,7 +41,7 @@ main(int argc, char* argv[]) { admire::adhoc_storage::resources res(nodes); admire::adhoc_storage adhoc_storage( - admire::storage::type::gekkofs, "foobar", 1, + admire::storage::type::dataclay, "foobar", 1, admire::adhoc_storage::execution_mode::separate_new, admire::adhoc_storage::access_type::read_write, res, 100, false); -- GitLab From 2c1fdd050de83a9291b83e21dcb5a9e1a6805e58 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 18 Nov 2022 08:02:15 +0100 Subject: [PATCH 16/20] Added common, rebased with error code --- examples/cxx/ADM_deploy_adhoc_storage.cpp | 14 +++++++------- src/lib/admire.cpp | 11 +++++++++-- src/lib/admire.hpp | 2 +- src/lib/c_wrapper.cpp | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index 86b0cf42..fbbd5c7d 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -24,7 +24,7 @@ #include #include - +#include "common.hpp" int main(int argc, char* argv[]) { @@ -37,19 +37,19 @@ main(int argc, char* argv[]) { } admire::server server{"tcp", argv[1]}; - std::vector nodes{admire::node{"node1"},admire::node{"node2"}}; + + std::vector nodes = prepare_nodes(10); admire::adhoc_storage::resources res(nodes); admire::adhoc_storage adhoc_storage( - admire::storage::type::dataclay, "foobar", 1, - admire::adhoc_storage::execution_mode::separate_new, - admire::adhoc_storage::access_type::read_write, - res, 100, false); + admire::storage::type::dataclay, "foobar", 1, + admire::adhoc_storage::execution_mode::separate_new, + admire::adhoc_storage::access_type::read_write, res, 100, false); ADM_return_t ret = ADM_SUCCESS; try { - ret = admire::deploy_adhoc_storage(server, adhoc_storage); + admire::deploy_adhoc_storage(server, adhoc_storage); } catch(const std::exception& e) { fmt::print(stderr, "FATAL: ADM_deploy_adhoc_storage() failed: {}\n", e.what()); diff --git a/src/lib/admire.cpp b/src/lib/admire.cpp index b91e5e79..5d1e178e 100644 --- a/src/lib/admire.cpp +++ b/src/lib/admire.cpp @@ -283,9 +283,16 @@ remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage) { } } -ADM_return_t +void deploy_adhoc_storage(const server& srv, const admire::adhoc_storage& adhoc_storage) { - return detail::deploy_adhoc_storage(srv, adhoc_storage); + + const auto ec = detail::deploy_adhoc_storage(srv, adhoc_storage); + + if(!ec) { + throw std::runtime_error(fmt::format( + "ADM_deploy_adhoc_storage() error: {}", ec.message())); + } + } ADM_return_t diff --git a/src/lib/admire.hpp b/src/lib/admire.hpp index aa78cfa2..00ab6fb0 100644 --- a/src/lib/admire.hpp +++ b/src/lib/admire.hpp @@ -69,7 +69,7 @@ update_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage, void remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); -ADM_return_t +void deploy_adhoc_storage(const server& srv, const adhoc_storage& reqs); ADM_return_t diff --git a/src/lib/c_wrapper.cpp b/src/lib/c_wrapper.cpp index fd40849c..cd17597a 100644 --- a/src/lib/c_wrapper.cpp +++ b/src/lib/c_wrapper.cpp @@ -129,7 +129,7 @@ ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { adhoc_storage->s_name, adhoc_storage->s_id, adhoc_storage->s_adhoc_ctx}; - return admire::deploy_adhoc_storage(srv, storage); + return admire::detail::deploy_adhoc_storage(srv, storage); } ADM_return_t -- GitLab From a6296c9975b59f58c0843179fa59e80574da6a6a Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 18 Nov 2022 13:22:31 +0100 Subject: [PATCH 17/20] Examples with id (Deploy) --- examples/c/ADM_deploy_adhoc_storage.c | 5 +- examples/cxx/ADM_deploy_adhoc_storage.cpp | 35 +++++++++--- src/common/net/proto/rpc_types.h | 9 +-- src/lib/admire.cpp | 4 +- src/lib/admire.h | 5 +- src/lib/admire.hpp | 2 +- src/lib/c_wrapper.cpp | 8 +-- src/lib/detail/impl.cpp | 11 ++-- src/lib/detail/impl.hpp | 2 +- src/scord/rpc_handlers.cpp | 70 ++++++++++++++--------- 10 files changed, 91 insertions(+), 60 deletions(-) diff --git a/examples/c/ADM_deploy_adhoc_storage.c b/examples/c/ADM_deploy_adhoc_storage.c index 94a504fe..4444d88e 100644 --- a/examples/c/ADM_deploy_adhoc_storage.c +++ b/examples/c/ADM_deploy_adhoc_storage.c @@ -25,7 +25,9 @@ #include #include #include +#include #include "common.h" +#include #define NADHOC_NODES 25 #define NINPUTS 10 @@ -118,7 +120,8 @@ main(int argc, char* argv[]) { } // We can now request the deployment to the server - if((ret = ADM_deploy_adhoc_storage(server, adhoc_storage)) != ADM_SUCCESS) { + if((ret = ADM_deploy_adhoc_storage(server, adhoc_storage->s_id)) != + ADM_SUCCESS) { fprintf(stderr, "ADM_deploy_adhoc_storage() failed: %s\n", ADM_strerror(ret)); goto cleanup; diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index fbbd5c7d..93845899 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -26,6 +26,11 @@ #include #include "common.hpp" +#define NJOB_NODES 50 +#define NADHOC_NODES 25 +#define NINPUTS 10 +#define NOUTPUTS 5 + int main(int argc, char* argv[]) { @@ -38,24 +43,36 @@ main(int argc, char* argv[]) { admire::server server{"tcp", argv[1]}; - std::vector nodes = prepare_nodes(10); - admire::adhoc_storage::resources res(nodes); + const auto adhoc_nodes = prepare_nodes(NADHOC_NODES); + const auto inputs = prepare_datasets("input-dataset-{}", NINPUTS); + const auto outputs = prepare_datasets("output-dataset-{}", NOUTPUTS); - admire::adhoc_storage adhoc_storage( - admire::storage::type::dataclay, "foobar", 1, + std::string name = "adhoc_storage_42"; + const auto adhoc_storage_ctx = admire::adhoc_storage::ctx{ admire::adhoc_storage::execution_mode::separate_new, - admire::adhoc_storage::access_type::read_write, res, 100, false); - - ADM_return_t ret = ADM_SUCCESS; + admire::adhoc_storage::access_type::read_write, + admire::adhoc_storage::resources{adhoc_nodes}, 100, false}; try { - admire::deploy_adhoc_storage(server, adhoc_storage); + const auto adhoc_storage = admire::register_adhoc_storage( + server, name, admire::storage::type::dataclay, + adhoc_storage_ctx); + + fmt::print(stdout, + "ADM_register_adhoc_storage() remote procedure completed " + "successfully\n"); + + admire::deploy_adhoc_storage(server, adhoc_storage.id()); + } catch(const std::exception& e) { - fmt::print(stderr, "FATAL: ADM_deploy_adhoc_storage() failed: {}\n", + fmt::print(stderr, "FATAL: ADM_register_adhoc_storage() failed: {}\n", e.what()); exit(EXIT_FAILURE); } + ADM_return_t ret = ADM_SUCCESS; + + if(ret != ADM_SUCCESS) { fmt::print(stdout, "ADM_deploy_adhoc_storage() remote procedure not completed " diff --git a/src/common/net/proto/rpc_types.h b/src/common/net/proto/rpc_types.h index e5551fe0..b624c828 100644 --- a/src/common/net/proto/rpc_types.h +++ b/src/common/net/proto/rpc_types.h @@ -174,8 +174,6 @@ typedef struct adm_pfs_context { const char* c_mount; } adm_pfs_context; -hg_return_t -hg_proc_ADM_pfs_context_t(hg_proc_t proc, void* data); // clang-format off MERCURY_GEN_STRUCT_PROC( @@ -374,9 +372,12 @@ MERCURY_GEN_PROC( ); /// ADM_deploy_adhoc_storage -MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_in_t, ((ADM_storage_t) (adhoc_storage))) +MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_in_t, ((hg_uint64_t) (id))) -MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_out_t, ((int32_t) (retval))) +MERCURY_GEN_PROC(ADM_deploy_adhoc_storage_out_t, + ((hg_uint64_t) (op_id)) + ((hg_int32_t) (retval)) +); /// ADM_register_pfs_storage MERCURY_GEN_PROC(ADM_register_pfs_storage_in_t, ((int32_t) (reqs))) diff --git a/src/lib/admire.cpp b/src/lib/admire.cpp index 5d1e178e..1b694cde 100644 --- a/src/lib/admire.cpp +++ b/src/lib/admire.cpp @@ -284,9 +284,9 @@ remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage) { } void -deploy_adhoc_storage(const server& srv, const admire::adhoc_storage& adhoc_storage) { +deploy_adhoc_storage(const server& srv, const uint64_t adhoc_id) { - const auto ec = detail::deploy_adhoc_storage(srv, adhoc_storage); + const auto ec = detail::deploy_adhoc_storage(srv, adhoc_id); if(!ec) { throw std::runtime_error(fmt::format( diff --git a/src/lib/admire.h b/src/lib/admire.h index 2545d6c4..155434cd 100644 --- a/src/lib/admire.h +++ b/src/lib/admire.h @@ -147,13 +147,12 @@ ADM_remove_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage); * Initiate the deployment of an adhoc storage system instance. * * @param[in] server The server to which the request is directed - * @param[in] job An ADM_JOB identifying the originating job. - * @param[in] adhoc_storage An ADM_STORAGE referring to the adhoc storage + * @param[in] adhoc_id An ADM_STORAGE referring to the adhoc storage * instance of interest. * @return Returns ADM_SUCCESS if the remote procedure has completed */ ADM_return_t -ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage); +ADM_deploy_adhoc_storage(ADM_server_t server, u_int64_t adhoc_id); /** * Register a PFS storage tier. diff --git a/src/lib/admire.hpp b/src/lib/admire.hpp index 00ab6fb0..53918b27 100644 --- a/src/lib/admire.hpp +++ b/src/lib/admire.hpp @@ -70,7 +70,7 @@ void remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); void -deploy_adhoc_storage(const server& srv, const adhoc_storage& reqs); +deploy_adhoc_storage(const server& srv, uint64_t adhoc_id); ADM_return_t register_pfs_storage(const server& srv, ADM_pfs_context_t ctx, diff --git a/src/lib/c_wrapper.cpp b/src/lib/c_wrapper.cpp index cd17597a..62713b0d 100644 --- a/src/lib/c_wrapper.cpp +++ b/src/lib/c_wrapper.cpp @@ -121,15 +121,11 @@ ADM_remove_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { } ADM_return_t -ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { +ADM_deploy_adhoc_storage(ADM_server_t server, uint64_t adhoc_id) { const admire::server srv{server}; - auto storage = admire::adhoc_storage{ - static_cast(adhoc_storage->s_type), - adhoc_storage->s_name, - adhoc_storage->s_id, adhoc_storage->s_adhoc_ctx}; - return admire::detail::deploy_adhoc_storage(srv, storage); + return admire::detail::deploy_adhoc_storage(srv, adhoc_id); } ADM_return_t diff --git a/src/lib/detail/impl.cpp b/src/lib/detail/impl.cpp index 5c6ebd67..72e78eb6 100644 --- a/src/lib/detail/impl.cpp +++ b/src/lib/detail/impl.cpp @@ -359,7 +359,7 @@ register_adhoc_storage(const server& srv, const std::string& name, admire::error_code deploy_adhoc_storage(const server& srv, - const adhoc_storage& adhoc_storage) { + const uint64_t adhoc_id) { scord::network::rpc_client rpc_client{srv.protocol(), rpc_registration_cb}; @@ -367,14 +367,13 @@ deploy_adhoc_storage(const server& srv, auto endp = rpc_client.lookup(srv.address()); LOGGER_INFO("rpc id: {} name: {} from: {} => " - "body: {{adhoc_stx: {}}}", + "body: {{adhoc_id: {}}}", rpc_id, std::quoted("ADM_"s + __FUNCTION__), - std::quoted(rpc_client.self_address()), adhoc_storage); - - const auto rpc_stx = api::convert(adhoc_storage); + std::quoted(rpc_client.self_address()), adhoc_id); - ADM_deploy_adhoc_storage_in_t in{rpc_stx.get()}; + ADM_deploy_adhoc_storage_in_t in{adhoc_id}; ADM_deploy_adhoc_storage_out_t out; + out.op_id = rpc_id; const auto rpc = endp.call("ADM_deploy_adhoc_storage", &in, &out); diff --git a/src/lib/detail/impl.hpp b/src/lib/detail/impl.hpp index 51aa88e9..b8e90eb0 100644 --- a/src/lib/detail/impl.hpp +++ b/src/lib/detail/impl.hpp @@ -67,7 +67,7 @@ remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); admire::error_code deploy_adhoc_storage(const server& srv, - const adhoc_storage& adhoc_storage); + const uint64_t adhoc_id); } // namespace admire::detail diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index 9d5927a9..ab924057 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -33,6 +33,7 @@ // Process running #include +#include struct remote_procedure { static std::uint64_t @@ -443,34 +444,43 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { assert(ret == HG_SUCCESS); - auto adhoc_storage = in.adhoc_storage; - const admire::adhoc_storage::ctx ctx(adhoc_storage->s_adhoc_ctx); - const auto rpc_id = remote_procedure::new_id(); LOGGER_INFO("rpc id: {} name: {} from: {} => " - "body: {{adhoc_storage: {}}}", + "body: {{adhoc_id: {}}}", rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), - admire::adhoc_storage(adhoc_storage)); + in.id); admire::error_code ec; ec = admire::error_code::success; + auto& adhoc_manager = scord::adhoc_storage_manager::instance(); + auto adhoc_storage_info = adhoc_manager.find(in.id); + + if(!adhoc_storage_info) { + LOGGER_ERROR("rpc id: {} error_msg: \"Error finding adhoc_storage: {}", + rpc_id, in.id); + ec = admire::error_code::no_such_entity; + } + + const auto& storage_info = adhoc_storage_info.value(); + const auto adhoc_storage = storage_info->adhoc_storage(); out.retval = ec; /* Look inside adhoc_storage and launch gkfs script */ - if (adhoc_storage->s_type == ADM_STORAGE_GEKKOFS) { - /* Number of nodes */ - int nnodes = ctx.resources().nodes().size(); - const std::string nodes = "-n "+std::to_string(nnodes); + if(adhoc_storage.type() == admire::storage::type::gekkofs) { + const auto adhoc_ctx = + (admire::adhoc_storage::ctx*) adhoc_storage.context().get(); + /* Number of nodes */ + const std::string nodes = + std::to_string(adhoc_ctx->resources().nodes().size()); - /* Walltime */ - int twalltime = ctx.walltime(); - const std::string walltime = std::to_string(twalltime); + /* Walltime */ + const std::string walltime = std::to_string(adhoc_ctx->walltime()); - /* Launch script */ + /* Launch script */ pid_t pid = fork(); - switch (pid) { + switch(pid) { case 0: { std::vector args; args.push_back("gkfs"); @@ -485,34 +495,40 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { args.push_back(NULL); std::vector env; env.push_back(NULL); - - execvpe("gkfs", const_cast(args.data()), - const_cast(env.data())); + + execvpe("gkfs", const_cast(args.data()), + const_cast(env.data())); LOGGER_INFO("ADM_deploy_adhoc_storage() script didn't execute"); exit(0); break; - } + } case -1: { ec = ec.other; LOGGER_ERROR("rpc id: {} name: {} to: {} <= " - "body: {{retval: {}}}", - rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), - ec); + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), + std::quoted(get_address(h)), ec); break; } default: { - ec = ec.success; + int wstatus = 0; + waitpid(pid, &wstatus, 0); + if(WEXITSTATUS(wstatus) != 0) { + ec = ec.other; + } else { + ec = ec.success; + } break; } } } - + out.retval = ec; LOGGER_INFO("rpc id: {} name: {} to: {} <= " - "body: {{retval: {}}}", - rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), - ec); - + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), + ec); + ret = margo_respond(h, &out); assert(ret == HG_SUCCESS); -- GitLab From 7578bb064ad045dd0b998215865a109722b767f8 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 18 Nov 2022 13:59:52 +0100 Subject: [PATCH 18/20] Id-op and error deploy --- src/lib/detail/impl.cpp | 8 ++++---- src/scord/rpc_handlers.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lib/detail/impl.cpp b/src/lib/detail/impl.cpp index 72e78eb6..2dd2bb61 100644 --- a/src/lib/detail/impl.cpp +++ b/src/lib/detail/impl.cpp @@ -379,16 +379,16 @@ deploy_adhoc_storage(const server& srv, if(const auto rv = admire::error_code{out.retval}; !rv) { LOGGER_ERROR("rpc id: {} name: {} from: {} <= " - "body: {{retval: {}}}", + "body: {{retval: {}}} [op_id: {}]", rpc_id, std::quoted("ADM_"s + __FUNCTION__), - std::quoted(rpc_client.self_address()), rv); + std::quoted(rpc_client.self_address()), rv, out.op_id); return rv; } LOGGER_INFO("rpc id: {} name: {} from: {} <= " - "body: {{retval: {}}}]", + "body: {{retval: {}}}] [op_id: {}]", rpc_id, std::quoted("ADM_"s + __FUNCTION__), - admire::error_code::success); + admire::error_code::success, out.op_id); return admire::error_code::success; } diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index ab924057..bb708279 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -499,11 +499,11 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { execvpe("gkfs", const_cast(args.data()), const_cast(env.data())); LOGGER_INFO("ADM_deploy_adhoc_storage() script didn't execute"); - exit(0); + exit(EXIT_FAILURE); break; } case -1: { - ec = ec.other; + ec = admire::error_code::other; LOGGER_ERROR("rpc id: {} name: {} to: {} <= " "body: {{retval: {}}}", rpc_id, std::quoted(__FUNCTION__), @@ -514,9 +514,9 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { int wstatus = 0; waitpid(pid, &wstatus, 0); if(WEXITSTATUS(wstatus) != 0) { - ec = ec.other; + ec = admire::error_code::other; } else { - ec = ec.success; + ec = admire::error_code::success; } break; } -- GitLab From 92278242103d725aef03ab4c7237086f2d5bf3ca Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 21 Nov 2022 09:08:51 +0100 Subject: [PATCH 19/20] Reverted ADM_storage public interface --- examples/c/ADM_deploy_adhoc_storage.c | 2 +- examples/cxx/ADM_deploy_adhoc_storage.cpp | 18 +-- src/lib/admire.cpp | 4 +- src/lib/admire.h | 5 +- src/lib/admire.hpp | 2 +- src/lib/c_wrapper.cpp | 5 +- src/lib/detail/impl.cpp | 11 +- src/lib/detail/impl.hpp | 2 +- src/scord/rpc_handlers.cpp | 129 +++++++++++----------- 9 files changed, 85 insertions(+), 93 deletions(-) diff --git a/examples/c/ADM_deploy_adhoc_storage.c b/examples/c/ADM_deploy_adhoc_storage.c index 4444d88e..04ad737a 100644 --- a/examples/c/ADM_deploy_adhoc_storage.c +++ b/examples/c/ADM_deploy_adhoc_storage.c @@ -120,7 +120,7 @@ main(int argc, char* argv[]) { } // We can now request the deployment to the server - if((ret = ADM_deploy_adhoc_storage(server, adhoc_storage->s_id)) != + if((ret = ADM_deploy_adhoc_storage(server, adhoc_storage)) != ADM_SUCCESS) { fprintf(stderr, "ADM_deploy_adhoc_storage() failed: %s\n", ADM_strerror(ret)); diff --git a/examples/cxx/ADM_deploy_adhoc_storage.cpp b/examples/cxx/ADM_deploy_adhoc_storage.cpp index 93845899..cc4ae7a2 100644 --- a/examples/cxx/ADM_deploy_adhoc_storage.cpp +++ b/examples/cxx/ADM_deploy_adhoc_storage.cpp @@ -62,21 +62,13 @@ main(int argc, char* argv[]) { "ADM_register_adhoc_storage() remote procedure completed " "successfully\n"); - admire::deploy_adhoc_storage(server, adhoc_storage.id()); + admire::deploy_adhoc_storage(server, adhoc_storage); } catch(const std::exception& e) { - fmt::print(stderr, "FATAL: ADM_register_adhoc_storage() failed: {}\n", - e.what()); - exit(EXIT_FAILURE); - } - - ADM_return_t ret = ADM_SUCCESS; - - - if(ret != ADM_SUCCESS) { - fmt::print(stdout, - "ADM_deploy_adhoc_storage() remote procedure not completed " - "successfully\n"); + fmt::print( + stderr, + "FATAL: ADM_register_adhoc_storage() or ADM_deploy_adhoc_storage() failed: {}\n", + e.what()); exit(EXIT_FAILURE); } diff --git a/src/lib/admire.cpp b/src/lib/admire.cpp index 1b694cde..e3e74f11 100644 --- a/src/lib/admire.cpp +++ b/src/lib/admire.cpp @@ -284,9 +284,9 @@ remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage) { } void -deploy_adhoc_storage(const server& srv, const uint64_t adhoc_id) { +deploy_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage) { - const auto ec = detail::deploy_adhoc_storage(srv, adhoc_id); + const auto ec = detail::deploy_adhoc_storage(srv, adhoc_storage); if(!ec) { throw std::runtime_error(fmt::format( diff --git a/src/lib/admire.h b/src/lib/admire.h index 155434cd..2545d6c4 100644 --- a/src/lib/admire.h +++ b/src/lib/admire.h @@ -147,12 +147,13 @@ ADM_remove_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage); * Initiate the deployment of an adhoc storage system instance. * * @param[in] server The server to which the request is directed - * @param[in] adhoc_id An ADM_STORAGE referring to the adhoc storage + * @param[in] job An ADM_JOB identifying the originating job. + * @param[in] adhoc_storage An ADM_STORAGE referring to the adhoc storage * instance of interest. * @return Returns ADM_SUCCESS if the remote procedure has completed */ ADM_return_t -ADM_deploy_adhoc_storage(ADM_server_t server, u_int64_t adhoc_id); +ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage); /** * Register a PFS storage tier. diff --git a/src/lib/admire.hpp b/src/lib/admire.hpp index 53918b27..980fcb27 100644 --- a/src/lib/admire.hpp +++ b/src/lib/admire.hpp @@ -70,7 +70,7 @@ void remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); void -deploy_adhoc_storage(const server& srv, uint64_t adhoc_id); +deploy_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); ADM_return_t register_pfs_storage(const server& srv, ADM_pfs_context_t ctx, diff --git a/src/lib/c_wrapper.cpp b/src/lib/c_wrapper.cpp index 62713b0d..fd5744e5 100644 --- a/src/lib/c_wrapper.cpp +++ b/src/lib/c_wrapper.cpp @@ -121,11 +121,12 @@ ADM_remove_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { } ADM_return_t -ADM_deploy_adhoc_storage(ADM_server_t server, uint64_t adhoc_id) { +ADM_deploy_adhoc_storage(ADM_server_t server, ADM_storage_t adhoc_storage) { const admire::server srv{server}; - return admire::detail::deploy_adhoc_storage(srv, adhoc_id); + return admire::detail::deploy_adhoc_storage( + srv, admire::adhoc_storage{adhoc_storage}); } ADM_return_t diff --git a/src/lib/detail/impl.cpp b/src/lib/detail/impl.cpp index 2dd2bb61..e2d84d41 100644 --- a/src/lib/detail/impl.cpp +++ b/src/lib/detail/impl.cpp @@ -359,7 +359,7 @@ register_adhoc_storage(const server& srv, const std::string& name, admire::error_code deploy_adhoc_storage(const server& srv, - const uint64_t adhoc_id) { + const adhoc_storage& adhoc_storage) { scord::network::rpc_client rpc_client{srv.protocol(), rpc_registration_cb}; @@ -367,14 +367,13 @@ deploy_adhoc_storage(const server& srv, auto endp = rpc_client.lookup(srv.address()); LOGGER_INFO("rpc id: {} name: {} from: {} => " - "body: {{adhoc_id: {}}}", + "body: {{adhoc_storage: {}}}", rpc_id, std::quoted("ADM_"s + __FUNCTION__), - std::quoted(rpc_client.self_address()), adhoc_id); + std::quoted(rpc_client.self_address()), adhoc_storage); - ADM_deploy_adhoc_storage_in_t in{adhoc_id}; + ADM_deploy_adhoc_storage_in_t in{adhoc_storage.id()}; ADM_deploy_adhoc_storage_out_t out; - out.op_id = rpc_id; - + const auto rpc = endp.call("ADM_deploy_adhoc_storage", &in, &out); if(const auto rv = admire::error_code{out.retval}; !rv) { diff --git a/src/lib/detail/impl.hpp b/src/lib/detail/impl.hpp index b8e90eb0..51aa88e9 100644 --- a/src/lib/detail/impl.hpp +++ b/src/lib/detail/impl.hpp @@ -67,7 +67,7 @@ remove_adhoc_storage(const server& srv, const adhoc_storage& adhoc_storage); admire::error_code deploy_adhoc_storage(const server& srv, - const uint64_t adhoc_id); + const adhoc_storage& adhoc_storage); } // namespace admire::detail diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index bb708279..e3d68dbe 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -450,80 +450,79 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), in.id); - admire::error_code ec; - ec = admire::error_code::success; + auto ec = admire::error_code::success; auto& adhoc_manager = scord::adhoc_storage_manager::instance(); - auto adhoc_storage_info = adhoc_manager.find(in.id); - - if(!adhoc_storage_info) { - LOGGER_ERROR("rpc id: {} error_msg: \"Error finding adhoc_storage: {}", - rpc_id, in.id); - ec = admire::error_code::no_such_entity; - } - - const auto& storage_info = adhoc_storage_info.value(); - const auto adhoc_storage = storage_info->adhoc_storage(); - out.retval = ec; - - /* Look inside adhoc_storage and launch gkfs script */ - - if(adhoc_storage.type() == admire::storage::type::gekkofs) { - const auto adhoc_ctx = - (admire::adhoc_storage::ctx*) adhoc_storage.context().get(); - /* Number of nodes */ - const std::string nodes = - std::to_string(adhoc_ctx->resources().nodes().size()); - - /* Walltime */ - const std::string walltime = std::to_string(adhoc_ctx->walltime()); - - /* Launch script */ - pid_t pid = fork(); - switch(pid) { - case 0: { - std::vector args; - args.push_back("gkfs"); - // args.push_back("-c"); - // args.push_back("gkfs.conf"); - args.push_back("-n"); - args.push_back(nodes.c_str()); - // args.push_back("-w"); - // args.push_back(walltime.c_str()); - args.push_back("--srun"); - args.push_back("start"); - args.push_back(NULL); - std::vector env; - env.push_back(NULL); - - execvpe("gkfs", const_cast(args.data()), - const_cast(env.data())); - LOGGER_INFO("ADM_deploy_adhoc_storage() script didn't execute"); - exit(EXIT_FAILURE); - break; - } - case -1: { - ec = admire::error_code::other; - LOGGER_ERROR("rpc id: {} name: {} to: {} <= " - "body: {{retval: {}}}", - rpc_id, std::quoted(__FUNCTION__), - std::quoted(get_address(h)), ec); - break; - } - default: { - int wstatus = 0; - waitpid(pid, &wstatus, 0); - if(WEXITSTATUS(wstatus) != 0) { + if(const auto am_result = adhoc_manager.find(in.id); + am_result.has_value()) { + const auto& storage_info = am_result.value(); + const auto adhoc_storage = storage_info->adhoc_storage(); + ec = admire::error_code::success; + if(adhoc_storage.type() == admire::storage::type::gekkofs) { + const auto adhoc_ctx = + (admire::adhoc_storage::ctx*) adhoc_storage.context().get(); + /* Number of nodes */ + const std::string nodes = + std::to_string(adhoc_ctx->resources().nodes().size()); + + /* Walltime */ + const std::string walltime = std::to_string(adhoc_ctx->walltime()); + + /* Launch script */ + switch(const auto pid = fork()) { + case 0: { + std::vector args; + args.push_back("gkfs"); + // args.push_back("-c"); + // args.push_back("gkfs.conf"); + args.push_back("-n"); + args.push_back(nodes.c_str()); + // args.push_back("-w"); + // args.push_back(walltime.c_str()); + args.push_back("--srun"); + args.push_back("start"); + args.push_back(NULL); + std::vector env; + env.push_back(NULL); + + execvpe("gkfs", const_cast(args.data()), + const_cast(env.data())); + LOGGER_INFO( + "ADM_deploy_adhoc_storage() script didn't execute"); + exit(EXIT_FAILURE); + break; + } + case -1: { ec = admire::error_code::other; - } else { - ec = admire::error_code::success; + LOGGER_ERROR("rpc id: {} name: {} to: {} <= " + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), + std::quoted(get_address(h)), ec); + break; + } + default: { + int wstatus = 0; + waitpid(pid, &wstatus, 0); + if(WEXITSTATUS(wstatus) != 0) { + ec = admire::error_code::other; + } else { + ec = admire::error_code::success; + } + break; } - break; } } + + } else { + ec = am_result.error(); + LOGGER_ERROR("rpc id: {} name: {} to: {} <= " + "body: {{retval: {}}}", + rpc_id, std::quoted(__FUNCTION__), + std::quoted(get_address(h)), ec); } out.retval = ec; + LOGGER_INFO("rpc id: {} name: {} to: {} <= " "body: {{retval: {}}}", rpc_id, std::quoted(__FUNCTION__), std::quoted(get_address(h)), -- GitLab From 67c2160cd74b58e0e30f9f13877eee861fe03f0e Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Wed, 23 Nov 2022 10:40:18 +0100 Subject: [PATCH 20/20] Missing waitpid return value check --- src/scord/rpc_handlers.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/scord/rpc_handlers.cpp b/src/scord/rpc_handlers.cpp index e3d68dbe..8bc7ef81 100644 --- a/src/scord/rpc_handlers.cpp +++ b/src/scord/rpc_handlers.cpp @@ -502,11 +502,18 @@ ADM_deploy_adhoc_storage(hg_handle_t h) { } default: { int wstatus = 0; - waitpid(pid, &wstatus, 0); - if(WEXITSTATUS(wstatus) != 0) { + pid_t retwait = waitpid(pid, &wstatus, 0); + if(retwait == -1) { + LOGGER_ERROR( + "rpc id: {} error_msg: \"Error waitpid code: {}\"", + rpc_id, retwait); ec = admire::error_code::other; } else { - ec = admire::error_code::success; + if(WEXITSTATUS(wstatus) != 0) { + ec = admire::error_code::other; + } else { + ec = admire::error_code::success; + } } break; } -- GitLab