diff --git a/CHANGELOG.md b/CHANGELOG.md index a792df8a15f0db0c2cbaeb5cf95bedf825ce9813..40ab7979078334add743b4179795c14726e4eacb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,9 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - gkfs_do_write uses int instead of ssize_t causing overflow ([!229](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/229)) - proxy remove metadata has inverted return values ([!237](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/237)) - Rename and symlink support leveraged ([!246](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/246)) + - Java with syscalls deadlocks as it tries to resolve paths (malloc) in a locking situation ([!255](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/255)) + - It also solves the missing flock implementation when we are outside GekkoFS + - Some features in syscall_intercept still hang if we do not lower the debug information. ## [0.9.4] - 2025-03 ### New diff --git a/include/client/path.hpp b/include/client/path.hpp index 87acc327a0b4f390c626c74fa8801010a39086b1..7bdc840e9fa988471009259d074bb4b90cb95379 100644 --- a/include/client/path.hpp +++ b/include/client/path.hpp @@ -42,6 +42,8 @@ namespace gkfs::path { +static const std::string excluded_paths[2] = {"sys/", "proc/"}; + unsigned int match_components(const std::string& path, unsigned int& path_components, const std::vector& components); diff --git a/include/client/syscalls/args.hpp b/include/client/syscalls/args.hpp index ea391ba97bf3d348b87588492b11895a6d920405..c833f82dd999b617145e4d8341e3bcf07c4cc370 100644 --- a/include/client/syscalls/args.hpp +++ b/include/client/syscalls/args.hpp @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,7 @@ enum class type { mmap_prot = ::arg_type_t::mmap_prot, mmap_flags = ::arg_type_t::mmap_flags, clone_flags = ::arg_type_t::clone_flags, + clone3_args = ::arg_type_t::clone3_args, signum = ::arg_type_t::signum, 
sigproc_how = ::arg_type_t::sigproc_how, generic = ::arg_type_t::arg, @@ -96,6 +98,7 @@ static constexpr auto whence = type::whence; static constexpr auto mmap_prot = type::mmap_prot; static constexpr auto mmap_flags = type::mmap_flags; static constexpr auto clone_flags = type::clone_flags; +static constexpr auto clone3_args = type::clone3_args; static constexpr auto signum = type::signum; static constexpr auto sigproc_how = type::sigproc_how; static constexpr auto generic = type::generic; @@ -154,6 +157,9 @@ format_mmap_flags_arg_to(FmtBuffer& buffer, const printable_arg& parg); template inline void format_clone_flags_arg_to(FmtBuffer& buffer, const printable_arg& parg); +template inline void +format_clone3_args_arg_to(FmtBuffer& buffer, const printable_arg& parg); + template inline void format_signum_arg_to(FmtBuffer& buffer, const printable_arg& parg); @@ -182,6 +188,7 @@ std::array, arg_type_max> formatters = { /* [mmap_prot] = */ format_mmap_prot_arg_to, /* [mmap_flags] = */ format_mmap_flags_arg_to, /* [clone_flags] = */ format_clone_flags_arg_to, + /* [clone3_args] = */ format_clone3_args_arg_to, /* [signum] = */ format_signum_arg_to, /* [sigproc_how] = */ format_sigproc_how_arg_to, /* [arg] = */ format_arg_to, @@ -444,6 +451,91 @@ format_clone_flags_arg_to(FmtBuffer& buffer, const printable_arg& parg) { return; } +/** + * format_clone3_args_arg_to - format a 'clone_args' argument + * + * Format a pointer to a 'struct clone_args' (such as the one passed to clone3()) + * and append its fields as a 'name=value' list to the provided buffer. 
+ */ +template <typename FmtBuffer> +inline void +format_clone3_args_arg_to(FmtBuffer& buffer, const printable_arg& parg) { + + // struct clone_args { + // u64 flags; /* Flags bit mask */ + // u64 pidfd; /* Where to store PID file descriptor + // (int *) */ + // u64 child_tid; /* Where to store child TID, + // in child's memory (pid_t *) */ + // u64 parent_tid; /* Where to store child TID, + // in parent's memory (pid_t *) */ + // u64 exit_signal; /* Signal to deliver to parent on + // child termination */ + // u64 stack; /* Pointer to lowest byte of stack */ + // u64 stack_size; /* Size of stack */ + // u64 tls; /* Location of new TLS */ + // u64 set_tid; /* Pointer to a pid_t array + // (since Linux 5.5) */ + // u64 set_tid_size; /* Number of elements in set_tid + // (since Linux 5.5) */ + // u64 cgroup; /* File descriptor for target cgroup + // of child (since Linux 5.7) */ + // }; + + + struct clone_args* ca = reinterpret_cast<struct clone_args*>(parg.value); + /* Names for the bits of clone_args.flags */ + const auto flag_names = + utils::make_array( + FLAG_ENTRY(CLONE_VM), + FLAG_ENTRY(CLONE_FS), + FLAG_ENTRY(CLONE_FILES), + FLAG_ENTRY(CLONE_SIGHAND), + FLAG_ENTRY(CLONE_PTRACE), + FLAG_ENTRY(CLONE_VFORK), + FLAG_ENTRY(CLONE_PARENT), + FLAG_ENTRY(CLONE_THREAD), + FLAG_ENTRY(CLONE_NEWNS), + FLAG_ENTRY(CLONE_SYSVSEM), + FLAG_ENTRY(CLONE_SETTLS), + FLAG_ENTRY(CLONE_PARENT_SETTID), + FLAG_ENTRY(CLONE_CHILD_CLEARTID), + FLAG_ENTRY(CLONE_DETACHED), + FLAG_ENTRY(CLONE_UNTRACED), + FLAG_ENTRY(CLONE_CHILD_SETTID), +#ifdef CLONE_NEWCGROUP + FLAG_ENTRY(CLONE_NEWCGROUP), +#endif + FLAG_ENTRY(CLONE_NEWUTS), + FLAG_ENTRY(CLONE_NEWIPC), + FLAG_ENTRY(CLONE_NEWUSER), + FLAG_ENTRY(CLONE_NEWPID), + FLAG_ENTRY(CLONE_NEWNET), + FLAG_ENTRY(CLONE_IO)); + + fmt::format_to(std::back_inserter(buffer), "{}=", "flags"); + format_flag_set(buffer, ca->flags, flag_names); + + fmt::format_to(std::back_inserter(buffer), ",{}=", "exit_signal"); + format_signum_arg_to(buffer, {"", ca->exit_signal}); + + fmt::format_to(std::back_inserter(buffer), 
",{}={}", "pidfd", (void*)ca->pidfd); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "child_tid", (void*)ca->child_tid); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "parent_tid", (void*)ca->parent_tid); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "stack", (void*)ca->stack); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "stack_size", ca->stack_size); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "tls", (void*)ca->tls); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "set_tid", (void*)ca->set_tid); + // NOTE(review): set_tid/set_tid_size (Linux 5.5) and cgroup (Linux 5.7) are read unconditionally; presumably absent when the caller passes an older, smaller clone_args - TODO confirm + fmt::format_to(std::back_inserter(buffer), ",{}={}", "set_tid_size", ca->set_tid_size); + fmt::format_to(std::back_inserter(buffer), ",{}={}", "cgroup", ca->cgroup); + + return; + + +} + + /** * format_signum_arg_to - format a 'signum' argument * diff --git a/include/client/syscalls/detail/syscall_info.h b/include/client/syscalls/detail/syscall_info.h index cca13cd82ffa6d4c60b2c924ade801863c369094..b15b54da31e38366cc9ce3726021d500023b5cbc 100644 --- a/include/client/syscalls/detail/syscall_info.h +++ b/include/client/syscalls/detail/syscall_info.h @@ -64,6 +64,7 @@ typedef enum { mmap_prot, /* protections for the mmap() family of syscalls */ mmap_flags, /* flags for the mmap() family of syscalls */ clone_flags, /* flags for the clone() syscall */ + clone3_args, /* args for the clone3() syscall */ signum, /* signal numbers */ sigproc_how, /* sigprocmask argument */ arg, /* generic argument, no special formatting */ diff --git a/include/config.hpp b/include/config.hpp index c1ba919ca9c9eddeec6882f285b1f65ffd67c1f9..0c7544b9acc1a8907ba4c39a8f69efc686935759 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -114,8 +114,8 @@ constexpr auto dir = "metadata"; // which metadata should be considered apart from size and mode // Blocks are used to store the rename status (-1 is a renamed file) constexpr auto use_atime = false; -constexpr auto use_ctime = false; -constexpr auto use_mtime = 
false; +constexpr auto use_ctime = true; +constexpr auto use_mtime = true; constexpr auto use_link_cnt = false; #ifdef HAS_RENAME constexpr auto use_blocks = true; diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index f1b7b89d03a4911ea122597897fd62b2ac76a573..c97630fd794b1621ce1c67a9cf8eaf3558431b18 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -437,7 +437,7 @@ hook_flock(unsigned long fd, int flags) { if(CTX->file_map()->exist(fd)) { return 0; } else - return -EBADF; + return syscall_no_intercept_wrapper(SYS_flock, fd, flags); } #ifdef SYS_access diff --git a/src/client/path.cpp b/src/client/path.cpp index 6ccb561c7745d656278f7193b899e83d3564cc29..1792a0e70f437c4ef68e44ef969e678c911de319 100644 --- a/src/client/path.cpp +++ b/src/client/path.cpp @@ -62,7 +62,6 @@ using namespace std; namespace gkfs::path { -static const string excluded_paths[2] = {"sys/", "proc/"}; /** Match components in path * diff --git a/src/client/preload_context.cpp b/src/client/preload_context.cpp index 465ed926fb4bb1ccbbad03cdaa4ed7629730aca4..ca5f9479c5505137a02da40f875bb39d2faf4cc6 100644 --- a/src/client/preload_context.cpp +++ b/src/client/preload_context.cpp @@ -301,6 +301,21 @@ PreloadContext::relativize_fd_path(int dirfd, const char* raw_path, // We assume raw path is valid assert(raw_path != nullptr); + // Skips paths that are used on locking places (i.e. java + // /proc/sys/vm/overcommit_memory) and produces deadlocks as we call malloc + // inside. 
+ if(dirfd == AT_FDCWD) { + for(auto& excl_path : gkfs::path::excluded_paths) { + // compare raw_path with excl_path + if(raw_path != nullptr && raw_path[0] == gkfs::path::separator) { + if(strncmp(raw_path + 1, excl_path.c_str(), + excl_path.length()) == 0) { + return RelativizeStatus::external; + } + } + } + } + std::string path; @@ -582,6 +597,8 @@ PreloadContext::unregister_internal_fd(int fd) { bool PreloadContext::is_internal_fd(int fd) const { + if(!protect_fds()) + return false; if(fd < MIN_INTERNAL_FD) { return false; diff --git a/src/client/syscalls/detail/syscall_info.c b/src/client/syscalls/detail/syscall_info.c index 89d8499281ff8a6811f9ec2ff9c1987efd9a60f0..c4c9ba2593ce26eb0374a435394b777ceeeb15d8 100644 --- a/src/client/syscalls/detail/syscall_info.c +++ b/src/client/syscalls/detail/syscall_info.c @@ -642,7 +642,7 @@ SYSCALL(getpmsg, 5, S_RET(rdec), S_NARG(arg, "arg0"), SYSCALL(pidfd_open, 2, S_RET(rdec), S_NARG(dec, "pid"), S_NARG(arg, "flags")), #endif #ifdef SYS_clone3 - SYSCALL(clone3, 4, S_RET(rdec), S_NARG(arg, "flags"), S_NARG(ptr, "child_tid"), S_NARG(ptr, "parent_tid"), S_NARG(ptr, "tls")), + SYSCALL(clone3, 2, S_RET(rdec), S_NARG(clone3_args, "cl_args"), S_NARG(arg, "size")), #endif #ifdef SYS_close_range SYSCALL(close_range, 3, S_RET(rdec), S_NARG(dec, "low"), S_NARG(dec, "high"), S_NARG(arg, "flags")),