diff --git a/CHANGELOG.md b/CHANGELOG.md index ec561aad514e65f35ec826efc81e5ed80a598d16..32bcb3b2f557dae4edf79dd493990cad570aecb3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - We cannot use lstat directly as may cause a recursion call on libc interception. - Un/Packing order of directory entries in compressed format was incorrect ([!281](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/281)) - Fix pytorch mmap ([!291](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/291)) + - Fix cuda in syscall ([!292](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/292)) + - mmap and dangling fd issues ## [0.9.5] - 2025-08 diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index cdfe357c97641de1eeee485e59ad928a0349321a..b70ace750766cf321cfb0710880bad5f2809fdf4 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -193,6 +193,12 @@ gkfs_munmap(void* addr, size_t length); int gkfs_msync(void* addr, size_t length, int flags); +// Returns true if addr is currently registered as a GekkoFS mmap mapping. +// Cheap shared-lock read; used by hooks to skip gkfs_munmap/gkfs_msync +// for non-GekkoFS addresses (e.g. CUDA GPU memory mappings). +bool +gkfs_mmap_is_tracked(void* addr); + } // namespace gkfs::syscall diff --git a/include/client/open_file_map.hpp b/include/client/open_file_map.hpp index e4b6bd6092c96d1c13ae9fb27c11a42ad9f7ef04..15c4245a9c8504c67abae9522f1598b0e3790571 100644 --- a/include/client/open_file_map.hpp +++ b/include/client/open_file_map.hpp @@ -46,6 +46,8 @@ #include #include #include +#include + namespace gkfs::filemap { @@ -189,6 +191,9 @@ public: int get_fd_idx(); + + std::vector + get_range(unsigned int first, unsigned int last); }; } // namespace gkfs::filemap diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 8df851d28280ba64e494ff0b69073f4f8cc24bf0..c747f70af8e817cadc76a4007ab1ee832662e9d7 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -96,11 +97,21 @@ struct MmapEntry { int flags; }; -std::mutex mmap_mtx; +// Use shared_mutex so that gkfs_mmap_is_tracked() can do a lock-free +// quick-check without blocking concurrent mmap/munmap operations. +std::shared_mutex mmap_mtx; std::unordered_map mmap_registry; } // namespace +bool +gkfs_mmap_is_tracked(void* addr) { + // Shared (read-only) lock: does not block concurrent is_tracked queries + // or mmap registrations that don't call munmap. + std::shared_lock lock(mmap_mtx); + return mmap_registry.find(addr) != mmap_registry.end(); +} + void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { @@ -143,7 +154,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Re-acquire lock to insert mapping { - std::lock_guard lock(mmap_mtx); + std::unique_lock lock(mmap_mtx); mmap_registry[ptr] = {ptr, length, path, offset, prot, flags}; } @@ -164,7 +175,7 @@ gkfs_msync(void* addr, size_t length, int flags) { bool do_writeback = false; { - std::lock_guard lock(mmap_mtx); + std::unique_lock lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; @@ -210,7 +221,7 @@ gkfs_munmap(void* addr, size_t length) { bool do_writeback = false; { - std::lock_guard lock(mmap_mtx); + std::unique_lock lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index b6f37e7980ffb1c9c162e06575ed6eb0741fd034..f77059039301b73dfb13385b396cd538996090e8 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -71,6 +72,27 @@ with_errno(T ret) { constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; +bool +kernel_fd_targets_dev_null(int fd) { + char proc_fd_path[64]; + const auto path_len = std::snprintf(proc_fd_path, sizeof(proc_fd_path), + "/proc/self/fd/%d", fd); + if(path_len <= 0 || path_len >= static_cast(sizeof(proc_fd_path))) { + return false; + } + + char target_path[256]; + const auto nread = + syscall_no_intercept_wrapper(SYS_readlinkat, AT_FDCWD, proc_fd_path, + target_path, sizeof(target_path) - 1); + if(nread < 0) { + return false; + } + + target_path[nread] = '\0'; + return std::string(target_path) == "/dev/null"; +} + ssize_t copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, size_t count) { @@ -588,42 +610,45 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { LOG(DEBUG, "{}() called with oldname: \"{}\", newfd: {}, newname: \"{}\"", __func__, oldname, newdfd, newname); - bool internal1 = false; - std::string oldname_resolved; - if(CTX->relativize_path(oldname, oldname_resolved)) { - if(!gkfs::config::metadata::symlink_support) { - LOG(WARNING, "{}() operation not supported", __func__); - return -ENOTSUP; - } - internal1 = true; - } - + // First determine where the destination (newname) lives. + // We MUST do this before touching oldname, because oldname is a literal + // symlink content string — NOT a filesystem path to resolve. Calling + // relativize_path() on it would corrupt relative paths like "../nvidia0" + // by anchoring them to the process CWD, producing wrong paths. std::string newname_resolved; auto rstatus = CTX->relativize_fd_path(newdfd, newname, newname_resolved, false); + switch(rstatus) { case gkfs::preload::RelativizeStatus::fd_unknown: + // Destination fd is not known to us — forward verbatim. return gsl::narrow_cast(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::external: + // Destination is outside the GekkoFS mountpoint — kernel handles. + // Pass oldname as-is (it's a literal symlink content, not a path). return gsl::narrow_cast(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::fd_not_a_dir: return -ENOTDIR; - case gkfs::preload::RelativizeStatus::internal: + case gkfs::preload::RelativizeStatus::internal: { + // Destination is in GekkoFS. Now we also need to inspect oldname. if(!gkfs::config::metadata::symlink_support) { LOG(WARNING, "{}() operation not supported", __func__); return -ENOTSUP; } - if(internal1) { // Parameters are inverted + std::string oldname_resolved; + if(CTX->relativize_path(oldname, oldname_resolved)) { + // Source is also in GekkoFS — create symlink inside GekkoFS. return with_errno(gkfs::syscall::gkfs_mk_symlink( newname_resolved, oldname_resolved)); } - LOG(WARNING, "{}() operation not supported", __func__); + LOG(WARNING, "{}() cross-mount symlink not supported", __func__); return -ENOTSUP; + } default: LOG(ERROR, "{}() relativize status unknown", __func__); @@ -631,6 +656,7 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { } } + int hook_flock(unsigned long fd, int flags) { LOG(ERROR, "{}() called flock (Not Supported) with fd '{}' flags '{}'", @@ -1504,20 +1530,45 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); + // MAP_ANONYMOUS mappings are not backed by any file. The kernel ignores + // the fd argument, but CUDA drivers commonly pass fd=0 (stdin). Checking + // file_map()->get(0) can incorrectly route CUDA's GPU memory maps through + // gkfs_mmap, causing cudaErrorDevicesUnavailable with syscall interception. + if(flags & MAP_ANONYMOUS) { + return reinterpret_cast(syscall_no_intercept_wrapper( + SYS_mmap, addr, length, prot, flags, fd, offset)); + } + if(auto file = CTX->file_map()->get(fd)) { - LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, - file->path()); + // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per tracked + // file. If that no longer holds, this entry is stale (likely fd reuse), + // so forward to the kernel. + if(!CTX->range_fd() && !CTX->protect_fds() && + !kernel_fd_targets_dev_null(fd)) { + return reinterpret_cast(syscall_no_intercept_wrapper( + SYS_mmap, addr, length, prot, flags, fd, offset)); + } + return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } + int hook_munmap(void* addr, size_t length) { LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__, fmt::ptr(addr), length); + // Only invoke gkfs_munmap for addresses we actually track. + // Bypassing for all other addresses avoids stalling unrelated munmap + // callers (e.g. CUDA's multi-threaded init) on mmap_mtx under + // syscall-level interception. + if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { + return syscall_no_intercept_wrapper(SYS_munmap, addr, length); + } + int res = gkfs::syscall::gkfs_munmap(addr, length); if(res == 1) { return 0; @@ -1532,6 +1583,11 @@ hook_msync(void* addr, size_t length, int flags) { LOG(DEBUG, "{}() called with addr '{}' length '{}' flags '{}'", __func__, fmt::ptr(addr), length, flags); + // Same fast-path: skip gkfs entirely for non-GekkoFS mappings. + if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { + return syscall_no_intercept_wrapper(SYS_msync, addr, length, flags); + } + int res = gkfs::syscall::gkfs_msync(addr, length, flags); if(res == 1) { return 0; diff --git a/src/client/intercept.cpp b/src/client/intercept.cpp index 0cf78d905f677ca5fac1b6c336f0dc7c114ca913..249212212676bf6a2fbb6b41fb4d48592c4a269e 100644 --- a/src/client/intercept.cpp +++ b/src/client/intercept.cpp @@ -589,20 +589,26 @@ hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, break; #ifdef SYS_close_range case SYS_close_range: { - auto fds = get_open_fds(); - for(auto fd : fds) { - if(fd < static_cast(arg0) || fd > static_cast(arg1)) - continue; - if(CTX->file_map()->exist(fd)) { - gkfs::syscall::gkfs_close(fd); - } else - close(fd); - *result = 0; + auto first = static_cast(arg0); + auto last = static_cast(arg1); + auto flags = static_cast(arg2); + + // Close any GekkoFS virtual fds in the range ourselves. + // Iterate only over our tracked fds to avoid touching native fds. + auto to_close = CTX->file_map()->get_range(first, last); + for(auto fd : to_close) { + gkfs::syscall::gkfs_close(fd); } - } - *result = 0; + + // Forward the range to the kernel for all native fds. + // The kernel handles O_CLOEXEC semantics and won't touch our + // internal fds (which live in the upper fd range). + *result = syscall_no_intercept_wrapper(SYS_close_range, first, last, + flags); break; + } #endif // SYS_close_range + #ifdef SYS_stat case SYS_stat: *result = @@ -1022,7 +1028,7 @@ hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, *result = reinterpret_cast(gkfs::hook::hook_mmap( reinterpret_cast(arg0), static_cast(arg1), static_cast(arg2), static_cast(arg3), - static_cast(arg4), static_cast(arg5))); + static_cast(arg4), static_cast(arg5))); break; case SYS_msync: *result = gkfs::hook::hook_msync(reinterpret_cast(arg0), diff --git a/src/client/open_file_map.cpp b/src/client/open_file_map.cpp index 4dee59b5d7629ea4e2e041584db7ea8e2b9527a4..ebc4d86ca78191eb8f056b0ea0455f27e90b2f8b 100644 --- a/src/client/open_file_map.cpp +++ b/src/client/open_file_map.cpp @@ -316,4 +316,17 @@ OpenFileMap::get_fd_idx() { return fd_idx; } +std::vector +OpenFileMap::get_range(unsigned int first, unsigned int last) { + std::lock_guard lock(files_mutex_); + std::vector result; + // files_ is a sorted std::map, so lower_bound gives us an efficient start + auto it = files_.lower_bound(static_cast(first)); + while(it != files_.end() && static_cast(it->first) <= last) { + result.push_back(it->first); + ++it; + } + return result; +} + } // namespace gkfs::filemap \ No newline at end of file