From 1668ac3f460527a48f901fc55e62895614fb9526 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 17:48:55 +0100 Subject: [PATCH 1/8] fix cuda with syscall --- include/client/gkfs_functions.hpp | 6 ++++++ src/client/gkfs_functions.cpp | 19 +++++++++++++++---- src/client/hooks.cpp | 13 +++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index cdfe357c9..b70ace750 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -193,6 +193,12 @@ gkfs_munmap(void* addr, size_t length); int gkfs_msync(void* addr, size_t length, int flags); +// Returns true if addr is currently registered as a GekkoFS mmap mapping. +// Cheap shared-lock read; used by hooks to skip gkfs_munmap/gkfs_msync +// for non-GekkoFS addresses (e.g. CUDA GPU memory mappings). +bool +gkfs_mmap_is_tracked(void* addr); + } // namespace gkfs::syscall diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 8df851d28..c747f70af 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -96,11 +97,21 @@ struct MmapEntry { int flags; }; -std::mutex mmap_mtx; +// Use shared_mutex so that gkfs_mmap_is_tracked() can do a lock-free +// quick-check without blocking concurrent mmap/munmap operations. +std::shared_mutex mmap_mtx; std::unordered_map mmap_registry; } // namespace +bool +gkfs_mmap_is_tracked(void* addr) { + // Shared (read-only) lock: does not block concurrent is_tracked queries + // or mmap registrations that don't call munmap. + std::shared_lock lock(mmap_mtx); + return mmap_registry.find(addr) != mmap_registry.end(); +} + void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { @@ -143,7 +154,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Re-acquire lock to insert mapping { - std::lock_guard lock(mmap_mtx); + std::unique_lock lock(mmap_mtx); mmap_registry[ptr] = {ptr, length, path, offset, prot, flags}; } @@ -164,7 +175,7 @@ gkfs_msync(void* addr, size_t length, int flags) { bool do_writeback = false; { - std::lock_guard lock(mmap_mtx); + std::unique_lock lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; @@ -210,7 +221,7 @@ gkfs_munmap(void* addr, size_t length) { bool do_writeback = false; { - std::lock_guard lock(mmap_mtx); + std::unique_lock lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index b6f37e798..6a59673f6 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -1518,6 +1518,14 @@ hook_munmap(void* addr, size_t length) { LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__, fmt::ptr(addr), length); + // Only invoke gkfs_munmap for addresses we actually track. + // Bypassing for all other addresses avoids stalling unrelated munmap + // callers (e.g. CUDA's multi-threaded init) on mmap_mtx under + // syscall-level interception. + if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { + return syscall_no_intercept_wrapper(SYS_munmap, addr, length); + } + int res = gkfs::syscall::gkfs_munmap(addr, length); if(res == 1) { return 0; @@ -1532,6 +1540,11 @@ hook_msync(void* addr, size_t length, int flags) { LOG(DEBUG, "{}() called with addr '{}' length '{}' flags '{}'", __func__, fmt::ptr(addr), length, flags); + // Same fast-path: skip gkfs entirely for non-GekkoFS mappings. + if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { + return syscall_no_intercept_wrapper(SYS_msync, addr, length, flags); + } + int res = gkfs::syscall::gkfs_msync(addr, length, flags); if(res == 1) { return 0; -- GitLab From 71f3c67bd20460708353944d157ab1450eb73a40 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 20:51:49 +0100 Subject: [PATCH 2/8] fix cuda mmap --- src/client/hooks.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 6a59673f6..495ccc391 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -1504,6 +1504,15 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); + // MAP_ANONYMOUS mappings are not backed by any file. The kernel ignores + // the fd argument, but CUDA drivers commonly pass fd=0 (stdin). Checking + // file_map()->get(0) can incorrectly route CUDA's GPU memory maps through + // gkfs_mmap, causing cudaErrorDevicesUnavailable with syscall interception. + if(flags & MAP_ANONYMOUS) { + return reinterpret_cast(syscall_no_intercept_wrapper( + SYS_mmap, addr, length, prot, flags, fd, offset)); + } + if(auto file = CTX->file_map()->get(fd)) { LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, file->path()); @@ -1513,6 +1522,7 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, SYS_mmap, addr, length, prot, flags, fd, offset)); } + int hook_munmap(void* addr, size_t length) { LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__, -- GitLab From 63794024158eabce3b8f01c667d6cf325ccae06f Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Wed, 4 Mar 2026 07:04:16 +0100 Subject: [PATCH 3/8] filter mmap for cuda --- src/client/hooks.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 495ccc391..486524dcc 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -1514,9 +1514,22 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, } if(auto file = CTX->file_map()->get(fd)) { - LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, - file->path()); - return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); + // Only route to gkfs_mmap if the path is genuinely under the GekkoFS + // mountpoint. fd reuse can cause CUDA device fds (e.g. /dev/nvidia-uvm) + // to match stale file_map entries, making gkfs_mmap return EINVAL for + // MAP_FIXED device mappings and triggering cudaErrorDevicesUnavailable. + const auto& path = file->path(); + const auto& mountdir = CTX->mountdir(); + if(path.size() >= mountdir.size() && + path.compare(0, mountdir.size(), mountdir) == 0) { + LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, + path); + return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, + offset); + } + LOG(DEBUG, + "{}() fd {} path '{}' outside mountdir, forwarding to kernel", + __func__, fd, path); } return reinterpret_cast(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); -- GitLab From 2b84b4c5126fc16102fc4efd7b041219f978923e Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Wed, 4 Mar 2026 12:36:24 +0100 Subject: [PATCH 4/8] fix close range --- include/client/open_file_map.hpp | 5 +++++ src/client/intercept.cpp | 28 +++++++++++++++++----------- src/client/open_file_map.cpp | 13 +++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/include/client/open_file_map.hpp b/include/client/open_file_map.hpp index e4b6bd609..15c4245a9 100644 --- a/include/client/open_file_map.hpp +++ b/include/client/open_file_map.hpp @@ -46,6 +46,8 @@ #include #include #include +#include + namespace gkfs::filemap { @@ -189,6 +191,9 @@ public: int get_fd_idx(); + + std::vector + get_range(unsigned int first, unsigned int last); }; } // namespace gkfs::filemap diff --git a/src/client/intercept.cpp b/src/client/intercept.cpp index 0cf78d905..7a1347cfd 100644 --- a/src/client/intercept.cpp +++ b/src/client/intercept.cpp @@ -589,20 +589,26 @@ hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, break; #ifdef SYS_close_range case SYS_close_range: { - auto fds = get_open_fds(); - for(auto fd : fds) { - if(fd < static_cast(arg0) || fd > static_cast(arg1)) - continue; - if(CTX->file_map()->exist(fd)) { - gkfs::syscall::gkfs_close(fd); - } else - close(fd); - *result = 0; + auto first = static_cast(arg0); + auto last = static_cast(arg1); + auto flags = static_cast(arg2); + + // Close any GekkoFS virtual fds in the range ourselves. + // Iterate only over our tracked fds to avoid touching native fds. + auto to_close = CTX->file_map()->get_range(first, last); + for(auto fd : to_close) { + gkfs::syscall::gkfs_close(fd); } - } - *result = 0; + + // Forward the range to the kernel for all native fds. + // The kernel handles O_CLOEXEC semantics and won't touch our + // internal fds (which live in the upper fd range). + *result = syscall_no_intercept_wrapper(SYS_close_range, first, last, + flags); break; + } #endif // SYS_close_range + #ifdef SYS_stat case SYS_stat: *result = diff --git a/src/client/open_file_map.cpp b/src/client/open_file_map.cpp index 4dee59b5d..ebc4d86ca 100644 --- a/src/client/open_file_map.cpp +++ b/src/client/open_file_map.cpp @@ -316,4 +316,17 @@ OpenFileMap::get_fd_idx() { return fd_idx; } +std::vector +OpenFileMap::get_range(unsigned int first, unsigned int last) { + std::lock_guard lock(files_mutex_); + std::vector result; + // files_ is a sorted std::map, so lower_bound gives us an efficient start + auto it = files_.lower_bound(static_cast(first)); + while(it != files_.end() && static_cast(it->first) <= last) { + result.push_back(it->first); + ++it; + } + return result; +} + } // namespace gkfs::filemap \ No newline at end of file -- GitLab From 43eea03d4ec5a1e530edb75c5628782458f8c9d4 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Wed, 4 Mar 2026 14:34:11 +0100 Subject: [PATCH 5/8] simlink failure --- src/client/hooks.cpp | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 486524dcc..41d42f4b8 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -588,42 +588,45 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { LOG(DEBUG, "{}() called with oldname: \"{}\", newfd: {}, newname: \"{}\"", __func__, oldname, newdfd, newname); - bool internal1 = false; - std::string oldname_resolved; - if(CTX->relativize_path(oldname, oldname_resolved)) { - if(!gkfs::config::metadata::symlink_support) { - LOG(WARNING, "{}() operation not supported", __func__); - return -ENOTSUP; - } - internal1 = true; - } - + // First determine where the destination (newname) lives. + // We MUST do this before touching oldname, because oldname is a literal + // symlink content string — NOT a filesystem path to resolve. Calling + // relativize_path() on it would corrupt relative paths like "../nvidia0" + // by anchoring them to the process CWD, producing wrong paths. std::string newname_resolved; auto rstatus = CTX->relativize_fd_path(newdfd, newname, newname_resolved, false); + switch(rstatus) { case gkfs::preload::RelativizeStatus::fd_unknown: + // Destination fd is not known to us — forward verbatim. return gsl::narrow_cast(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::external: + // Destination is outside the GekkoFS mountpoint — kernel handles. + // Pass oldname as-is (it's a literal symlink content, not a path). return gsl::narrow_cast(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::fd_not_a_dir: return -ENOTDIR; - case gkfs::preload::RelativizeStatus::internal: + case gkfs::preload::RelativizeStatus::internal: { + // Destination is in GekkoFS. Now we also need to inspect oldname. if(!gkfs::config::metadata::symlink_support) { LOG(WARNING, "{}() operation not supported", __func__); return -ENOTSUP; } - if(internal1) { // Parameters are inverted + std::string oldname_resolved; + if(CTX->relativize_path(oldname, oldname_resolved)) { + // Source is also in GekkoFS — create symlink inside GekkoFS. return with_errno(gkfs::syscall::gkfs_mk_symlink( newname_resolved, oldname_resolved)); } - LOG(WARNING, "{}() operation not supported", __func__); + LOG(WARNING, "{}() cross-mount symlink not supported", __func__); return -ENOTSUP; + } default: LOG(ERROR, "{}() relativize status unknown", __func__); @@ -631,6 +634,7 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { } } + int hook_flock(unsigned long fd, int flags) { LOG(ERROR, "{}() called flock (Not Supported) with fd '{}' flags '{}'", -- GitLab From 611a2d413dfc31b2a582ad83cddb747263cb4cbb Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 5 Mar 2026 14:43:05 +0100 Subject: [PATCH 6/8] casting error --- src/client/intercept.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/intercept.cpp b/src/client/intercept.cpp index 7a1347cfd..249212212 100644 --- a/src/client/intercept.cpp +++ b/src/client/intercept.cpp @@ -1028,7 +1028,7 @@ hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, *result = reinterpret_cast(gkfs::hook::hook_mmap( reinterpret_cast(arg0), static_cast(arg1), static_cast(arg2), static_cast(arg3), - static_cast(arg4), static_cast(arg5))); + static_cast(arg4), static_cast(arg5))); break; case SYS_msync: *result = gkfs::hook::hook_msync(reinterpret_cast(arg0), -- GitLab From 5060cb7f6d036841aaf2cfc9834f4d7e1d6d3505 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 5 Mar 2026 14:49:59 +0100 Subject: [PATCH 7/8] stale fd --- src/client/hooks.cpp | 52 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 41d42f4b8..03f4d50bf 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -71,6 +72,28 @@ with_errno(T ret) { constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; +bool +kernel_fd_targets_dev_null(int fd) { + char proc_fd_path[64]; + const auto path_len = + std::snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", + fd); + if(path_len <= 0 || path_len >= static_cast(sizeof(proc_fd_path))) { + return false; + } + + char target_path[256]; + const auto nread = syscall_no_intercept_wrapper( + SYS_readlinkat, AT_FDCWD, proc_fd_path, target_path, + sizeof(target_path) - 1); + if(nread < 0) { + return false; + } + + target_path[nread] = '\0'; + return std::string(target_path) == "/dev/null"; +} + ssize_t copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, size_t count) { @@ -1518,22 +1541,21 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, } if(auto file = CTX->file_map()->get(fd)) { - // Only route to gkfs_mmap if the path is genuinely under the GekkoFS - // mountpoint. fd reuse can cause CUDA device fds (e.g. /dev/nvidia-uvm) - // to match stale file_map entries, making gkfs_mmap return EINVAL for - // MAP_FIXED device mappings and triggering cudaErrorDevicesUnavailable. - const auto& path = file->path(); - const auto& mountdir = CTX->mountdir(); - if(path.size() >= mountdir.size() && - path.compare(0, mountdir.size(), mountdir) == 0) { - LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, - path); - return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, - offset); + // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per + // tracked file. If the current kernel fd no longer points there, the + // file_map entry is stale (likely fd reuse), so forward to kernel. + if(!CTX->range_fd() && !CTX->protect_fds() && + !kernel_fd_targets_dev_null(fd)) { + LOG(DEBUG, + "{}() fd {} has stale file_map entry, forwarding to kernel", + __func__, fd); + return reinterpret_cast(syscall_no_intercept_wrapper( + SYS_mmap, addr, length, prot, flags, fd, offset)); } - LOG(DEBUG, - "{}() fd {} path '{}' outside mountdir, forwarding to kernel", - __func__, fd, path); + + LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, + file->path()); + return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); -- GitLab From 99c7f9607129a4b92ff0bb9c24bea773fedfb4f0 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 5 Mar 2026 16:07:52 +0100 Subject: [PATCH 8/8] final fix cuda with syscalls --- CHANGELOG.md | 2 ++ src/client/hooks.cpp | 22 ++++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec561aad5..32bcb3b2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - We cannot use lstat directly as may cause a recursion call on libc interception. - Un/Packing order of directory entries in compressed format was incorrect ([!281](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/281)) - Fix pytorch mmap ([!291](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/291)) + - Fix cuda in syscall ([!292](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/292)) + - mmap and dangling fd issues ## [0.9.5] - 2025-08 diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 03f4d50bf..f77059039 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -75,17 +75,16 @@ constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; bool kernel_fd_targets_dev_null(int fd) { char proc_fd_path[64]; - const auto path_len = - std::snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", - fd); + const auto path_len = std::snprintf(proc_fd_path, sizeof(proc_fd_path), + "/proc/self/fd/%d", fd); if(path_len <= 0 || path_len >= static_cast(sizeof(proc_fd_path))) { return false; } char target_path[256]; - const auto nread = syscall_no_intercept_wrapper( - SYS_readlinkat, AT_FDCWD, proc_fd_path, target_path, - sizeof(target_path) - 1); + const auto nread = + syscall_no_intercept_wrapper(SYS_readlinkat, AT_FDCWD, proc_fd_path, + target_path, sizeof(target_path) - 1); if(nread < 0) { return false; } @@ -1541,20 +1540,15 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, } if(auto file = CTX->file_map()->get(fd)) { - // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per - // tracked file. If the current kernel fd no longer points there, the - // file_map entry is stale (likely fd reuse), so forward to kernel. + // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per tracked + // file. If that no longer holds, this entry is stale (likely fd reuse), + // so forward to the kernel. if(!CTX->range_fd() && !CTX->protect_fds() && !kernel_fd_targets_dev_null(fd)) { - LOG(DEBUG, - "{}() fd {} has stale file_map entry, forwarding to kernel", - __func__, fd); return reinterpret_cast(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } - LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, - file->path()); return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast(syscall_no_intercept_wrapper( -- GitLab