Loading CHANGELOG.md +2 −0 Original line number Diff line number Diff line Loading @@ -45,6 +45,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - We cannot use lstat directly as it may cause a recursive call on libc interception. - Un/Packing order of directory entries in compressed format was incorrect ([!281](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/281)) - Fix pytorch mmap ([!291](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/291)) - Fix cuda in syscall ([!292](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/292)) - mmap and dangling fd issues ## [0.9.5] - 2025-08 Loading include/client/gkfs_functions.hpp +6 −0 Original line number Diff line number Diff line Loading @@ -193,6 +193,12 @@ gkfs_munmap(void* addr, size_t length); int gkfs_msync(void* addr, size_t length, int flags); // Returns true if addr is currently registered as a GekkoFS mmap mapping. // Cheap shared-lock read; used by hooks to skip gkfs_munmap/gkfs_msync // for non-GekkoFS addresses (e.g. CUDA GPU memory mappings). bool gkfs_mmap_is_tracked(void* addr); } // namespace gkfs::syscall Loading include/client/open_file_map.hpp +5 −0 Original line number Diff line number Diff line Loading @@ -46,6 +46,8 @@ #include <atomic> #include <array> #include <string> #include <vector> namespace gkfs::filemap { Loading Loading @@ -189,6 +191,9 @@ public: int get_fd_idx(); std::vector<int> get_range(unsigned int first, unsigned int last); }; } // namespace gkfs::filemap Loading src/client/gkfs_functions.cpp +15 −4 Original line number Diff line number Diff line Loading @@ -53,6 +53,7 @@ #include <string> #include <string_view> #include <mutex> #include <shared_mutex> #include <set> #include <tuple> #include <thread> Loading Loading @@ -96,11 +97,21 @@ struct MmapEntry { int flags; }; std::mutex mmap_mtx; // Use shared_mutex so that gkfs_mmap_is_tracked() only takes a shared // (reader) lock and concurrent lookups do not serialize on each other.
std::shared_mutex mmap_mtx; std::unordered_map<void*, MmapEntry> mmap_registry; } // namespace bool gkfs_mmap_is_tracked(void* addr) { // Shared (read-only) lock: concurrent is_tracked queries run in parallel; // it still waits for any mmap/msync/munmap holding the exclusive lock. std::shared_lock<std::shared_mutex> lock(mmap_mtx); return mmap_registry.find(addr) != mmap_registry.end(); } void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { Loading Loading @@ -143,7 +154,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Re-acquire lock to insert mapping { std::lock_guard<std::mutex> lock(mmap_mtx); std::unique_lock<std::shared_mutex> lock(mmap_mtx); mmap_registry[ptr] = {ptr, length, path, offset, prot, flags}; } Loading @@ -164,7 +175,7 @@ gkfs_msync(void* addr, size_t length, int flags) { bool do_writeback = false; { std::lock_guard<std::mutex> lock(mmap_mtx); std::unique_lock<std::shared_mutex> lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; Loading Loading @@ -210,7 +221,7 @@ gkfs_munmap(void* addr, size_t length) { bool do_writeback = false; { std::lock_guard<std::mutex> lock(mmap_mtx); std::unique_lock<std::shared_mutex> lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; Loading src/client/hooks.cpp +71 −15 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ #include <common/path_util.hpp> #include <algorithm> #include <cstdio> #include <string> #include <vector> Loading @@ -71,6 +72,27 @@ with_errno(T ret) { constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; bool kernel_fd_targets_dev_null(int fd) { char proc_fd_path[64]; const auto path_len = std::snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", fd); if(path_len <= 0 || path_len >= static_cast<int>(sizeof(proc_fd_path))) { return false; } char target_path[256]; const auto nread = 
syscall_no_intercept_wrapper(SYS_readlinkat, AT_FDCWD, proc_fd_path, target_path, sizeof(target_path) - 1); if(nread < 0) { return false; } target_path[nread] = '\0'; return std::string(target_path) == "/dev/null"; } ssize_t copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, size_t count) { Loading Loading @@ -588,42 +610,45 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { LOG(DEBUG, "{}() called with oldname: \"{}\", newfd: {}, newname: \"{}\"", __func__, oldname, newdfd, newname); bool internal1 = false; std::string oldname_resolved; if(CTX->relativize_path(oldname, oldname_resolved)) { if(!gkfs::config::metadata::symlink_support) { LOG(WARNING, "{}() operation not supported", __func__); return -ENOTSUP; } internal1 = true; } // First determine where the destination (newname) lives. // We MUST do this before touching oldname, because oldname is a literal // symlink content string — NOT a filesystem path to resolve. Calling // relativize_path() on it would corrupt relative paths like "../nvidia0" // by anchoring them to the process CWD, producing wrong paths. std::string newname_resolved; auto rstatus = CTX->relativize_fd_path(newdfd, newname, newname_resolved, false); switch(rstatus) { case gkfs::preload::RelativizeStatus::fd_unknown: // Destination fd is not known to us — forward verbatim. return gsl::narrow_cast<int>(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::external: // Destination is outside the GekkoFS mountpoint — kernel handles. // Pass oldname as-is (it's a literal symlink content, not a path). return gsl::narrow_cast<int>(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::fd_not_a_dir: return -ENOTDIR; case gkfs::preload::RelativizeStatus::internal: case gkfs::preload::RelativizeStatus::internal: { // Destination is in GekkoFS. Now we also need to inspect oldname. 
if(!gkfs::config::metadata::symlink_support) { LOG(WARNING, "{}() operation not supported", __func__); return -ENOTSUP; } if(internal1) { // Parameters are inverted std::string oldname_resolved; if(CTX->relativize_path(oldname, oldname_resolved)) { // Source is also in GekkoFS — create symlink inside GekkoFS. return with_errno(gkfs::syscall::gkfs_mk_symlink( newname_resolved, oldname_resolved)); } LOG(WARNING, "{}() operation not supported", __func__); LOG(WARNING, "{}() cross-mount symlink not supported", __func__); return -ENOTSUP; } default: LOG(ERROR, "{}() relativize status unknown", __func__); Loading @@ -631,6 +656,7 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { } } int hook_flock(unsigned long fd, int flags) { LOG(ERROR, "{}() called flock (Not Supported) with fd '{}' flags '{}'", Loading Loading @@ -1504,20 +1530,45 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); // MAP_ANONYMOUS mappings are not backed by any file. The kernel ignores // the fd argument, but CUDA drivers commonly pass fd=0 (stdin). Checking // file_map()->get(0) can incorrectly route CUDA's GPU memory maps through // gkfs_mmap, causing cudaErrorDevicesUnavailable with syscall interception. if(flags & MAP_ANONYMOUS) { return reinterpret_cast<void*>(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } if(auto file = CTX->file_map()->get(fd)) { LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, file->path()); // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per tracked // file. If that no longer holds, this entry is stale (likely fd reuse), // so forward to the kernel. 
if(!CTX->range_fd() && !CTX->protect_fds() && !kernel_fd_targets_dev_null(fd)) { return reinterpret_cast<void*>(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast<void*>(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } int hook_munmap(void* addr, size_t length) { LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__, fmt::ptr(addr), length); // Only invoke gkfs_munmap for addresses we actually track. // Bypassing for all other addresses avoids stalling unrelated munmap // callers (e.g. CUDA's multi-threaded init) on mmap_mtx under // syscall-level interception. if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { return syscall_no_intercept_wrapper(SYS_munmap, addr, length); } int res = gkfs::syscall::gkfs_munmap(addr, length); if(res == 1) { return 0; Loading @@ -1532,6 +1583,11 @@ hook_msync(void* addr, size_t length, int flags) { LOG(DEBUG, "{}() called with addr '{}' length '{}' flags '{}'", __func__, fmt::ptr(addr), length, flags); // Same fast-path: skip gkfs entirely for non-GekkoFS mappings. if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { return syscall_no_intercept_wrapper(SYS_msync, addr, length, flags); } int res = gkfs::syscall::gkfs_msync(addr, length, flags); if(res == 1) { return 0; Loading Loading
CHANGELOG.md +2 −0 Original line number Diff line number Diff line Loading @@ -45,6 +45,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - We cannot use lstat directly as it may cause a recursive call on libc interception. - Un/Packing order of directory entries in compressed format was incorrect ([!281](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/281)) - Fix pytorch mmap ([!291](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/291)) - Fix cuda in syscall ([!292](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/292)) - mmap and dangling fd issues ## [0.9.5] - 2025-08 Loading
include/client/gkfs_functions.hpp +6 −0 Original line number Diff line number Diff line Loading @@ -193,6 +193,12 @@ gkfs_munmap(void* addr, size_t length); int gkfs_msync(void* addr, size_t length, int flags); // Returns true if addr is currently registered as a GekkoFS mmap mapping. // Cheap shared-lock read; used by hooks to skip gkfs_munmap/gkfs_msync // for non-GekkoFS addresses (e.g. CUDA GPU memory mappings). bool gkfs_mmap_is_tracked(void* addr); } // namespace gkfs::syscall Loading
include/client/open_file_map.hpp +5 −0 Original line number Diff line number Diff line Loading @@ -46,6 +46,8 @@ #include <atomic> #include <array> #include <string> #include <vector> namespace gkfs::filemap { Loading Loading @@ -189,6 +191,9 @@ public: int get_fd_idx(); std::vector<int> get_range(unsigned int first, unsigned int last); }; } // namespace gkfs::filemap Loading
src/client/gkfs_functions.cpp +15 −4 Original line number Diff line number Diff line Loading @@ -53,6 +53,7 @@ #include <string> #include <string_view> #include <mutex> #include <shared_mutex> #include <set> #include <tuple> #include <thread> Loading Loading @@ -96,11 +97,21 @@ struct MmapEntry { int flags; }; std::mutex mmap_mtx; // Use shared_mutex so that gkfs_mmap_is_tracked() only takes a shared // (reader) lock and concurrent lookups do not serialize on each other. std::shared_mutex mmap_mtx; std::unordered_map<void*, MmapEntry> mmap_registry; } // namespace bool gkfs_mmap_is_tracked(void* addr) { // Shared (read-only) lock: concurrent is_tracked queries run in parallel; // it still waits for any mmap/msync/munmap holding the exclusive lock. std::shared_lock<std::shared_mutex> lock(mmap_mtx); return mmap_registry.find(addr) != mmap_registry.end(); } void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { Loading Loading @@ -143,7 +154,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Re-acquire lock to insert mapping { std::lock_guard<std::mutex> lock(mmap_mtx); std::unique_lock<std::shared_mutex> lock(mmap_mtx); mmap_registry[ptr] = {ptr, length, path, offset, prot, flags}; } Loading @@ -164,7 +175,7 @@ gkfs_msync(void* addr, size_t length, int flags) { bool do_writeback = false; { std::lock_guard<std::mutex> lock(mmap_mtx); std::unique_lock<std::shared_mutex> lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; Loading Loading @@ -210,7 +221,7 @@ gkfs_munmap(void* addr, size_t length) { bool do_writeback = false; { std::lock_guard<std::mutex> lock(mmap_mtx); std::unique_lock<std::shared_mutex> lock(mmap_mtx); auto it = mmap_registry.find(addr); if(it != mmap_registry.end()) { auto& entry = it->second; Loading
src/client/hooks.cpp +71 −15 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ #include <common/path_util.hpp> #include <algorithm> #include <cstdio> #include <string> #include <vector> Loading @@ -71,6 +72,27 @@ with_errno(T ret) { constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; bool kernel_fd_targets_dev_null(int fd) { char proc_fd_path[64]; const auto path_len = std::snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", fd); if(path_len <= 0 || path_len >= static_cast<int>(sizeof(proc_fd_path))) { return false; } char target_path[256]; const auto nread = syscall_no_intercept_wrapper(SYS_readlinkat, AT_FDCWD, proc_fd_path, target_path, sizeof(target_path) - 1); if(nread < 0) { return false; } target_path[nread] = '\0'; return std::string(target_path) == "/dev/null"; } ssize_t copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, size_t count) { Loading Loading @@ -588,42 +610,45 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { LOG(DEBUG, "{}() called with oldname: \"{}\", newfd: {}, newname: \"{}\"", __func__, oldname, newdfd, newname); bool internal1 = false; std::string oldname_resolved; if(CTX->relativize_path(oldname, oldname_resolved)) { if(!gkfs::config::metadata::symlink_support) { LOG(WARNING, "{}() operation not supported", __func__); return -ENOTSUP; } internal1 = true; } // First determine where the destination (newname) lives. // We MUST do this before touching oldname, because oldname is a literal // symlink content string — NOT a filesystem path to resolve. Calling // relativize_path() on it would corrupt relative paths like "../nvidia0" // by anchoring them to the process CWD, producing wrong paths. std::string newname_resolved; auto rstatus = CTX->relativize_fd_path(newdfd, newname, newname_resolved, false); switch(rstatus) { case gkfs::preload::RelativizeStatus::fd_unknown: // Destination fd is not known to us — forward verbatim. 
return gsl::narrow_cast<int>(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::external: // Destination is outside the GekkoFS mountpoint — kernel handles. // Pass oldname as-is (it's a literal symlink content, not a path). return gsl::narrow_cast<int>(syscall_no_intercept_wrapper( SYS_symlinkat, oldname, newdfd, newname)); case gkfs::preload::RelativizeStatus::fd_not_a_dir: return -ENOTDIR; case gkfs::preload::RelativizeStatus::internal: case gkfs::preload::RelativizeStatus::internal: { // Destination is in GekkoFS. Now we also need to inspect oldname. if(!gkfs::config::metadata::symlink_support) { LOG(WARNING, "{}() operation not supported", __func__); return -ENOTSUP; } if(internal1) { // Parameters are inverted std::string oldname_resolved; if(CTX->relativize_path(oldname, oldname_resolved)) { // Source is also in GekkoFS — create symlink inside GekkoFS. return with_errno(gkfs::syscall::gkfs_mk_symlink( newname_resolved, oldname_resolved)); } LOG(WARNING, "{}() operation not supported", __func__); LOG(WARNING, "{}() cross-mount symlink not supported", __func__); return -ENOTSUP; } default: LOG(ERROR, "{}() relativize status unknown", __func__); Loading @@ -631,6 +656,7 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) { } } int hook_flock(unsigned long fd, int flags) { LOG(ERROR, "{}() called flock (Not Supported) with fd '{}' flags '{}'", Loading Loading @@ -1504,20 +1530,45 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); // MAP_ANONYMOUS mappings are not backed by any file. The kernel ignores // the fd argument, but CUDA drivers commonly pass fd=0 (stdin). Checking // file_map()->get(0) can incorrectly route CUDA's GPU memory maps through // gkfs_mmap, causing cudaErrorDevicesUnavailable with syscall interception. 
if(flags & MAP_ANONYMOUS) { return reinterpret_cast<void*>(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } if(auto file = CTX->file_map()->get(fd)) { LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, file->path()); // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per tracked // file. If that no longer holds, this entry is stale (likely fd reuse), // so forward to the kernel. if(!CTX->range_fd() && !CTX->protect_fds() && !kernel_fd_targets_dev_null(fd)) { return reinterpret_cast<void*>(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast<void*>(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); } int hook_munmap(void* addr, size_t length) { LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__, fmt::ptr(addr), length); // Only invoke gkfs_munmap for addresses we actually track. // Bypassing for all other addresses avoids stalling unrelated munmap // callers (e.g. CUDA's multi-threaded init) on mmap_mtx under // syscall-level interception. if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { return syscall_no_intercept_wrapper(SYS_munmap, addr, length); } int res = gkfs::syscall::gkfs_munmap(addr, length); if(res == 1) { return 0; Loading @@ -1532,6 +1583,11 @@ hook_msync(void* addr, size_t length, int flags) { LOG(DEBUG, "{}() called with addr '{}' length '{}' flags '{}'", __func__, fmt::ptr(addr), length, flags); // Same fast-path: skip gkfs entirely for non-GekkoFS mappings. if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) { return syscall_no_intercept_wrapper(SYS_msync, addr, length, flags); } int res = gkfs::syscall::gkfs_msync(addr, length, flags); if(res == 1) { return 0; Loading