From 57fb346cfd429b6e51fdc12aee169cc310ccad30 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 19:10:45 +0100 Subject: [PATCH 01/68] fix s3d mmap issue with libc --- src/client/gkfs_functions.cpp | 45 ++++++++++++++++++++++------------- src/client/gkfs_libc.cpp | 34 +++++++++++++++----------- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index a7f2b0822..09edceae6 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -75,11 +75,11 @@ using namespace std; namespace { -// set to store void * addr, fd, length and offset -// set to store void * addr, fd, length, offset, prot -// Protected by mmap_set_mutex for thread-safe access from parallel Python -// threads -std::set> mmap_set; +// Tracks active GekkoFS mmap regions. +// Stores (addr, path, length, offset, prot). +// path is captured at mmap() time so that munmap()/msync() can flush data +// back even after the originating fd has been closed (POSIX allows this). +std::set> mmap_set; std::mutex mmap_set_mutex; } // namespace @@ -93,6 +93,17 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, errno = EINVAL; return MAP_FAILED; } + + // Capture the file path *now*, while the fd is still open. + // s3d.x (and many Fortran programs) close the fd immediately after mmap; + // storing the path lets munmap/msync flush data back even when fd is gone. + auto gkfs_fd = CTX->file_map()->get(fd); + if(!gkfs_fd) { + errno = EBADF; + return MAP_FAILED; + } + std::string path = gkfs_fd->path(); + void* ptr = calloc(1, length); if(ptr == nullptr) { return MAP_FAILED; @@ -106,7 +117,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Register mapping under lock so concurrent threads don't race on mmap_set { std::lock_guard lock(mmap_set_mutex); - mmap_set.insert(std::make_tuple(ptr, fd, length, offset, prot)); + mmap_set.insert(std::make_tuple(ptr, path, length, offset, prot)); } return ptr; } @@ -122,12 +133,13 @@ gkfs_msync(void* addr, size_t length, int flags) { [addr](const auto& t) { return std::get<0>(t) == addr; }); if(it != mmap_set.end()) { - int fd = std::get<1>(*it); + const std::string& path = std::get<1>(*it); size_t map_length = std::get<2>(*it); // use stored length, not caller's off_t offset = std::get<3>(*it); int prot = std::get<4>(*it); if(prot & PROT_WRITE) { - gkfs::syscall::gkfs_pwrite(fd, addr, map_length, offset); + // Write via path so this works even when the fd is already closed. + gkfs::rpc::forward_write(path, addr, offset, map_length, 0); } return 0; } @@ -140,20 +152,21 @@ gkfs_munmap(void* addr, size_t length) { std::unique_lock lock(mmap_set_mutex); auto it = std::find_if( mmap_set.begin(), mmap_set.end(), - [&addr](const std::tuple& t) { - return std::get<0>(t) == addr; - }); + [&addr](const std::tuple& + t) { return std::get<0>(t) == addr; }); if(it != mmap_set.end()) { - int fd = std::get<1>(*it); + std::string path = std::get<1>(*it); size_t map_length = std::get<2>(*it); off_t offset = std::get<3>(*it); int prot = std::get<4>(*it); - // Flush dirty pages back before freeing - if(prot & PROT_WRITE) { - gkfs::syscall::gkfs_pwrite(fd, addr, map_length, offset); - } mmap_set.erase(it); lock.unlock(); // release lock before free to avoid holding it longer + // Flush dirty pages back before freeing. + // Use path-based write so this works even when the fd is already closed + // (s3d.x and many Fortran programs close the fd right after mmap). + if(prot & PROT_WRITE) { + gkfs::rpc::forward_write(path, addr, offset, map_length, 0); + } free(addr); return 0; } diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index b65c9b976..99f05476d 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -2510,32 +2510,38 @@ fcntl(int fd, int cmd, ...) { void* mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { gkfs_init_routine_placeholder(); - // If fd is GekkoFS fd, GekkoFS needs to provide mmap support. - // This is complex: requires GekkoFS to manage memory regions or map its - // data. - GKFS_OPERATION(mmap, addr, length, prot, flags, fd, - offset); // gkfs_mmap - GKFS_FALLBACK(mmap, addr, length, prot, flags, fd, offset); + // Route GekkoFS fds through gkfs_mmap (calloc + pread based). + GKFS_OPERATION(mmap, addr, length, prot, flags, fd, offset); + // For non-GekkoFS fds use the raw kernel syscall to avoid any possible + // infinite recursion through dlsym(RTLD_NEXT, "mmap") on glibc builds + // where mmap/mmap64 are weak aliases pointing back to our interposer. + void* ret = reinterpret_cast( + syscall(SYS_mmap, addr, length, prot, flags, fd, offset)); + if(ret == reinterpret_cast(-1)) { + return MAP_FAILED; + } + return ret; } // mmap64 is the large-file alias for mmap on Linux x86_64. // Python's built-in mmap module and NumPy memmap call this variant directly, // bypassing plain mmap(). Without this interceptor, GekkoFS fds fall through // to the kernel and return ENXIO. -// -// IMPORTANT: We intentionally fall back to dlsym_mmap (not dlsym_mmap64) for -// non-GekkoFS fds. On 64-bit Linux, mmap64 is implemented as an alias of mmap -// in glibc — dlsym(RTLD_NEXT, "mmap64") can therefore resolve back to our own -// mmap64 interposer, causing infinite recursion → stack overflow → SIGSEGV. #if defined(__USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE) || \ defined(__linux__) void* mmap64(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { gkfs_init_routine_placeholder(); - // Delegate to gkfs_mmap when fd belongs to GekkoFS, otherwise fallback. + // Route GekkoFS fds through gkfs_mmap. GKFS_OPERATION(mmap, addr, length, prot, flags, fd, offset); - // Fall back via the plain mmap dlsym wrapper — avoids infinite recursion. - GKFS_FALLBACK(mmap, addr, length, prot, flags, fd, offset); + // Same direct syscall fallback — mmap and mmap64 issue the same SYS_mmap + // on 64-bit Linux. Avoids the dlsym recursion risk entirely. + void* ret = reinterpret_cast( + syscall(SYS_mmap, addr, length, prot, flags, fd, offset)); + if(ret == reinterpret_cast(-1)) { + return MAP_FAILED; + } + return ret; } #endif -- GitLab From 8ffa733b8b7c93715eceba829aefa4f10e66d649 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 19:30:18 +0100 Subject: [PATCH 02/68] fix s3d --- src/client/gkfs_functions.cpp | 78 +++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 09edceae6..87b4be68e 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -76,10 +76,10 @@ using namespace std; namespace { // Tracks active GekkoFS mmap regions. -// Stores (addr, path, length, offset, prot). +// Stores (addr, fd, path, length, offset, prot). // path is captured at mmap() time so that munmap()/msync() can flush data // back even after the originating fd has been closed (POSIX allows this). -std::set> mmap_set; +std::set> mmap_set; std::mutex mmap_set_mutex; } // namespace @@ -108,17 +108,17 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, if(ptr == nullptr) { return MAP_FAILED; } - // Pre-populate the buffer from the GekkoFS file (read-write mapping) - auto ret = gkfs::syscall::gkfs_pread(fd, ptr, length, offset); - if(ret == -1) { - free(ptr); - return MAP_FAILED; - } + // Register mapping under lock so concurrent threads don't race on mmap_set { std::lock_guard lock(mmap_set_mutex); - mmap_set.insert(std::make_tuple(ptr, path, length, offset, prot)); + mmap_set.insert(std::make_tuple(ptr, fd, path, length, offset, prot)); } + + // Pre-populate the buffer from the GekkoFS file (read-write mapping) + // Treat pread failure as no-data, not fatal. + gkfs::syscall::gkfs_pread(fd, ptr, length, offset); + return ptr; } @@ -133,13 +133,26 @@ gkfs_msync(void* addr, size_t length, int flags) { [addr](const auto& t) { return std::get<0>(t) == addr; }); if(it != mmap_set.end()) { - const std::string& path = std::get<1>(*it); - size_t map_length = std::get<2>(*it); // use stored length, not caller's - off_t offset = std::get<3>(*it); - int prot = std::get<4>(*it); + int fd = std::get<1>(*it); + const std::string& path = std::get<2>(*it); + size_t map_length = std::get<3>(*it); // use stored length, not caller's + off_t offset = std::get<4>(*it); + int prot = std::get<5>(*it); if(prot & PROT_WRITE) { - // Write via path so this works even when the fd is already closed. - gkfs::rpc::forward_write(path, addr, offset, map_length, 0); + auto gkfs_fd = CTX->file_map()->get(fd); + if(gkfs_fd) { + // fd is still valid, use gkfs_pwrite + gkfs::syscall::gkfs_pwrite(fd, addr, map_length, offset); + } else { + // fd is no longer valid (e.g. s3d.x closes fd before munmap). + // Write data directly and update the size metadata. + auto [werr, wsize] = gkfs::rpc::forward_write( + path, addr, offset, map_length, 0); + if(!werr) { + gkfs::utils::update_file_size(path, map_length, offset, + false, false); + } + } } return 0; } @@ -150,22 +163,35 @@ gkfs_msync(void* addr, size_t length, int flags) { int gkfs_munmap(void* addr, size_t length) { std::unique_lock lock(mmap_set_mutex); - auto it = std::find_if( - mmap_set.begin(), mmap_set.end(), - [&addr](const std::tuple& - t) { return std::get<0>(t) == addr; }); + auto it = std::find_if(mmap_set.begin(), mmap_set.end(), + [&addr](const std::tuple& t) { + return std::get<0>(t) == addr; + }); if(it != mmap_set.end()) { - std::string path = std::get<1>(*it); - size_t map_length = std::get<2>(*it); - off_t offset = std::get<3>(*it); - int prot = std::get<4>(*it); + int fd = std::get<1>(*it); + std::string path = std::get<2>(*it); + size_t map_length = std::get<3>(*it); + off_t offset = std::get<4>(*it); + int prot = std::get<5>(*it); mmap_set.erase(it); lock.unlock(); // release lock before free to avoid holding it longer // Flush dirty pages back before freeing. - // Use path-based write so this works even when the fd is already closed - // (s3d.x and many Fortran programs close the fd right after mmap). if(prot & PROT_WRITE) { - gkfs::rpc::forward_write(path, addr, offset, map_length, 0); + auto gkfs_fd = CTX->file_map()->get(fd); + if(gkfs_fd) { + // fd is still valid, use gkfs_pwrite + gkfs::syscall::gkfs_pwrite(fd, addr, map_length, offset); + } else { + // fd is no longer valid (e.g. s3d.x closes fd before munmap). + // Write data directly and update the size metadata. + auto [werr, wsize] = gkfs::rpc::forward_write( + path, addr, offset, map_length, 0); + if(!werr) { + gkfs::utils::update_file_size(path, map_length, offset, + false, false); + } + } } free(addr); return 0; -- GitLab From 5bb254b8c01a5f8ba9b1d1c580def9763144b268 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 19:52:22 +0100 Subject: [PATCH 03/68] fix s3d --- src/client/gkfs_functions.cpp | 44 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 87b4be68e..711f9019f 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -75,12 +75,24 @@ using namespace std; namespace { +using mmap_entry = std::tuple; + // Tracks active GekkoFS mmap regions. // Stores (addr, fd, path, length, offset, prot). // path is captured at mmap() time so that munmap()/msync() can flush data // back even after the originating fd has been closed (POSIX allows this). -std::set> mmap_set; -std::mutex mmap_set_mutex; + +std::set& +get_mmap_set() { + static std::set mmap_set; + return mmap_set; +} + +std::mutex& +get_mmap_set_mutex() { + static std::mutex mmap_set_mutex; + return mmap_set_mutex; +} } // namespace @@ -111,8 +123,9 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Register mapping under lock so concurrent threads don't race on mmap_set { - std::lock_guard lock(mmap_set_mutex); - mmap_set.insert(std::make_tuple(ptr, fd, path, length, offset, prot)); + std::lock_guard lock(get_mmap_set_mutex()); + get_mmap_set().insert( + std::make_tuple(ptr, fd, path, length, offset, prot)); } // Pre-populate the buffer from the GekkoFS file (read-write mapping) @@ -125,19 +138,20 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, int // cppcheck-suppress constParameterPointer gkfs_msync(void* addr, size_t length, int flags) { - std::lock_guard lock(mmap_set_mutex); + std::unique_lock lock(get_mmap_set_mutex()); // Find by start address; msync may pass a sub-range length so we use the // full mapping length stored in mmap_set. auto it = std::find_if( - mmap_set.begin(), mmap_set.end(), + get_mmap_set().begin(), get_mmap_set().end(), [addr](const auto& t) { return std::get<0>(t) == addr; }); - if(it != mmap_set.end()) { + if(it != get_mmap_set().end()) { int fd = std::get<1>(*it); - const std::string& path = std::get<2>(*it); + const std::string path = std::get<2>(*it); size_t map_length = std::get<3>(*it); // use stored length, not caller's off_t offset = std::get<4>(*it); int prot = std::get<5>(*it); + lock.unlock(); // Release lock before I/O if(prot & PROT_WRITE) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd) { @@ -162,19 +176,17 @@ gkfs_msync(void* addr, size_t length, int flags) { int gkfs_munmap(void* addr, size_t length) { - std::unique_lock lock(mmap_set_mutex); - auto it = std::find_if(mmap_set.begin(), mmap_set.end(), - [&addr](const std::tuple& t) { - return std::get<0>(t) == addr; - }); - if(it != mmap_set.end()) { + std::unique_lock lock(get_mmap_set_mutex()); + auto it = std::find_if( + get_mmap_set().begin(), get_mmap_set().end(), + [&addr](const mmap_entry& t) { return std::get<0>(t) == addr; }); + if(it != get_mmap_set().end()) { int fd = std::get<1>(*it); std::string path = std::get<2>(*it); size_t map_length = std::get<3>(*it); off_t offset = std::get<4>(*it); int prot = std::get<5>(*it); - mmap_set.erase(it); + get_mmap_set().erase(it); lock.unlock(); // release lock before free to avoid holding it longer // Flush dirty pages back before freeing. if(prot & PROT_WRITE) { -- GitLab From ce84b006167f0fde2662cc31134b30e133000981 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 20:59:58 +0100 Subject: [PATCH 04/68] fix2 --- src/client/gkfs_libc.cpp | 26 +++++++++++++++++--- src/client/hooks.cpp | 33 ++++++++++++++++++------- src/client/preload_context.cpp | 45 +++++++++++++++++++++++----------- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 99f05476d..3e36ba024 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -283,13 +283,26 @@ log_arguments(const char* symbol) { DEBUG_INFO("[BYPASS] {}", symbol); } +// Helper to safely handle null pointers for logging +template +const T& +safe_log_arg(const T& arg) { + return arg; +} + +static const char* +safe_log_arg(const char* arg) { + return arg ? arg : "(null)"; +} + // Variadic case: 1+ arguments template void log_arguments(const char* symbol, Args&&... args) { std::stringstream ss; ss << "[BYPASS] Calling " << symbol << " with arguments: "; - ((ss << "[" << typeid(Args).name() << "] " << args << " "), ...); + ((ss << "[" << typeid(Args).name() << "] " << safe_log_arg(args) << " "), + ...); DEBUG_INFO("{}", ss.str()); } @@ -299,7 +312,8 @@ void log_argumentsx(const char* symbol, Args&&... args) { std::stringstream ss; ss << "[BYPASS-ERROR] Calling " << symbol << " with arguments: "; - ((ss << "[" << typeid(Args).name() << "] " << args << " "), ...); + ((ss << "[" << typeid(Args).name() << "] " << safe_log_arg(args) << " "), + ...); DEBUG_INFO("{}", ss.str()); } @@ -1366,7 +1380,9 @@ version int mkdir(const char* path, mode_t mode) { gkfs_init_routine_placeholder(); - DEBUG_INFO("[MKDIR] Attempting to mkdir: {}", path); + if(path != nullptr) { + DEBUG_INFO("[MKDIR] Attempting to mkdir: {}", path); + } if(CTX->interception_enabled()) { std::string resolved; switch(resolve_gkfs_path(AT_FDCWD, path, resolved)) { @@ -3005,7 +3021,9 @@ aio_error(const struct aiocb* aiocbp) { int mkstemp(char* templates) { gkfs_init_routine_placeholder(); - DEBUG_INFO("[BYPASS] mkstemp(template='{}')", templates); + if(templates != nullptr) { + DEBUG_INFO("[BYPASS] mkstemp(template='{}')", templates); + } GKFS_FALLBACK(mkstemp, templates); } diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 9fb5faac3..2b6f8cb10 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -385,8 +385,10 @@ hook_pwritev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, int hook_unlinkat(int dirfd, const char* cpath, int flags) { - LOG(DEBUG, "{}() called with dirfd: {}, path: \"{}\", flags: {}", __func__, - dirfd, cpath, flags); + if(cpath != nullptr) { + LOG(DEBUG, "{}() called with dirfd: {}, path: \"{}\", flags: {}", + __func__, dirfd, cpath, flags); + } if((flags & ~AT_REMOVEDIR) != 0) { LOG(ERROR, "{}() Flags unknown: {}", __func__, flags); @@ -538,9 +540,11 @@ hook_faccessat(int dirfd, const char* cpath, int mode) { int hook_faccessat2(int dirfd, const char* cpath, int mode, int flags) { - LOG(DEBUG, - "{}() called with dirfd: '{}', path: '{}', mode: '{}', flags: '{}'", - __func__, dirfd, cpath, mode, flags); + if(cpath != nullptr) { + LOG(DEBUG, + "{}() called with dirfd: '{}', path: '{}', mode: '{}', flags: '{}'", + __func__, dirfd, cpath, mode, flags); + } std::string resolved; auto rstatus = CTX->relativize_fd_path(dirfd, cpath, resolved); @@ -743,12 +747,18 @@ hook_fchmod(unsigned int fd, mode_t mode) { int hook_chmod(const char* path, mode_t mode) { + if(path == nullptr) { + return -EFAULT; + } LOG(DEBUG, "{}() called with path: \"{}\", mode: {}", __func__, path, mode); return 0; } int hook_lchown(const char* path, uid_t owner, gid_t group) { + if(path == nullptr) { + return -EFAULT; + } LOG(DEBUG, "{}() called with path: \"{}\", owner: {}, group: {}", __func__, path, owner, group); @@ -768,6 +778,9 @@ hook_lchown(const char* path, uid_t owner, gid_t group) { int hook_chown(const char* path, uid_t owner, gid_t group) { + if(path == nullptr) { + return -EFAULT; + } LOG(DEBUG, "{}() called with path: \"{}\", owner: {}, group: {}", __func__, path, owner, group); @@ -801,10 +814,12 @@ hook_fchown(unsigned int fd, uid_t owner, gid_t group) { int hook_fchownat(int dirfd, const char* cpath, uid_t owner, gid_t group, int flags) { - LOG(DEBUG, - "{}() called with dirfd: {}, path: \"{}\", owner: {}, group: {}, " - "flags: '{}'", - __func__, dirfd, cpath, owner, group, flags); + if(cpath != nullptr) { + LOG(DEBUG, + "{}() called with dirfd: {}, path: \"{}\", owner: {}, group: {}, " + "flags: '{}'", + __func__, dirfd, cpath, owner, group, flags); + } std::string resolved; // Force follow to true for resolution check to ensure we find the object diff --git a/src/client/preload_context.cpp b/src/client/preload_context.cpp index a8d70a567..cc8466cd0 100644 --- a/src/client/preload_context.cpp +++ b/src/client/preload_context.cpp @@ -322,9 +322,16 @@ PreloadContext::relativize_fd_path(int dirfd, const char* raw_path, // - GekkoFS fd → return its stored path as internal // - external/unknown fd → return fd_unknown so the hook forwards to kernel if(raw_path == nullptr) { - if(dirfd != AT_FDCWD && ofm_->exist(dirfd)) { - relative_path = ofm_->get(dirfd)->path(); - return RelativizeStatus::internal; + if((flags & AT_EMPTY_PATH) == 0) { + return RelativizeStatus::external; + } + + if(dirfd != AT_FDCWD) { + auto open_file = ofm_->get(dirfd); + if(open_file) { + relative_path = open_file->path(); + return RelativizeStatus::internal; + } } // dirfd is either AT_FDCWD (which requires a valid path) or an // external fd — let the kernel handle it. @@ -360,16 +367,22 @@ PreloadContext::relativize_fd_path(int dirfd, const char* raw_path, } else { // check if we have the AT_EMPTY_PATH flag // for fstatat. - if(flags & AT_EMPTY_PATH) { - relative_path = ofm_->get(dirfd)->path(); - return RelativizeStatus::internal; + if((flags & AT_EMPTY_PATH) != 0 && raw_path[0] == '\0') { + auto open_file = ofm_->get(dirfd); + if(open_file) { + relative_path = open_file->path(); + return RelativizeStatus::internal; + } } } // path is relative to fd - auto dir = ofm_->get_dir(dirfd); - if(dir == nullptr) { + auto open_file = ofm_->get(dirfd); + if(open_file == nullptr || + open_file->type() != gkfs::filemap::FileType::directory) { return RelativizeStatus::fd_not_a_dir; } + auto dir = + std::static_pointer_cast(open_file); path = mountdir_; path.append(dir->path()); path.push_back(gkfs::path::separator); @@ -403,13 +416,17 @@ PreloadContext::relativize_path(const char* raw_path, std::string path; - if(raw_path != nullptr && raw_path[0] != gkfs::path::separator) { - /* Path is not absolute, we need to prepend CWD; - * First reserve enough space to minimize memory copy - */ - path = gkfs::path::prepend_path(cwd_, raw_path); + if(raw_path != nullptr) { + if(raw_path[0] != gkfs::path::separator) { + /* Path is not absolute, we need to prepend CWD; + * First reserve enough space to minimize memory copy + */ + path = gkfs::path::prepend_path(cwd_, raw_path); + } else { + path = raw_path; + } } else { - path = raw_path; + return false; } auto [is_in_path, resolved_path] = -- GitLab From 8ee039de10e14e5b7aa5b8a423bbe7087374d9db Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 21:28:17 +0100 Subject: [PATCH 05/68] fixes --- src/client/gkfs_libc.cpp | 3 ++ src/client/gkfs_metadata.cpp | 10 ++--- src/client/preload.cpp | 3 +- src/client/preload_util.cpp | 2 +- src/client/rpc/forward_metadata.cpp | 64 +++++++++++++++++------------ 5 files changed, 49 insertions(+), 33 deletions(-) diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 3e36ba024..f6f6c4928 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -320,6 +320,9 @@ log_argumentsx(const char* symbol, Args&&... args) { // Convert stat to stat64 static void convert(struct stat* src, struct stat64* dest) { + if(dest == nullptr || src == nullptr) { + return; + } dest->st_dev = static_cast<__dev_t>(src->st_dev); dest->st_ino = static_cast<__ino64_t>(src->st_ino); dest->st_mode = static_cast<__mode_t>(src->st_mode); diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 13b71fa84..5df8bd58f 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -664,7 +664,7 @@ gkfs_rename(const string& old_path, const string& new_path) { int gkfs_stat(const string& path, struct stat* buf, bool follow_links, bool bypass_rename) { - if(CTX->use_write_size_cache()) { + if(CTX->use_write_size_cache() && CTX->write_size_cache()) { auto err = CTX->write_size_cache()->flush(path, true).first; if(err) { LOG(ERROR, "{}() write_size_cache() failed with err '{}'", __func__, @@ -724,7 +724,7 @@ gkfs_stat(const string& path, struct stat* buf, bool follow_links, int gkfs_statx(int dirfs, const std::string& path, int flags, unsigned int mask, struct statx* buf, bool follow_links) { - if(CTX->use_write_size_cache()) { + if(CTX->use_write_size_cache() && CTX->write_size_cache()) { auto err = CTX->write_size_cache()->flush(path, true).first; if(err) { LOG(ERROR, "{}() write_size_cache() failed with err '{}'", __func__, @@ -902,7 +902,7 @@ gkfs_lseek(shared_ptr gkfs_fd, off_t offset, gkfs_fd->pos(gkfs_fd->pos() + offset); break; case SEEK_END: { - if(CTX->use_write_size_cache()) { + if(CTX->use_write_size_cache() && CTX->write_size_cache()) { CTX->write_size_cache()->flush(gkfs_fd->path()); } @@ -1019,7 +1019,7 @@ gkfs_truncate(const std::string& path, off_t length) { return -1; } - if(CTX->use_write_size_cache()) { + if(CTX->use_write_size_cache() && CTX->write_size_cache()) { auto err = CTX->write_size_cache()->flush(path, true).first; if(err) { LOG(ERROR, "{}() write_size_cache() failed with err '{}'", __func__, @@ -1772,7 +1772,7 @@ gkfs_close(unsigned int fd) { } // flush write size cache to be server consistent - if(CTX->use_write_size_cache()) { + if(CTX->use_write_size_cache() && CTX->write_size_cache()) { auto err = CTX->write_size_cache()->flush(file->path(), true).first; if(err) { LOG(ERROR, "{}() write_size_cache() failed with err '{}'", diff --git a/src/client/preload.cpp b/src/client/preload.cpp index 9e9b80664..5ee53a29a 100644 --- a/src/client/preload.cpp +++ b/src/client/preload.cpp @@ -392,7 +392,8 @@ init_preload() { } #ifndef BYPASS_SYSCALL - CTX->enable_interception(); + // CTX->enable_interception(); // Moved to after init_environment to avoid + // race gkfs::preload::start_self_interception(); #endif diff --git a/src/client/preload_util.cpp b/src/client/preload_util.cpp index a9aeba4df..27733afb4 100644 --- a/src/client/preload_util.cpp +++ b/src/client/preload_util.cpp @@ -247,7 +247,7 @@ get_metadata(const string& path, bool follow_links, bool include_inline) { std::string inline_data; int err{}; // Use file metadata from dentry cache if available - if(CTX->use_dentry_cache()) { + if(CTX->use_dentry_cache() && CTX->dentry_cache()) { // get parent and filename path to retrieve the cache entry std::filesystem::path p(path); auto parent = p.parent_path().string(); diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index 1a6cb357f..28790520e 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -46,20 +46,26 @@ using namespace std; namespace gkfs::rpc { -int -forward_create(const std::string& path, const mode_t mode, const int copy) { - auto endp = CTX->hosts().at( - CTX->distributor()->locate_file_metadata(path, copy)); +if(!CTX->distributor()) { + LOG(ERROR, "{}() Distributor not initialized!", __func__); + return EBUSY; +} +auto endp = + CTX->hosts().at(CTX->distributor()->locate_file_metadata(path, copy)); - gkfs::rpc::rpc_mk_node_in_t in; - in.path = path; - in.mode = mode; +if(!CTX->rpc_engine()) { + LOG(ERROR, "{}() RPC engine not initialized!", __func__); + return EBUSY; +} - auto out = gkfs::rpc::forward_call( - CTX->rpc_engine(), endp, gkfs::rpc::tag::create, in, __func__, - path); +gkfs::rpc::rpc_mk_node_in_t in; +in.path = path; +in.mode = mode; - return out.err; +auto out = gkfs::rpc::forward_call( + CTX->rpc_engine(), endp, gkfs::rpc::tag::create, in, __func__, path); + +return out.err; } int @@ -75,25 +81,31 @@ forward_create_write_inline(const std::string& path, mode_t mode, return write_res.first; } -int -forward_stat(const std::string& path, string& attr, string& inline_data, - const int copy, const bool include_inline) { - auto endp = CTX->hosts().at( - CTX->distributor()->locate_file_metadata(path, copy)); +if(!CTX->distributor()) { + LOG(ERROR, "{}() Distributor not initialized!", __func__); + return EBUSY; +} +auto endp = + CTX->hosts().at(CTX->distributor()->locate_file_metadata(path, copy)); - gkfs::rpc::rpc_path_only_in_t in; - in.path = path; - in.include_inline = include_inline; +if(!CTX->rpc_engine()) { + LOG(ERROR, "{}() RPC engine not initialized!", __func__); + return EBUSY; +} - auto out = gkfs::rpc::forward_call( - CTX->rpc_engine(), endp, gkfs::rpc::tag::stat, in, __func__, path); +gkfs::rpc::rpc_path_only_in_t in; +in.path = path; +in.include_inline = include_inline; - if(out.err == 0) { - attr = out.db_val; - inline_data.assign(out.inline_data.begin(), out.inline_data.end()); - } +auto out = gkfs::rpc::forward_call( + CTX->rpc_engine(), endp, gkfs::rpc::tag::stat, in, __func__, path); - return out.err; +if(out.err == 0) { + attr = out.db_val; + inline_data.assign(out.inline_data.begin(), out.inline_data.end()); +} + +return out.err; } int -- GitLab From 557c53e0296c9d51e45fc02c3e0f58f5acbd40a0 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 21:34:04 +0100 Subject: [PATCH 06/68] fix --- src/client/rpc/forward_metadata.cpp | 80 ++++++++++++++++------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index 28790520e..1950d4b5d 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -46,26 +46,29 @@ using namespace std; namespace gkfs::rpc { -if(!CTX->distributor()) { - LOG(ERROR, "{}() Distributor not initialized!", __func__); - return EBUSY; -} -auto endp = - CTX->hosts().at(CTX->distributor()->locate_file_metadata(path, copy)); +int +forward_create(const std::string& path, const mode_t mode, const int copy) { + if(!CTX->distributor()) { + LOG(ERROR, "{}() Distributor not initialized!", __func__); + return EBUSY; + } + auto endp = CTX->hosts().at( + CTX->distributor()->locate_file_metadata(path, copy)); -if(!CTX->rpc_engine()) { - LOG(ERROR, "{}() RPC engine not initialized!", __func__); - return EBUSY; -} + if(!CTX->rpc_engine()) { + LOG(ERROR, "{}() RPC engine not initialized!", __func__); + return EBUSY; + } -gkfs::rpc::rpc_mk_node_in_t in; -in.path = path; -in.mode = mode; + gkfs::rpc::rpc_mk_node_in_t in; + in.path = path; + in.mode = mode; -auto out = gkfs::rpc::forward_call( - CTX->rpc_engine(), endp, gkfs::rpc::tag::create, in, __func__, path); + auto out = gkfs::rpc::forward_call( + CTX->rpc_engine(), endp, gkfs::rpc::tag::create, in, __func__, + path); -return out.err; + return out.err; } int @@ -81,31 +84,34 @@ forward_create_write_inline(const std::string& path, mode_t mode, return write_res.first; } -if(!CTX->distributor()) { - LOG(ERROR, "{}() Distributor not initialized!", __func__); - return EBUSY; -} -auto endp = - CTX->hosts().at(CTX->distributor()->locate_file_metadata(path, copy)); +int +forward_stat(const std::string& path, string& attr, string& inline_data, + const int copy, const bool include_inline) { + if(!CTX->distributor()) { + LOG(ERROR, "{}() Distributor not initialized!", __func__); + return EBUSY; + } + auto endp = CTX->hosts().at( + CTX->distributor()->locate_file_metadata(path, copy)); -if(!CTX->rpc_engine()) { - LOG(ERROR, "{}() RPC engine not initialized!", __func__); - return EBUSY; -} + if(!CTX->rpc_engine()) { + LOG(ERROR, "{}() RPC engine not initialized!", __func__); + return EBUSY; + } -gkfs::rpc::rpc_path_only_in_t in; -in.path = path; -in.include_inline = include_inline; + gkfs::rpc::rpc_path_only_in_t in; + in.path = path; + in.include_inline = include_inline; -auto out = gkfs::rpc::forward_call( - CTX->rpc_engine(), endp, gkfs::rpc::tag::stat, in, __func__, path); + auto out = gkfs::rpc::forward_call( + CTX->rpc_engine(), endp, gkfs::rpc::tag::stat, in, __func__, path); -if(out.err == 0) { - attr = out.db_val; - inline_data.assign(out.inline_data.begin(), out.inline_data.end()); -} + if(out.err == 0) { + attr = out.db_val; + inline_data.assign(out.inline_data.begin(), out.inline_data.end()); + } -return out.err; + return out.err; } int @@ -849,7 +855,7 @@ forward_get_dirents_filtered(const std::string& path, int server, } // reuse standard decompression for now as format is same - entries = gkfs::rpc::decompress_and_parse_entries_filtered( + entries = gkfs::rpc::decompress_and_parse_entries( out, large_buffer.get()); last_scanned_key = out.last_scanned_key; -- GitLab From 7b4d4e15081b174a71649f6daed892fc264bc42f Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Thu, 26 Feb 2026 21:51:44 +0100 Subject: [PATCH 07/68] fox2 --- src/client/gkfs_libc.cpp | 4 ++- src/client/preload.cpp | 43 +++++++++++++++++++++-------- src/client/rpc/forward_metadata.cpp | 8 +++--- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index f6f6c4928..0c3c049e5 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -276,7 +276,9 @@ static ResultEntry* results = nullptr; //===================================// void -gkfs_init_routine_placeholder() {} +gkfs_init_routine_placeholder() { + init_preload(); +} void log_arguments(const char* symbol) { diff --git a/src/client/preload.cpp b/src/client/preload.cpp index 5ee53a29a..002167f56 100644 --- a/src/client/preload.cpp +++ b/src/client/preload.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -350,6 +351,8 @@ init_environment() { std::atomic init{false}; +static std::mutex init_mutex; +static thread_local bool reentrance_guard_init = false; /** * Called initially ONCE when preload library is used with the LD_PRELOAD @@ -384,23 +387,34 @@ init_preload() { #ifdef ENABLE_USER return; #endif + if(reentrance_guard_init) { + return; + } + reentrance_guard_init = true; + std::lock_guard lock(init_mutex); + if(init) { + reentrance_guard_init = false; + return; + } + // The original errno value will be restored after initialization to not // leak internal error codes auto oerrno = errno; - if(atomic_exchange(&init, 1) == 0) { - pthread_atfork(&at_fork, &at_parent, &at_child); - } -#ifndef BYPASS_SYSCALL - // CTX->enable_interception(); // Moved to after init_environment to avoid - // race - gkfs::preload::start_self_interception(); -#endif + pthread_atfork(&at_fork, &at_parent, &at_child); + + // Instantiate CTX before any interception starts to avoid reentrance + // during singleton construction (which calls getcwd()) + (void) CTX; CTX->init_logging(); // from here ownwards it is safe to print messages LOG(DEBUG, "Logging subsystem initialized"); +#ifndef BYPASS_SYSCALL + gkfs::preload::start_self_interception(); +#endif + // Kernel modules such as ib_uverbs may create fds in kernel space and pass // them to user-space processes using ioctl()-like interfaces. if this // happens during our internal initialization, there's no way for us to @@ -433,6 +447,13 @@ init_preload() { gkfs::preload::init_environment(); CTX->enable_interception(); +#ifndef BYPASS_SYSCALL + gkfs::preload::start_interception(); +#endif + + init = true; + reentrance_guard_init = false; + if(CTX->protect_fds()) CTX->unprotect_user_fds(); @@ -479,9 +500,9 @@ init_preload() { ? "ON" : "OFF") == "ON"; -#ifndef BYPASS_SYSCALL - gkfs::preload::start_interception(); -#endif + if(CTX->protect_fds()) + CTX->unprotect_user_fds(); + errno = oerrno; if(!CTX->init_metrics()) { diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index 1950d4b5d..23434fc3e 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -50,14 +50,14 @@ int forward_create(const std::string& path, const mode_t mode, const int copy) { if(!CTX->distributor()) { LOG(ERROR, "{}() Distributor not initialized!", __func__); - return EBUSY; + return ENOTCONN; } auto endp = CTX->hosts().at( CTX->distributor()->locate_file_metadata(path, copy)); if(!CTX->rpc_engine()) { LOG(ERROR, "{}() RPC engine not initialized!", __func__); - return EBUSY; + return ENOTCONN; } gkfs::rpc::rpc_mk_node_in_t in; @@ -89,14 +89,14 @@ forward_stat(const std::string& path, string& attr, string& inline_data, const int copy, const bool include_inline) { if(!CTX->distributor()) { LOG(ERROR, "{}() Distributor not initialized!", __func__); - return EBUSY; + return ENOTCONN; } auto endp = CTX->hosts().at( CTX->distributor()->locate_file_metadata(path, copy)); if(!CTX->rpc_engine()) { LOG(ERROR, "{}() RPC engine not initialized!", __func__); - return EBUSY; + return ENOTCONN; } gkfs::rpc::rpc_path_only_in_t in; -- GitLab From 764887cba6ba40d344eb9723fdf8f9bc192aa2dd Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 08:23:57 +0100 Subject: [PATCH 08/68] fix libc --- src/client/preload.cpp | 208 +++++++++++++++++++++-------------------- 1 file changed, 107 insertions(+), 101 deletions(-) diff --git a/src/client/preload.cpp b/src/client/preload.cpp index 002167f56..197f02a38 100644 --- a/src/client/preload.cpp +++ b/src/client/preload.cpp @@ -387,131 +387,137 @@ init_preload() { #ifdef ENABLE_USER return; #endif + // Reentrance guard: prevents recursive calls on the same thread from + // deadlocking on init_mutex (which is non-reentrant). if(reentrance_guard_init) { return; } reentrance_guard_init = true; - std::lock_guard lock(init_mutex); - if(init) { - reentrance_guard_init = false; - return; - } - // The original errno value will be restored after initialization to not - // leak internal error codes - auto oerrno = errno; + // Scope the mutex guard tightly so it covers only the core init work. + // Post-init config reads happen OUTSIDE the lock to avoid deadlocking when + // they indirectly call wrapped libc functions. + { + std::lock_guard lock(init_mutex); + if(init) { + // Already initialized by another thread; release and fall through + // so reentrance_guard_init is cleared at the end. + goto done; + } - pthread_atfork(&at_fork, &at_parent, &at_child); + // The original errno value will be restored after initialization to not + // leak internal error codes + auto oerrno = errno; - // Instantiate CTX before any interception starts to avoid reentrance - // during singleton construction (which calls getcwd()) - (void) CTX; + pthread_atfork(&at_fork, &at_parent, &at_child); - CTX->init_logging(); - // from here ownwards it is safe to print messages - LOG(DEBUG, "Logging subsystem initialized"); + // Instantiate CTX before any interception starts to avoid reentrance + // during singleton construction (which calls getcwd()) + (void) CTX; + + CTX->init_logging(); + // from here onwards it is safe to print messages + LOG(DEBUG, "Logging subsystem initialized"); #ifndef BYPASS_SYSCALL - gkfs::preload::start_self_interception(); + gkfs::preload::start_self_interception(); #endif - // Kernel modules such as ib_uverbs may create fds in kernel space and pass - // them to user-space processes using ioctl()-like interfaces. if this - // happens during our internal initialization, there's no way for us to - // control this creation and the fd will be created in the - // [0, MAX_USER_FDS) range rather than in our private - // [MAX_USER_FDS, GKFS_MAX_OPEN_FDS) range. - // with MAX_USER_FDS = GKFS_MAX_OPEN_FDS - GKFS_MAX_INTERNAL_FDS - // To prevent this for our internal - // initialization code, we forcefully occupy the user fd range to force - // such modules to create fds in our private range. - if(gkfs::env::var_is_set(gkfs::env::PROTECT_FD)) { - CTX->protect_fds(true); - LOG(INFO, "Protecting user fds"); - } else { - // Another alternative is to use start issuing fds from gekko from a - // offset. but without protecting the FDs - CTX->range_fd(gkfs::env::var_is_set(gkfs::env::RANGE_FD)); - LOG(INFO, "Moving FDs to range"); - } + if(gkfs::env::var_is_set(gkfs::env::PROTECT_FD)) { + CTX->protect_fds(true); + LOG(INFO, "Protecting user fds"); + } else { + CTX->range_fd(gkfs::env::var_is_set(gkfs::env::RANGE_FD)); + LOG(INFO, "Moving FDs to range"); + } - if(CTX->protect_fds()) { - CTX->protect_user_fds(); - } + if(CTX->protect_fds()) { + CTX->protect_user_fds(); + } - log_prog_name(); - gkfs::path::init_cwd(); + log_prog_name(); + gkfs::path::init_cwd(); - LOG(DEBUG, "Current working directory: '{}'", CTX->cwd()); - LOG(DEBUG, "Number of replicas : '{}'", CTX->get_replicas()); - gkfs::preload::init_environment(); - CTX->enable_interception(); + LOG(DEBUG, "Current working directory: '{}'", CTX->cwd()); + LOG(DEBUG, "Number of replicas : '{}'", CTX->get_replicas()); + gkfs::preload::init_environment(); + CTX->enable_interception(); #ifndef BYPASS_SYSCALL - gkfs::preload::start_interception(); + gkfs::preload::start_interception(); #endif - init = true; - reentrance_guard_init = false; - - if(CTX->protect_fds()) - CTX->unprotect_user_fds(); + if(CTX->protect_fds()) + CTX->unprotect_user_fds(); + + errno = oerrno; + + // Mark fully initialized while still holding the lock so other threads + // waiting on init_mutex will observe init==true immediately. + init = true; + } // ← init_mutex released here + + // Post-init config: runs outside the mutex. Wrapped libc calls here are + // safe because reentrance_guard_init is still true on this thread, and + // init==true so other threads won't block. + { + auto forwarding_map_file = + gkfs::env::get_var(gkfs::env::FORWARDING_MAP_FILE, + gkfs::config::forwarding_file_path); + if(!forwarding_map_file.empty()) { + init_forwarding_mapper(); + } - auto forwarding_map_file = gkfs::env::get_var( - gkfs::env::FORWARDING_MAP_FILE, gkfs::config::forwarding_file_path); - if(!forwarding_map_file.empty()) { - init_forwarding_mapper(); + gkfs::config::metadata::use_inline_data = + gkfs::env::get_var(gkfs::env::USE_INLINE_DATA, + gkfs::config::metadata::use_inline_data + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::create_write_optimization = + gkfs::env::get_var( + gkfs::env::CREATE_WRITE_OPTIMIZATION, + gkfs::config::metadata::create_write_optimization + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::read_inline_prefetch = + gkfs::env::get_var(gkfs::env::READ_INLINE_PREFETCH, + gkfs::config::metadata::read_inline_prefetch + ? "ON" + : "OFF") == "ON"; + gkfs::config::rpc::use_dirents_compression = + gkfs::env::get_var(gkfs::env::USE_DIRENTS_COMPRESSION, + gkfs::config::rpc::use_dirents_compression + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::create_check_parents = + gkfs::env::get_var(gkfs::env::CREATE_CHECK_PARENTS, + gkfs::config::metadata::create_check_parents + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::symlink_support = + gkfs::env::get_var(gkfs::env::SYMLINK_SUPPORT, + gkfs::config::metadata::symlink_support + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::rename_support = + gkfs::env::get_var(gkfs::env::RENAME_SUPPORT, + gkfs::config::metadata::rename_support + ? "ON" + : "OFF") == "ON"; + + if(!CTX->init_metrics()) { + exit_error_msg(EXIT_FAILURE, + "Unable to initialize client metrics. Exiting..."); + } + std::atexit(quick_exit_handler); } - // Special CONFIGURATION handling - gkfs::config::metadata::use_inline_data = - gkfs::env::get_var(gkfs::env::USE_INLINE_DATA, - gkfs::config::metadata::use_inline_data - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::create_write_optimization = - gkfs::env::get_var(gkfs::env::CREATE_WRITE_OPTIMIZATION, - gkfs::config::metadata::create_write_optimization - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::read_inline_prefetch = - gkfs::env::get_var(gkfs::env::READ_INLINE_PREFETCH, - gkfs::config::metadata::read_inline_prefetch - ? "ON" - : "OFF") == "ON"; - gkfs::config::rpc::use_dirents_compression = - gkfs::env::get_var(gkfs::env::USE_DIRENTS_COMPRESSION, - gkfs::config::rpc::use_dirents_compression - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::create_check_parents = - gkfs::env::get_var(gkfs::env::CREATE_CHECK_PARENTS, - gkfs::config::metadata::create_check_parents - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::symlink_support = - gkfs::env::get_var(gkfs::env::SYMLINK_SUPPORT, - gkfs::config::metadata::symlink_support - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::rename_support = - gkfs::env::get_var(gkfs::env::RENAME_SUPPORT, - gkfs::config::metadata::rename_support - ? "ON" - : "OFF") == "ON"; - - if(CTX->protect_fds()) - CTX->unprotect_user_fds(); - - errno = oerrno; - - if(!CTX->init_metrics()) { - exit_error_msg(EXIT_FAILURE, - "Unable to initialize client metrics. Exiting..."); - } - std::atexit(quick_exit_handler); +done: + // Always clear on the way out so this thread can make normal wrapped calls. + reentrance_guard_init = false; } + /** * Called last when preload library is used with the LD_PRELOAD environment * variable -- GitLab From 821758bcc54f6a6061d4b391d690a76e793f5a49 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 08:28:47 +0100 Subject: [PATCH 09/68] rollback init --- src/client/preload.cpp | 202 ++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 115 deletions(-) diff --git a/src/client/preload.cpp b/src/client/preload.cpp index 197f02a38..11a7b4423 100644 --- a/src/client/preload.cpp +++ b/src/client/preload.cpp @@ -351,8 +351,6 @@ init_environment() { std::atomic init{false}; -static std::mutex init_mutex; -static thread_local bool reentrance_guard_init = false; /** * Called initially ONCE when preload library is used with the LD_PRELOAD @@ -387,134 +385,108 @@ init_preload() { #ifdef ENABLE_USER return; #endif - // Reentrance guard: prevents recursive calls on the same thread from - // deadlocking on init_mutex (which is non-reentrant). - if(reentrance_guard_init) { - return; + // The original errno value will be restored after initialization to not + // leak internal error codes + auto oerrno = errno; + if(atomic_exchange(&init, 1) == 0) { + pthread_atfork(&at_fork, &at_parent, &at_child); } - reentrance_guard_init = true; - - // Scope the mutex guard tightly so it covers only the core init work. - // Post-init config reads happen OUTSIDE the lock to avoid deadlocking when - // they indirectly call wrapped libc functions. - { - std::lock_guard lock(init_mutex); - if(init) { - // Already initialized by another thread; release and fall through - // so reentrance_guard_init is cleared at the end. - goto done; - } - // The original errno value will be restored after initialization to not - // leak internal error codes - auto oerrno = errno; +#ifndef BYPASS_SYSCALL + CTX->enable_interception(); + gkfs::preload::start_self_interception(); +#endif - pthread_atfork(&at_fork, &at_parent, &at_child); + CTX->init_logging(); + // from here onwards it is safe to print messages + LOG(DEBUG, "Logging subsystem initialized"); - // Instantiate CTX before any interception starts to avoid reentrance - // during singleton construction (which calls getcwd()) - (void) CTX; + // Kernel modules such as ib_uverbs may create fds in kernel space and pass + // them to user-space processes using ioctl()-like interfaces. if this + // happens during our internal initialization, there's no way for us to + // control this creation and the fd will be created in the + // [0, MAX_USER_FDS) range rather than in our private + // [MAX_USER_FDS, GKFS_MAX_OPEN_FDS) range. + // with MAX_USER_FDS = GKFS_MAX_OPEN_FDS - GKFS_MAX_INTERNAL_FDS + // To prevent this for our internal + // initialization code, we forcefully occupy the user fd range to force + // such modules to create fds in our private range. + if(gkfs::env::var_is_set(gkfs::env::PROTECT_FD)) { + CTX->protect_fds(true); + LOG(INFO, "Protecting user fds"); + } else { + CTX->range_fd(gkfs::env::var_is_set(gkfs::env::RANGE_FD)); + LOG(INFO, "Moving FDs to range"); + } - CTX->init_logging(); - // from here onwards it is safe to print messages - LOG(DEBUG, "Logging subsystem initialized"); + if(CTX->protect_fds()) { + CTX->protect_user_fds(); + } -#ifndef BYPASS_SYSCALL - gkfs::preload::start_self_interception(); -#endif + log_prog_name(); + gkfs::path::init_cwd(); - if(gkfs::env::var_is_set(gkfs::env::PROTECT_FD)) { - CTX->protect_fds(true); - LOG(INFO, "Protecting user fds"); - } else { - CTX->range_fd(gkfs::env::var_is_set(gkfs::env::RANGE_FD)); - LOG(INFO, "Moving FDs to range"); - } + LOG(DEBUG, "Current working directory: '{}'", CTX->cwd()); + LOG(DEBUG, "Number of replicas : '{}'", CTX->get_replicas()); + gkfs::preload::init_environment(); + CTX->enable_interception(); - if(CTX->protect_fds()) { - CTX->protect_user_fds(); - } + if(CTX->protect_fds()) + CTX->unprotect_user_fds(); - log_prog_name(); - gkfs::path::init_cwd(); + auto forwarding_map_file = gkfs::env::get_var( + gkfs::env::FORWARDING_MAP_FILE, gkfs::config::forwarding_file_path); + if(!forwarding_map_file.empty()) { + init_forwarding_mapper(); + } - LOG(DEBUG, "Current working directory: '{}'", CTX->cwd()); - LOG(DEBUG, "Number of replicas : '{}'", CTX->get_replicas()); - gkfs::preload::init_environment(); - CTX->enable_interception(); + // Special CONFIGURATION handling + gkfs::config::metadata::use_inline_data = + gkfs::env::get_var(gkfs::env::USE_INLINE_DATA, + gkfs::config::metadata::use_inline_data + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::create_write_optimization = + gkfs::env::get_var(gkfs::env::CREATE_WRITE_OPTIMIZATION, + gkfs::config::metadata::create_write_optimization + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::read_inline_prefetch = + gkfs::env::get_var(gkfs::env::READ_INLINE_PREFETCH, + gkfs::config::metadata::read_inline_prefetch + ? "ON" + : "OFF") == "ON"; + gkfs::config::rpc::use_dirents_compression = + gkfs::env::get_var(gkfs::env::USE_DIRENTS_COMPRESSION, + gkfs::config::rpc::use_dirents_compression + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::create_check_parents = + gkfs::env::get_var(gkfs::env::CREATE_CHECK_PARENTS, + gkfs::config::metadata::create_check_parents + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::symlink_support = + gkfs::env::get_var(gkfs::env::SYMLINK_SUPPORT, + gkfs::config::metadata::symlink_support + ? "ON" + : "OFF") == "ON"; + gkfs::config::metadata::rename_support = + gkfs::env::get_var(gkfs::env::RENAME_SUPPORT, + gkfs::config::metadata::rename_support + ? "ON" + : "OFF") == "ON"; #ifndef BYPASS_SYSCALL - gkfs::preload::start_interception(); + gkfs::preload::start_interception(); #endif + errno = oerrno; - if(CTX->protect_fds()) - CTX->unprotect_user_fds(); - - errno = oerrno; - - // Mark fully initialized while still holding the lock so other threads - // waiting on init_mutex will observe init==true immediately. - init = true; - } // ← init_mutex released here - - // Post-init config: runs outside the mutex. Wrapped libc calls here are - // safe because reentrance_guard_init is still true on this thread, and - // init==true so other threads won't block. - { - auto forwarding_map_file = - gkfs::env::get_var(gkfs::env::FORWARDING_MAP_FILE, - gkfs::config::forwarding_file_path); - if(!forwarding_map_file.empty()) { - init_forwarding_mapper(); - } - - gkfs::config::metadata::use_inline_data = - gkfs::env::get_var(gkfs::env::USE_INLINE_DATA, - gkfs::config::metadata::use_inline_data - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::create_write_optimization = - gkfs::env::get_var( - gkfs::env::CREATE_WRITE_OPTIMIZATION, - gkfs::config::metadata::create_write_optimization - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::read_inline_prefetch = - gkfs::env::get_var(gkfs::env::READ_INLINE_PREFETCH, - gkfs::config::metadata::read_inline_prefetch - ? "ON" - : "OFF") == "ON"; - gkfs::config::rpc::use_dirents_compression = - gkfs::env::get_var(gkfs::env::USE_DIRENTS_COMPRESSION, - gkfs::config::rpc::use_dirents_compression - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::create_check_parents = - gkfs::env::get_var(gkfs::env::CREATE_CHECK_PARENTS, - gkfs::config::metadata::create_check_parents - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::symlink_support = - gkfs::env::get_var(gkfs::env::SYMLINK_SUPPORT, - gkfs::config::metadata::symlink_support - ? "ON" - : "OFF") == "ON"; - gkfs::config::metadata::rename_support = - gkfs::env::get_var(gkfs::env::RENAME_SUPPORT, - gkfs::config::metadata::rename_support - ? "ON" - : "OFF") == "ON"; - - if(!CTX->init_metrics()) { - exit_error_msg(EXIT_FAILURE, - "Unable to initialize client metrics. Exiting..."); - } - std::atexit(quick_exit_handler); + if(!CTX->init_metrics()) { + exit_error_msg(EXIT_FAILURE, + "Unable to initialize client metrics. Exiting..."); } - -done: - // Always clear on the way out so this thread can make normal wrapped calls. - reentrance_guard_init = false; + std::atexit(quick_exit_handler); } -- GitLab From d771efd622a2305d6d8e494ddbee252ead8a0c23 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 08:31:48 +0100 Subject: [PATCH 10/68] fix init libc --- src/client/gkfs_libc.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 0c3c049e5..7c028aaba 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -277,7 +277,10 @@ static ResultEntry* results = nullptr; void gkfs_init_routine_placeholder() { - init_preload(); + // no-op: init_preload() is invoked as a constructor + // (__attribute__((constructor))) Calling init_preload() here causes every + // intercepted libc function to re-enter the initialization body + // (atomic_exchange only guards pthread_atfork). } void -- GitLab From cf9a7b1631bb13666a9f33040397e8c569b56eba Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 08:37:25 +0100 Subject: [PATCH 11/68] fix mmap calloc --- src/client/gkfs_functions.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 711f9019f..4686b1167 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -116,8 +116,9 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, } std::string path = gkfs_fd->path(); - void* ptr = calloc(1, length); - if(ptr == nullptr) { + void* ptr = ::mmap(addr, length, prot | PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if(ptr == MAP_FAILED) { return MAP_FAILED; } @@ -128,10 +129,12 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, std::make_tuple(ptr, fd, path, length, offset, prot)); } - // Pre-populate the buffer from the GekkoFS file (read-write mapping) - // Treat pread failure as no-data, not fatal. gkfs::syscall::gkfs_pread(fd, ptr, length, offset); + if(!(prot & PROT_WRITE) && prot != (prot | PROT_READ | PROT_WRITE)) { + ::mprotect(ptr, length, prot); + } + return ptr; } @@ -205,7 +208,7 @@ gkfs_munmap(void* addr, size_t length) { } } } - free(addr); + ::munmap(addr, map_length); return 0; } return -1; -- GitLab From ed1369a79284c2d49529564f74a4f41edd59037e Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 08:53:50 +0100 Subject: [PATCH 12/68] flush on pread --- include/client/gkfs_functions.hpp | 3 +++ src/client/gkfs_functions.cpp | 35 +++++++++++++++++++++++++++++++ src/client/gkfs_libc.cpp | 21 +++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index b1056c7c2..d49837764 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -145,6 +145,9 @@ gkfs_read_ws(const gkfs::filemap::OpenFile& file, char* buf, size_t count, ssize_t gkfs_pread(int fd, void* buf, size_t count, off64_t offset); +void +gkfs_mmap_flush_for_path(const std::string& path); + ssize_t gkfs_read(int fd, void* buf, size_t count); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 4686b1167..634c0c03f 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -98,6 +98,41 @@ get_mmap_set_mutex() { namespace gkfs::syscall { +/** + * Flush any pending write-mmap for `path` directly to the GekkoFS daemon. + * + * This is called at the start of every gkfs_pread so that a reader always + * sees data written through an mmap mapping that hasn't been munmap'd yet. + * Without this, the daemon has 0 bytes and reads return EOF. + */ +void +gkfs_mmap_flush_for_path(const std::string& path) { + std::unique_lock lock(get_mmap_set_mutex()); + for(const auto& entry : get_mmap_set()) { + const std::string& mpath = std::get<2>(entry); + int prot = std::get<5>(entry); + if(mpath != path || !(prot & PROT_WRITE)) + continue; + void* addr = std::get<0>(entry); + size_t len = std::get<3>(entry); + off_t off = std::get<4>(entry); + int fd = std::get<1>(entry); + lock.unlock(); + // Try fd first (may still be valid), fall back to path-based RPC. + auto gkfs_fd = CTX->file_map()->get(fd); + if(gkfs_fd) { + gkfs::syscall::gkfs_pwrite(fd, addr, len, off); + } else { + auto [werr, wsize] = + gkfs::rpc::forward_write(path, addr, off, len, 0); + if(!werr) { + gkfs::utils::update_file_size(path, len, off, false, false); + } + } + return; // Only one active write mapping per path expected + } +} + void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 7c028aaba..679623ce4 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -77,6 +77,12 @@ #include #include +// Forward declaration: defined in gkfs_functions.cpp +namespace gkfs::syscall { +void +gkfs_mmap_flush_for_path(const std::string& path); +} // namespace gkfs::syscall + //========================= Global Atomics and Variables //=======================// @@ -868,6 +874,11 @@ close_range(unsigned int low, unsigned int high, int flags) { ssize_t read(int fd, void* buf, size_t nbyte) { gkfs_init_routine_placeholder(); + if(CTX->interception_enabled() && is_gkfs_fd(fd)) { + auto gkfs_fd = CTX->file_map()->get(fd); + if(gkfs_fd) + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()); + } GKFS_OPERATION(read, fd, buf, nbyte); GKFS_FALLBACK(read, fd, buf, nbyte); } @@ -882,6 +893,11 @@ write(int fd, const void* buf, size_t nbyte) { ssize_t pread(int fd, void* buf, size_t count, off_t offset) { gkfs_init_routine_placeholder(); + if(CTX->interception_enabled() && is_gkfs_fd(fd)) { + auto gkfs_fd = CTX->file_map()->get(fd); + if(gkfs_fd) + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()); + } GKFS_OPERATION(pread, fd, buf, count, offset); GKFS_FALLBACK(pread, fd, buf, count, offset); } @@ -896,6 +912,11 @@ pwrite(int fd, const void* buf, size_t count, off_t offset) { ssize_t pread64(int fd, void* buf, size_t count, off64_t offset) { gkfs_init_routine_placeholder(); + if(CTX->interception_enabled() && is_gkfs_fd(fd)) { + auto gkfs_fd = CTX->file_map()->get(fd); + if(gkfs_fd) + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()); + } GKFS_OPERATION(pread, fd, buf, count, offset); // GekkoFS pread likely handles large offsets GKFS_FALLBACK(pread64, fd, buf, count, offset); -- GitLab From c29b89c6a6cfcfe0c1d1f7329a2a39d826a54944 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 11:27:27 +0100 Subject: [PATCH 13/68] fix mmap --- src/client/gkfs_functions.cpp | 63 +++++++++++++++++++++++------------ src/client/gkfs_libc.cpp | 28 +++++++++------- 2 files changed, 57 insertions(+), 34 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 634c0c03f..4087c3671 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -75,7 +75,20 @@ using namespace std; namespace { -using mmap_entry = std::tuple; +struct mmap_entry { + void* addr; + int fd; + std::string path; + size_t length; + off_t offset; + int prot; + + // We use addr as the unique key for the set + bool + operator<(const mmap_entry& other) const { + return addr < other.addr; + } +}; // Tracks active GekkoFS mmap regions. // Stores (addr, fd, path, length, offset, prot). @@ -109,15 +122,22 @@ void gkfs_mmap_flush_for_path(const std::string& path) { std::unique_lock lock(get_mmap_set_mutex()); for(const auto& entry : get_mmap_set()) { - const std::string& mpath = std::get<2>(entry); - int prot = std::get<5>(entry); - if(mpath != path || !(prot & PROT_WRITE)) + if(entry.path != path || !(entry.prot & PROT_WRITE)) continue; - void* addr = std::get<0>(entry); - size_t len = std::get<3>(entry); - off_t off = std::get<4>(entry); - int fd = std::get<1>(entry); + + void* addr = entry.addr; + size_t len = entry.length; + off_t off = entry.offset; + int fd = entry.fd; lock.unlock(); + + // Ensure pages are present in process memory before Mercury tries to + // register them (NA_Mem_register). This avoids EFAULT during bulk IO. + volatile char* p = static_cast(addr); + for(size_t i = 0; i < len; i += 4096) { + (void) p[i]; + } + // Try fd first (may still be valid), fall back to path-based RPC. auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd) { @@ -160,8 +180,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Register mapping under lock so concurrent threads don't race on mmap_set { std::lock_guard lock(get_mmap_set_mutex()); - get_mmap_set().insert( - std::make_tuple(ptr, fd, path, length, offset, prot)); + get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot}); } gkfs::syscall::gkfs_pread(fd, ptr, length, offset); @@ -181,14 +200,14 @@ gkfs_msync(void* addr, size_t length, int flags) { // full mapping length stored in mmap_set. auto it = std::find_if( get_mmap_set().begin(), get_mmap_set().end(), - [addr](const auto& t) { return std::get<0>(t) == addr; }); + [addr](const mmap_entry& t) { return t.addr == addr; }); if(it != get_mmap_set().end()) { - int fd = std::get<1>(*it); - const std::string path = std::get<2>(*it); - size_t map_length = std::get<3>(*it); // use stored length, not caller's - off_t offset = std::get<4>(*it); - int prot = std::get<5>(*it); + int fd = it->fd; + const std::string path = it->path; + size_t map_length = it->length; // use stored length, not caller's + off_t offset = it->offset; + int prot = it->prot; lock.unlock(); // Release lock before I/O if(prot & PROT_WRITE) { auto gkfs_fd = CTX->file_map()->get(fd); @@ -217,13 +236,13 @@ gkfs_munmap(void* addr, size_t length) { std::unique_lock lock(get_mmap_set_mutex()); auto it = std::find_if( get_mmap_set().begin(), get_mmap_set().end(), - [&addr](const mmap_entry& t) { return std::get<0>(t) == addr; }); + [&addr](const mmap_entry& t) { return t.addr == addr; }); if(it != get_mmap_set().end()) { - int fd = std::get<1>(*it); - std::string path = std::get<2>(*it); - size_t map_length = std::get<3>(*it); - off_t offset = std::get<4>(*it); - int prot = std::get<5>(*it); + int fd = it->fd; + std::string path = it->path; + size_t map_length = it->length; + off_t offset = it->offset; + int prot = it->prot; get_mmap_set().erase(it); lock.unlock(); // release lock before free to avoid holding it longer // Flush dirty pages back before freeing. diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 679623ce4..33d5b831f 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -2641,22 +2641,26 @@ fopen(const char* path, const char* mode) { int flags = 0; mode_t open_mode = 0666; // Default mode for creation - // Simplified mode parsing (from original, needs to be robust) - if(strchr(mode, 'a')) { - flags = O_WRONLY | O_CREAT | O_APPEND; - } else if(strchr(mode, 'w')) { - flags = O_WRONLY | O_CREAT | O_TRUNC; - } else if(strchr(mode, 'r')) { + // Handle fopen modes correctly according to POSIX + if(mode[0] == 'r') { flags = O_RDONLY; + } else if(mode[0] == 'w') { + flags = O_WRONLY | O_CREAT | O_TRUNC; + } else if(mode[0] == 'a') { + flags = O_WRONLY | O_CREAT | O_APPEND; } else { errno = EINVAL; return nullptr; - } // Invalid mode start + } + + // Handle '+' for read/write + if(strchr(mode, '+')) { + flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR; + } - if(strchr(mode, '+')) { // r+, w+, a+ - flags &= ~(O_RDONLY | O_WRONLY); // Clear O_RDONLY/O_WRONLY - // if set by r/w/a - flags |= O_RDWR; + // Handle 'x' for exclusive creation (if GekkoFS supports it) + if(strchr(mode, 'x')) { + flags |= O_EXCL; } // 'b' (binary) is ignored on POSIX for open() flags. // 'x' (O_EXCL) could be handled if GekkoFS open supports it. @@ -2908,7 +2912,7 @@ fputs(const char* str, FILE* stream) { // stream->_flags |= _IO_ERR_SEEN; return EOF; } - return 0; // Success (non-negative value for fputs) + return 0; // Only one active write mapping per path expected } GKFS_FALLBACK(fputs, str, stream); } -- GitLab From f5fd2449b257e125f8c51d2ccfa705ac4347fa83 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 11:46:38 +0100 Subject: [PATCH 14/68] fix --- src/client/gkfs_data.cpp | 8 ++++++-- src/client/gkfs_functions.cpp | 16 ++++++++++------ src/client/gkfs_metadata.cpp | 15 +++++++++------ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 3533ce3ef..9e018f088 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -610,8 +610,12 @@ gkfs_pread(int fd, void* buf, size_t count, off64_t offset) { ssize_t gkfs_read(int fd, void* buf, size_t count) { auto gkfs_fd = CTX->file_map()->get(fd); - if(!gkfs_fd) - return 0; + if(!gkfs_fd) { + errno = EBADF; + return -1; + } + LOG(INFO, "{}() reading path '{}' count {} pos {}", __func__, + gkfs_fd->path(), count, gkfs_fd->pos()); auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, pos); // Update offset in file descriptor in the file map diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 4087c3671..5dde50760 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -120,6 +120,7 @@ namespace gkfs::syscall { */ void gkfs_mmap_flush_for_path(const std::string& path) { + LOG(INFO, "{}() path: '{}'", __func__, path); std::unique_lock lock(get_mmap_set_mutex()); for(const auto& entry : get_mmap_set()) { if(entry.path != path || !(entry.prot & PROT_WRITE)) @@ -129,20 +130,23 @@ gkfs_mmap_flush_for_path(const std::string& path) { size_t len = entry.length; off_t off = entry.offset; int fd = entry.fd; - lock.unlock(); + + LOG(INFO, "{}() Flushing mmap for path '{}' addr {} len {}", __func__, + path, addr, len); // Ensure pages are present in process memory before Mercury tries to // register them (NA_Mem_register). This avoids EFAULT during bulk IO. + // We hold the lock during this phase to ensure the mapping is NOT + // unmapped by another thread while we touch it. volatile char* p = static_cast(addr); for(size_t i = 0; i < len; i += 4096) { (void) p[i]; } - // Try fd first (may still be valid), fall back to path-based RPC. - auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) { - gkfs::syscall::gkfs_pwrite(fd, addr, len, off); - } else { + lock.unlock(); // Release lock before RPC (safe as we return immediately + // after) + + if(len > 0) { auto [werr, wsize] = gkfs::rpc::forward_write(path, addr, off, len, 0); if(!werr) { diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 5df8bd58f..c4917753b 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -871,7 +871,12 @@ gkfs_statvfs(struct statvfs* buf) { */ off_t gkfs_lseek(unsigned int fd, off_t offset, unsigned int whence) { - return gkfs_lseek(CTX->file_map()->get(fd), offset, whence); + auto gkfs_fd = CTX->file_map()->get(fd); + if(!gkfs_fd) { + errno = EBADF; + return -1; + } + return gkfs_lseek(gkfs_fd, offset, whence); } /** @@ -1784,15 +1789,13 @@ gkfs_close(unsigned int fd) { if(CTX->use_dentry_cache() && gkfs::config::cache::clear_dentry_cache_on_close) { // clear cache for directory - if(CTX->file_map()->get(fd)->type() == - gkfs::filemap::FileType::directory) { - CTX->dentry_cache()->clear_dir( - CTX->file_map()->get(fd)->path()); + if(file->type() == gkfs::filemap::FileType::directory) { + CTX->dentry_cache()->clear_dir(file->path()); } } if(CTX->protect_files_generator()) { - auto path = CTX->file_map()->get(fd)->path(); + auto path = file->path(); generate_lock_file(path, false); } // No call to the daemon is required -- GitLab From 19ebd6e20b0efe8a7866dfd261f5cbe24d1ec3e8 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 12:07:08 +0100 Subject: [PATCH 15/68] fix --- src/client/gkfs_data.cpp | 2 +- src/client/gkfs_functions.cpp | 5 +++-- src/client/gkfs_libc.cpp | 3 +++ src/client/gkfs_metadata.cpp | 4 ++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 9e018f088..817cc9a64 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -614,7 +614,7 @@ gkfs_read(int fd, void* buf, size_t count) { errno = EBADF; return -1; } - LOG(INFO, "{}() reading path '{}' count {} pos {}", __func__, + LOG(DEBUG, "{}() reading path '{}' count {} pos {}", __func__, gkfs_fd->path(), count, gkfs_fd->pos()); auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, pos); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 5dde50760..ea3747b48 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -120,7 +120,7 @@ namespace gkfs::syscall { */ void gkfs_mmap_flush_for_path(const std::string& path) { - LOG(INFO, "{}() path: '{}'", __func__, path); + LOG(DEBUG, "{}() path: '{}'", __func__, path); std::unique_lock lock(get_mmap_set_mutex()); for(const auto& entry : get_mmap_set()) { if(entry.path != path || !(entry.prot & PROT_WRITE)) @@ -131,7 +131,7 @@ gkfs_mmap_flush_for_path(const std::string& path) { off_t off = entry.offset; int fd = entry.fd; - LOG(INFO, "{}() Flushing mmap for path '{}' addr {} len {}", __func__, + LOG(DEBUG, "{}() Flushing mmap for path '{}' addr {} len {}", __func__, path, addr, len); // Ensure pages are present in process memory before Mercury tries to @@ -153,6 +153,7 @@ gkfs_mmap_flush_for_path(const std::string& path) { gkfs::utils::update_file_size(path, len, off, false, false); } } + LOG(DEBUG, "{}() Flush completed for path '{}'", __func__, path); return; // Only one active write mapping per path expected } } diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 33d5b831f..f25b8e55f 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -874,6 +874,7 @@ close_range(unsigned int low, unsigned int high, int flags) { ssize_t read(int fd, void* buf, size_t nbyte) { gkfs_init_routine_placeholder(); + DEBUG_INFO("read(fd={}, nbyte={})", fd, nbyte); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd) @@ -893,6 +894,7 @@ write(int fd, const void* buf, size_t nbyte) { ssize_t pread(int fd, void* buf, size_t count, off_t offset) { gkfs_init_routine_placeholder(); + DEBUG_INFO("pread(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd) @@ -912,6 +914,7 @@ pwrite(int fd, const void* buf, size_t count, off_t offset) { ssize_t pread64(int fd, void* buf, size_t count, off64_t offset) { gkfs_init_routine_placeholder(); + DEBUG_INFO("pread64(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd) diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index c4917753b..733a189b0 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -869,8 +869,8 @@ gkfs_statvfs(struct statvfs* buf) { * @param whence * @return 0 on success, -1 on failure */ -off_t -gkfs_lseek(unsigned int fd, off_t offset, unsigned int whence) { +off64_t +gkfs_lseek(unsigned int fd, off64_t offset, unsigned int whence) { auto gkfs_fd = CTX->file_map()->get(fd); if(!gkfs_fd) { errno = EBADF; -- GitLab From 71a757a596c6b312ee37bb5a574c7fc6dfddf8dc Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 12:27:45 +0100 Subject: [PATCH 16/68] more mmap debug --- src/client/gkfs_functions.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index ea3747b48..b80c4b7f9 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -120,10 +120,18 @@ namespace gkfs::syscall { */ void gkfs_mmap_flush_for_path(const std::string& path) { - LOG(DEBUG, "{}() path: '{}'", __func__, path); + std::string path_copy = path; // Copy to be safe + LOG(DEBUG, "{}() entering for path: '{}'", __func__, path_copy); std::unique_lock lock(get_mmap_set_mutex()); - for(const auto& entry : get_mmap_set()) { - if(entry.path != path || !(entry.prot & PROT_WRITE)) + LOG(DEBUG, "{}() lock acquired for path: '{}'", __func__, path_copy); + + auto& mmap_set = get_mmap_set(); + LOG(DEBUG, "{}() mmap_set size: {}", __func__, mmap_set.size()); + + for(const auto& entry : mmap_set) { + LOG(DEBUG, "{}() checking entry.path: '{}' vs '{}'", __func__, + entry.path, path_copy); + if(entry.path != path_copy || !(entry.prot & PROT_WRITE)) continue; void* addr = entry.addr; @@ -131,8 +139,8 @@ gkfs_mmap_flush_for_path(const std::string& path) { off_t off = entry.offset; int fd = entry.fd; - LOG(DEBUG, "{}() Flushing mmap for path '{}' addr {} len {}", __func__, - path, addr, len); + LOG(DEBUG, "{}() Matched! Flushing mmap for path '{}' addr {} len {}", + __func__, path_copy, addr, len); // Ensure pages are present in process memory before Mercury tries to // register them (NA_Mem_register). This avoids EFAULT during bulk IO. @@ -148,14 +156,17 @@ gkfs_mmap_flush_for_path(const std::string& path) { if(len > 0) { auto [werr, wsize] = - gkfs::rpc::forward_write(path, addr, off, len, 0); + gkfs::rpc::forward_write(path_copy, addr, off, len, 0); if(!werr) { - gkfs::utils::update_file_size(path, len, off, false, false); + gkfs::utils::update_file_size(path_copy, len, off, false, + false); } } - LOG(DEBUG, "{}() Flush completed for path '{}'", __func__, path); + LOG(DEBUG, "{}() Flush completed for path '{}'", __func__, path_copy); return; // Only one active write mapping per path expected } + LOG(DEBUG, "{}() exiting (no match found) for path: '{}'", __func__, + path_copy); } void* -- GitLab From 3571df9281920aa434923adc416b0a06e93e357e Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 12:39:22 +0100 Subject: [PATCH 17/68] fsegment mmap --- src/client/gkfs_data.cpp | 46 +++-- src/client/gkfs_functions.cpp | 304 ++++++++++++++++++++++++---------- src/client/hooks.cpp | 12 +- 3 files changed, 252 insertions(+), 110 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 817cc9a64..865fe6f82 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -353,8 +353,10 @@ gkfs_write_ws(gkfs::filemap::OpenFile& file, const char* buf, size_t count, ssize_t gkfs_pwrite(int fd, const void* buf, size_t count, off64_t offset) { auto file = CTX->file_map()->get(fd); - if(!file) - return 0; + if(!file) { + errno = EBADF; + return -1; + } return gkfs_write_ws(*file, reinterpret_cast(buf), count, offset); } @@ -370,8 +372,10 @@ gkfs_pwrite(int fd, const void* buf, size_t count, off64_t offset) { ssize_t gkfs_write(int fd, const void* buf, size_t count) { auto gkfs_fd = CTX->file_map()->get(fd); - if(!gkfs_fd) - return 0; + if(!gkfs_fd) { + errno = EBADF; + return -1; + } // call pwrite and update pos auto ret = gkfs_write_ws(*gkfs_fd, reinterpret_cast(buf), count, gkfs_fd->pos(), true); @@ -391,8 +395,10 @@ ssize_t gkfs_pwritev(int fd, const struct iovec* iov, int iovcnt, off_t offset) { auto file = CTX->file_map()->get(fd); - if(!file) - return 0; + if(!file) { + errno = EBADF; + return -1; + } auto pos = offset; // keep track of current position ssize_t written = 0; ssize_t ret; @@ -432,8 +438,10 @@ ssize_t gkfs_writev(int fd, const struct iovec* iov, int iovcnt) { auto gkfs_fd = CTX->file_map()->get(fd); - if(!gkfs_fd) - return 0; + if(!gkfs_fd) { + errno = EBADF; + return -1; + } auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_pwritev(fd, iov, iovcnt, pos); assert(ret != 0); @@ -594,8 +602,10 @@ gkfs_read_ws(const gkfs::filemap::OpenFile& file, char* buf, size_t count, ssize_t gkfs_pread(int fd, void* buf, size_t count, off64_t offset) { auto gkfs_fd = CTX->file_map()->get(fd); - if(!gkfs_fd) - return 0; + if(!gkfs_fd) { + errno = EBADF; + return -1; + } return gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, offset); } @@ -638,8 +648,10 @@ ssize_t gkfs_preadv(int fd, const struct iovec* iov, int iovcnt, off_t offset) { auto file = CTX->file_map()->get(fd); - if(!file) - return 0; + if(!file) { + errno = EBADF; + return -1; + } auto pos = offset; // keep track of current position ssize_t read = 0; ssize_t ret; @@ -679,8 +691,10 @@ ssize_t gkfs_readv(int fd, const struct iovec* iov, int iovcnt) { auto gkfs_fd = CTX->file_map()->get(fd); - if(!gkfs_fd) - return 0; + if(!gkfs_fd) { + errno = EBADF; + return -1; + } auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_preadv(fd, iov, iovcnt, pos); assert(ret != 0); @@ -695,8 +709,8 @@ int gkfs_fsync(unsigned int fd) { auto file = CTX->file_map()->get(fd); if(!file) { - errno = 0; - return 0; + errno = EBADF; + return -1; } // flush write size cache to be server consistent if(CTX->use_write_size_cache()) { diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index b80c4b7f9..b69bdefe0 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -55,6 +55,9 @@ #include #include #include +#include +#include +#include #include #ifdef GKFS_ENABLE_CLIENT_METRICS @@ -107,6 +110,35 @@ get_mmap_set_mutex() { return mmap_set_mutex; } +using addr_type = uintptr_t; + +addr_type +ptr_to_addr(void* ptr) { + return reinterpret_cast(ptr); +} + +void* +addr_to_ptr(addr_type addr) { + return reinterpret_cast(addr); +} + +bool +overlaps(addr_type a_begin, addr_type a_end, addr_type b_begin, + addr_type b_end) { + return a_begin < b_end && b_begin < a_end; +} + +size_t +tracked_segments_for_path(const std::string& path) { + size_t segments = 0; + for(const auto& entry : get_mmap_set()) { + if(entry.path == path) { + ++segments; + } + } + return segments; +} + } // namespace namespace gkfs::syscall { @@ -120,53 +152,42 @@ namespace gkfs::syscall { */ void gkfs_mmap_flush_for_path(const std::string& path) { - std::string path_copy = path; // Copy to be safe - LOG(DEBUG, "{}() entering for path: '{}'", __func__, path_copy); - std::unique_lock lock(get_mmap_set_mutex()); - LOG(DEBUG, "{}() lock acquired for path: '{}'", __func__, path_copy); - + std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); - LOG(DEBUG, "{}() mmap_set size: {}", __func__, mmap_set.size()); + size_t flushed_segments = 0; + + LOG(DEBUG, "{}() path '{}' tracked segments {}", __func__, path, + tracked_segments_for_path(path)); for(const auto& entry : mmap_set) { - LOG(DEBUG, "{}() checking entry.path: '{}' vs '{}'", __func__, - entry.path, path_copy); - if(entry.path != path_copy || !(entry.prot & PROT_WRITE)) + if(entry.path != path || !(entry.prot & PROT_WRITE) || + entry.length == 0) continue; void* addr = entry.addr; size_t len = entry.length; off_t off = entry.offset; - int fd = entry.fd; - - LOG(DEBUG, "{}() Matched! Flushing mmap for path '{}' addr {} len {}", - __func__, path_copy, addr, len); + LOG(DEBUG, "{}() flushing segment path '{}' addr {} len {} off {}", + __func__, path, addr, len, off); // Ensure pages are present in process memory before Mercury tries to // register them (NA_Mem_register). This avoids EFAULT during bulk IO. - // We hold the lock during this phase to ensure the mapping is NOT - // unmapped by another thread while we touch it. volatile char* p = static_cast(addr); for(size_t i = 0; i < len; i += 4096) { (void) p[i]; } - lock.unlock(); // Release lock before RPC (safe as we return immediately - // after) - - if(len > 0) { - auto [werr, wsize] = - gkfs::rpc::forward_write(path_copy, addr, off, len, 0); - if(!werr) { - gkfs::utils::update_file_size(path_copy, len, off, false, - false); - } + auto [werr, wsize] = gkfs::rpc::forward_write(path, addr, off, len, 0); + if(!werr) { + gkfs::utils::update_file_size(path, len, off, false, false); + } else { + LOG(WARNING, "{}() forward_write failed for path '{}' err {}", + __func__, path, werr); } - LOG(DEBUG, "{}() Flush completed for path '{}'", __func__, path_copy); - return; // Only one active write mapping per path expected + ++flushed_segments; } - LOG(DEBUG, "{}() exiting (no match found) for path: '{}'", __func__, - path_copy); + LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, + flushed_segments); } void* @@ -211,77 +232,184 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, int // cppcheck-suppress constParameterPointer gkfs_msync(void* addr, size_t length, int flags) { + (void) flags; + if(length == 0) { + return -1; + } + + struct flush_range { + void* addr; + size_t length; + off_t offset; + int fd; + std::string path; + }; + + const auto sync_begin = ptr_to_addr(addr); + const auto sync_end = sync_begin + length; + if(sync_end < sync_begin) { + errno = EINVAL; + return -1; + } + + std::vector flush_ranges; std::unique_lock lock(get_mmap_set_mutex()); - // Find by start address; msync may pass a sub-range length so we use the - // full mapping length stored in mmap_set. - auto it = std::find_if( - get_mmap_set().begin(), get_mmap_set().end(), - [addr](const mmap_entry& t) { return t.addr == addr; }); - - if(it != get_mmap_set().end()) { - int fd = it->fd; - const std::string path = it->path; - size_t map_length = it->length; // use stored length, not caller's - off_t offset = it->offset; - int prot = it->prot; - lock.unlock(); // Release lock before I/O - if(prot & PROT_WRITE) { - auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) { - // fd is still valid, use gkfs_pwrite - gkfs::syscall::gkfs_pwrite(fd, addr, map_length, offset); - } else { - // fd is no longer valid (e.g. s3d.x closes fd before munmap). - // Write data directly and update the size metadata. - auto [werr, wsize] = gkfs::rpc::forward_write( - path, addr, offset, map_length, 0); - if(!werr) { - gkfs::utils::update_file_size(path, map_length, offset, - false, false); - } + for(const auto& entry : get_mmap_set()) { + if(!(entry.prot & PROT_WRITE) || entry.length == 0) { + continue; + } + const auto entry_begin = ptr_to_addr(entry.addr); + const auto entry_end = entry_begin + entry.length; + if(!overlaps(entry_begin, entry_end, sync_begin, sync_end)) { + continue; + } + + const auto flush_begin = std::max(entry_begin, sync_begin); + const auto flush_end = std::min(entry_end, sync_end); + const auto relative = flush_begin - entry_begin; + flush_ranges.push_back( + flush_range{addr_to_ptr(flush_begin), + static_cast(flush_end - flush_begin), + static_cast(entry.offset + relative), + entry.fd, entry.path}); + } + if(flush_ranges.empty()) { + return -1; + } + + for(const auto& range : flush_ranges) { + auto gkfs_fd = CTX->file_map()->get(range.fd); + if(gkfs_fd) { + auto ret = gkfs::syscall::gkfs_pwrite(range.fd, range.addr, + range.length, range.offset); + if(ret >= 0) { + continue; } } - return 0; + auto [werr, wsize] = gkfs::rpc::forward_write( + range.path, range.addr, range.offset, range.length, 0); + if(!werr) { + gkfs::utils::update_file_size(range.path, range.length, + range.offset, false, false); + } else { + LOG(WARNING, "{}() forward_write failed for path '{}' err {}", + __func__, range.path, werr); + } } - return -1; + return 0; } int gkfs_munmap(void* addr, size_t length) { + if(length == 0) { + errno = EINVAL; + return -1; + } + + struct flush_range { + void* addr; + size_t length; + off_t offset; + int fd; + std::string path; + }; + + const auto unmap_begin = ptr_to_addr(addr); + const auto unmap_end = unmap_begin + length; + if(unmap_end < unmap_begin) { + errno = EINVAL; + return -1; + } + + std::vector flush_ranges; + std::vector new_entries; + std::unique_lock lock(get_mmap_set_mutex()); - auto it = std::find_if( - get_mmap_set().begin(), get_mmap_set().end(), - [&addr](const mmap_entry& t) { return t.addr == addr; }); - if(it != get_mmap_set().end()) { - int fd = it->fd; - std::string path = it->path; - size_t map_length = it->length; - off_t offset = it->offset; - int prot = it->prot; - get_mmap_set().erase(it); - lock.unlock(); // release lock before free to avoid holding it longer - // Flush dirty pages back before freeing. - if(prot & PROT_WRITE) { - auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) { - // fd is still valid, use gkfs_pwrite - gkfs::syscall::gkfs_pwrite(fd, addr, map_length, offset); - } else { - // fd is no longer valid (e.g. s3d.x closes fd before munmap). - // Write data directly and update the size metadata. - auto [werr, wsize] = gkfs::rpc::forward_write( - path, addr, offset, map_length, 0); - if(!werr) { - gkfs::utils::update_file_size(path, map_length, offset, - false, false); - } + bool handled = false; + for(const auto& entry : get_mmap_set()) { + const auto entry_begin = ptr_to_addr(entry.addr); + const auto entry_end = entry_begin + entry.length; + + if(!overlaps(entry_begin, entry_end, unmap_begin, unmap_end)) { + new_entries.push_back(entry); + continue; + } + + handled = true; + const auto cut_begin = std::max(entry_begin, unmap_begin); + const auto cut_end = std::min(entry_end, unmap_end); + + LOG(DEBUG, "{}() path '{}' unmap overlap entry [{}..{}) cut [{}..{})", + __func__, entry.path, static_cast(entry_begin), + static_cast(entry_end), + static_cast(cut_begin), + static_cast(cut_end)); + + if((entry.prot & PROT_WRITE) && cut_end > cut_begin) { + const auto rel = cut_begin - entry_begin; + flush_ranges.push_back( + flush_range{addr_to_ptr(cut_begin), + static_cast(cut_end - cut_begin), + static_cast(entry.offset + rel), + entry.fd, entry.path}); + } + + if(entry_begin < cut_begin) { + auto left = entry; + left.addr = addr_to_ptr(entry_begin); + left.length = static_cast(cut_begin - entry_begin); + left.offset = entry.offset; + new_entries.push_back(std::move(left)); + } + + if(cut_end < entry_end) { + auto right = entry; + right.addr = addr_to_ptr(cut_end); + right.length = static_cast(entry_end - cut_end); + right.offset = + static_cast(entry.offset + (cut_end - entry_begin)); + new_entries.push_back(std::move(right)); + } + } + + if(!handled) { + return -1; + } + + for(const auto& range : flush_ranges) { + auto gkfs_fd = CTX->file_map()->get(range.fd); + if(gkfs_fd) { + auto ret = gkfs::syscall::gkfs_pwrite(range.fd, range.addr, + range.length, range.offset); + if(ret >= 0) { + continue; } } - ::munmap(addr, map_length); - return 0; + auto [werr, wsize] = gkfs::rpc::forward_write( + range.path, range.addr, range.offset, range.length, 0); + if(!werr) { + gkfs::utils::update_file_size(range.path, range.length, + range.offset, false, false); + } else { + LOG(WARNING, "{}() forward_write failed for path '{}' err {}", + __func__, range.path, werr); + } + } + + if(::munmap(addr, length) != 0) { + return -1; + } + + get_mmap_set().clear(); + for(const auto& entry : new_entries) { + get_mmap_set().insert(entry); } - return -1; + + LOG(DEBUG, "{}() unmap [{}..{}) updated tracked segments {}", __func__, + static_cast(unmap_begin), + static_cast(unmap_end), get_mmap_set().size()); + return 0; } @@ -349,4 +477,4 @@ gkfs_getsingleserverdir_filtered(const std::string& path, int server, path, server, start_key, filter_name, filter_size, filter_ctime); } -} // namespace gkfs::syscall \ No newline at end of file +} // namespace gkfs::syscall diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 2b6f8cb10..a0a68188a 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -216,8 +216,8 @@ hook_fstat(unsigned int fd, struct stat* buf) { return -EFAULT; } - if(CTX->file_map()->exist(fd)) { - auto path = CTX->file_map()->get(fd)->path(); + if(auto file = CTX->file_map()->get(fd)) { + auto path = file->path(); if(gkfs::config::metadata::rename_support) { // Special case for fstat and rename, fd points to new file... // We can change file_map and recall @@ -612,8 +612,8 @@ hook_ftruncate(unsigned int fd, unsigned long length) { LOG(DEBUG, "{}() called with fd: {}, offset: {}", __func__, fd, length); - if(CTX->file_map()->exist(fd)) { - auto path = CTX->file_map()->get(fd)->path(); + if(auto file = CTX->file_map()->get(fd)) { + auto path = file->path(); return with_errno(gkfs::syscall::gkfs_truncate(path, length)); } return gsl::narrow_cast( @@ -983,8 +983,8 @@ hook_futimens(unsigned int fd, const struct timespec times[2]) { LOG(DEBUG, "{}() called with fd: {}, times: {}", __func__, fd, fmt::ptr(times)); - if(CTX->file_map()->exist(fd)) { - auto path = CTX->file_map()->get(fd)->path(); + if(auto file = CTX->file_map()->get(fd)) { + auto path = file->path(); return with_errno(gkfs::syscall::gkfs_utimensat(path, times)); } return gsl::narrow_cast( -- GitLab From 89eb113b78c22025b93922e59bfcc65b8dbeb797 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 13:25:42 +0100 Subject: [PATCH 18/68] segment mmap --- src/client/gkfs_functions.cpp | 43 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index b69bdefe0..bc1af6d36 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -155,14 +155,18 @@ gkfs_mmap_flush_for_path(const std::string& path) { std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); size_t flushed_segments = 0; + const long page_size = ::sysconf(_SC_PAGESIZE); LOG(DEBUG, "{}() path '{}' tracked segments {}", __func__, path, tracked_segments_for_path(path)); - for(const auto& entry : mmap_set) { + for(auto it = mmap_set.begin(); it != mmap_set.end();) { + const auto& entry = *it; if(entry.path != path || !(entry.prot & PROT_WRITE) || - entry.length == 0) + entry.length == 0) { + ++it; continue; + } void* addr = entry.addr; size_t len = entry.length; @@ -170,21 +174,42 @@ gkfs_mmap_flush_for_path(const std::string& path) { LOG(DEBUG, "{}() flushing segment path '{}' addr {} len {} off {}", __func__, path, addr, len, off); - // Ensure pages are present in process memory before Mercury tries to - // register them (NA_Mem_register). This avoids EFAULT during bulk IO. - volatile char* p = static_cast(addr); - for(size_t i = 0; i < len; i += 4096) { - (void) p[i]; + // Avoid raw page dereferences here: stale segments can exist when + // munmap happens outside the interceptor path. mincore() lets us detect + // unmapped pages safely and drop stale tracking entries. + bool mapped = true; + if(page_size > 0) { + const auto start = ptr_to_addr(addr); + const auto end = start + len; + unsigned char mincore_vec = 0; + for(auto p = start; p < end; + p += static_cast(page_size)) { + if(::mincore(addr_to_ptr(p), static_cast(page_size), + &mincore_vec) != 0) { + mapped = false; + break; + } + } + } + + if(!mapped) { + LOG(WARNING, + "{}() dropping stale mmap segment path '{}' addr {} len {}", + __func__, path, addr, len); + it = mmap_set.erase(it); + continue; } auto [werr, wsize] = gkfs::rpc::forward_write(path, addr, off, len, 0); if(!werr) { gkfs::utils::update_file_size(path, len, off, false, false); } else { - LOG(WARNING, "{}() forward_write failed for path '{}' err {}", - __func__, path, werr); + LOG(WARNING, + "{}() forward_write failed for path '{}' err {} (addr {} len {})", + __func__, path, werr, addr, len); } ++flushed_segments; + ++it; } LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, flushed_segments); -- GitLab From 173eb82f8ecc116a03bf2685b4bb7f8466e103cf Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 13:47:25 +0100 Subject: [PATCH 19/68] chunked mmap --- src/client/gkfs_functions.cpp | 91 ++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index bc1af6d36..37069877d 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -139,6 +139,48 @@ tracked_segments_for_path(const std::string& path) { return segments; } +constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; + +int +flush_range_chunked(const std::string& path, const void* addr, size_t len, + off_t off) { + const auto* base = static_cast(addr); + size_t total = 0; + + while(total < len) { + const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); + const off_t chunk_off = + static_cast(off + static_cast(total)); + auto* chunk_ptr = const_cast(base + total); + + auto [werr, wsize] = + gkfs::rpc::forward_write(path, chunk_ptr, chunk_off, chunk, 0); + if(werr) { + errno = werr; + LOG(WARNING, + "{}() forward_write failed path '{}' off {} len {} err {}", + __func__, path, chunk_off, chunk, werr); + return -1; + } + if(wsize <= 0) { + errno = EIO; + LOG(WARNING, + "{}() forward_write invalid size {} path '{}' off {} len {}", + __func__, wsize, path, chunk_off, chunk); + return -1; + } + + const size_t wrote = static_cast(wsize); + gkfs::utils::update_file_size(path, wrote, chunk_off, false, false); + total += wrote; + if(wrote < chunk) { + break; + } + } + + return static_cast(total); +} + } // namespace namespace gkfs::syscall { @@ -200,13 +242,10 @@ gkfs_mmap_flush_for_path(const std::string& path) { continue; } - auto [werr, wsize] = gkfs::rpc::forward_write(path, addr, off, len, 0); - if(!werr) { - gkfs::utils::update_file_size(path, len, off, false, false); - } else { + if(flush_range_chunked(path, addr, len, off) < 0) { LOG(WARNING, - "{}() forward_write failed for path '{}' err {} (addr {} len {})", - __func__, path, werr, addr, len); + "{}() chunked flush failed for path '{}' (addr {} len {} off {})", + __func__, path, addr, len, off); } ++flushed_segments; ++it; @@ -303,23 +342,9 @@ gkfs_msync(void* addr, size_t length, int flags) { } for(const auto& range : flush_ranges) { - auto gkfs_fd = CTX->file_map()->get(range.fd); - if(gkfs_fd) { - auto ret = gkfs::syscall::gkfs_pwrite(range.fd, range.addr, - range.length, range.offset); - if(ret >= 0) { - continue; - } - } - auto [werr, wsize] = gkfs::rpc::forward_write( - range.path, range.addr, range.offset, range.length, 0); - if(!werr) { - gkfs::utils::update_file_size(range.path, range.length, - range.offset, false, false); - } else { - LOG(WARNING, "{}() forward_write failed for path '{}' err {}", - __func__, range.path, werr); - } + (void) range.fd; + (void) flush_range_chunked(range.path, range.addr, range.length, + range.offset); } return 0; } @@ -403,23 +428,9 @@ gkfs_munmap(void* addr, size_t length) { } for(const auto& range : flush_ranges) { - auto gkfs_fd = CTX->file_map()->get(range.fd); - if(gkfs_fd) { - auto ret = gkfs::syscall::gkfs_pwrite(range.fd, range.addr, - range.length, range.offset); - if(ret >= 0) { - continue; - } - } - auto [werr, wsize] = gkfs::rpc::forward_write( - range.path, range.addr, range.offset, range.length, 0); - if(!werr) { - gkfs::utils::update_file_size(range.path, range.length, - range.offset, false, false); - } else { - LOG(WARNING, "{}() forward_write failed for path '{}' err {}", - __func__, range.path, werr); - } + (void) range.fd; + (void) flush_range_chunked(range.path, range.addr, range.length, + range.offset); } if(::munmap(addr, length) != 0) { -- GitLab From 6267adafdcace81af9a6be1fe322d3d021f80a45 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:00:22 +0100 Subject: [PATCH 20/68] segment --- src/client/gkfs_functions.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 37069877d..4ffdb08ca 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -146,15 +147,16 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, off_t off) { const auto* base = static_cast(addr); size_t total = 0; + std::vector bounce(k_mmap_flush_chunk_size); while(total < len) { const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); const off_t chunk_off = static_cast(off + static_cast(total)); - auto* chunk_ptr = const_cast(base + total); + std::memcpy(bounce.data(), base + total, chunk); - auto [werr, wsize] = - gkfs::rpc::forward_write(path, chunk_ptr, chunk_off, chunk, 0); + auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), + chunk_off, chunk, 0); if(werr) { errno = werr; LOG(WARNING, -- GitLab From 0f18828c944ef79dfc30a7852694bdc5a5d1d47a Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:00:35 +0100 Subject: [PATCH 21/68] segment --- src/client/gkfs_functions.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 4ffdb08ca..4b5a3e071 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -248,6 +248,8 @@ gkfs_mmap_flush_for_path(const std::string& path) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", __func__, path, addr, len, off); + it = mmap_set.erase(it); + continue; } ++flushed_segments; ++it; @@ -345,8 +347,10 @@ gkfs_msync(void* addr, size_t length, int flags) { for(const auto& range : flush_ranges) { (void) range.fd; - (void) flush_range_chunked(range.path, range.addr, range.length, - range.offset); + if(flush_range_chunked(range.path, range.addr, range.length, + range.offset) < 0) { + return -1; + } } return 0; } @@ -431,8 +435,10 @@ gkfs_munmap(void* addr, size_t length) { for(const auto& range : flush_ranges) { (void) range.fd; - (void) flush_range_chunked(range.path, range.addr, range.length, - range.offset); + if(flush_range_chunked(range.path, range.addr, range.length, + range.offset) < 0) { + return -1; + } } if(::munmap(addr, length) != 0) { -- GitLab From fc808fe0f12cc4003cb6cbca0a058b10f6dcfd5c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:17:30 +0100 Subject: [PATCH 22/68] process_vm_readv --- src/client/gkfs_functions.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 4b5a3e071..2f07e7d77 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -73,6 +73,7 @@ extern "C" { #include #include #include +#include } using namespace std; @@ -153,7 +154,22 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); const off_t chunk_off = static_cast(off + static_cast(total)); - std::memcpy(bounce.data(), base + total, chunk); + size_t copied = 0; + while(copied < chunk) { + struct iovec local_iov{bounce.data() + copied, chunk - copied}; + struct iovec remote_iov{const_cast(base + total + copied), + chunk - copied}; + const ssize_t nread = ::process_vm_readv(::getpid(), &local_iov, 1, + &remote_iov, 1, 0); + if(nread <= 0) { + LOG(WARNING, + "{}() process_vm_readv failed path '{}' src {} len {} errno {}", + __func__, path, remote_iov.iov_base, remote_iov.iov_len, + errno); + return -1; + } + copied += static_cast(nread); + } auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), chunk_off, chunk, 0); -- GitLab From fa83d52d3ad7e1df270ea4b5441b5bda04c878ed Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:25:44 +0100 Subject: [PATCH 23/68] error prop --- include/client/gkfs_functions.hpp | 2 +- src/client/gkfs_functions.cpp | 7 ++++--- src/client/gkfs_libc.cpp | 19 +++++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index d49837764..b0703b68f 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -145,7 +145,7 @@ gkfs_read_ws(const gkfs::filemap::OpenFile& file, char* buf, size_t count, ssize_t gkfs_pread(int fd, void* buf, size_t count, off64_t offset); -void +int gkfs_mmap_flush_for_path(const std::string& path); ssize_t diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 2f07e7d77..fa027d7eb 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -210,7 +210,7 @@ namespace gkfs::syscall { * sees data written through an mmap mapping that hasn't been munmap'd yet. * Without this, the daemon has 0 bytes and reads return EOF. */ -void +int gkfs_mmap_flush_for_path(const std::string& path) { std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); @@ -264,14 +264,15 @@ gkfs_mmap_flush_for_path(const std::string& path) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", __func__, path, addr, len, off); - it = mmap_set.erase(it); - continue; + errno = EIO; + return -1; } ++flushed_segments; ++it; } LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, flushed_segments); + return 0; } void* diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index f25b8e55f..2c46fc8ba 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -79,7 +79,7 @@ // Forward declaration: defined in gkfs_functions.cpp namespace gkfs::syscall { -void +int gkfs_mmap_flush_for_path(const std::string& path); } // namespace gkfs::syscall @@ -877,8 +877,9 @@ read(int fd, void* buf, size_t nbyte) { DEBUG_INFO("read(fd={}, nbyte={})", fd, nbyte); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()); + if(gkfs_fd && + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()) < 0) + return -1; } GKFS_OPERATION(read, fd, buf, nbyte); GKFS_FALLBACK(read, fd, buf, nbyte); @@ -897,8 +898,9 @@ pread(int fd, void* buf, size_t count, off_t offset) { DEBUG_INFO("pread(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()); + if(gkfs_fd && + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()) < 0) + return -1; } GKFS_OPERATION(pread, fd, buf, count, offset); GKFS_FALLBACK(pread, fd, buf, count, offset); @@ -917,8 +919,9 @@ pread64(int fd, void* buf, size_t count, off64_t offset) { DEBUG_INFO("pread64(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()); + if(gkfs_fd && + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()) < 0) + return -1; } GKFS_OPERATION(pread, fd, buf, count, offset); // GekkoFS pread likely handles large offsets @@ -3231,4 +3234,4 @@ _ZNSt10filesystem10remove_allERKNS_7__cxx114pathE( } } return real_std_fs_remove_all_ptr(p); -} \ No newline at end of file +} -- GitLab From b78e90236449129e570187b3c2721fdc59a6edfd Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:31:13 +0100 Subject: [PATCH 24/68] fix --- include/client/gkfs_functions.hpp | 3 ++- src/client/gkfs_functions.cpp | 42 +++++++++++++++++++++++++------ src/client/gkfs_libc.cpp | 15 ++++++++--- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index b0703b68f..e7d1650a2 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -146,7 +146,8 @@ ssize_t gkfs_pread(int fd, void* buf, size_t count, off64_t offset); int -gkfs_mmap_flush_for_path(const std::string& path); +gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, + size_t length); ssize_t gkfs_read(int fd, void* buf, size_t count); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index fa027d7eb..3ebec459c 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -211,14 +211,30 @@ namespace gkfs::syscall { * Without this, the daemon has 0 bytes and reads return EOF. */ int -gkfs_mmap_flush_for_path(const std::string& path) { +gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, + size_t length) { + if(offset < 0) { + errno = EINVAL; + return -1; + } + if(length == 0) { + return 0; + } + + const auto read_begin = static_cast(offset); + const auto read_end = read_begin + static_cast(length); + if(read_end < read_begin) { + errno = EINVAL; + return -1; + } + std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); size_t flushed_segments = 0; const long page_size = ::sysconf(_SC_PAGESIZE); - LOG(DEBUG, "{}() path '{}' tracked segments {}", __func__, path, - tracked_segments_for_path(path)); + LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", + __func__, path, tracked_segments_for_path(path), read_begin, read_end); for(auto it = mmap_set.begin(); it != mmap_set.end();) { const auto& entry = *it; @@ -228,10 +244,22 @@ gkfs_mmap_flush_for_path(const std::string& path) { continue; } - void* addr = entry.addr; - size_t len = entry.length; - off_t off = entry.offset; - LOG(DEBUG, "{}() flushing segment path '{}' addr {} len {} off {}", + const auto entry_begin = static_cast(entry.offset); + const auto entry_end = + entry_begin + static_cast(entry.length); + if(entry_end <= read_begin || read_end <= entry_begin) { + ++it; + continue; + } + + const auto flush_begin = std::max(entry_begin, read_begin); + const auto flush_end = std::min(entry_end, read_end); + const auto rel_begin = static_cast(flush_begin - entry_begin); + const size_t len = static_cast(flush_end - flush_begin); + void* addr = addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin); + off_t off = static_cast(flush_begin); + LOG(DEBUG, + "{}() flushing overlap path '{}' addr {} len {} file-off {}", __func__, path, addr, len, off); // Avoid raw page dereferences here: stale segments can exist when diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 2c46fc8ba..8d45f3782 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -80,7 +80,8 @@ // Forward declaration: defined in gkfs_functions.cpp namespace gkfs::syscall { int -gkfs_mmap_flush_for_path(const std::string& path); +gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, + size_t length); } // namespace gkfs::syscall @@ -878,7 +879,10 @@ read(int fd, void* buf, size_t nbyte) { if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()) < 0) + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path(), + static_cast( + gkfs_fd->pos()), + nbyte) < 0) return -1; } GKFS_OPERATION(read, fd, buf, nbyte); @@ -899,7 +903,9 @@ pread(int fd, void* buf, size_t count, off_t offset) { if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()) < 0) + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path(), + static_cast(offset), + count) < 0) return -1; } GKFS_OPERATION(pread, fd, buf, count, offset); @@ -920,7 +926,8 @@ pread64(int fd, void* buf, size_t count, off64_t offset) { if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path()) < 0) + gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path(), offset, + count) < 0) return -1; } GKFS_OPERATION(pread, fd, buf, count, -- GitLab From 01fcc673f0bb316be7eabeb9ccc35206be90521d Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:37:27 +0100 Subject: [PATCH 25/68] fix error 5 --- src/client/gkfs_functions.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 3ebec459c..07acfb176 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -142,6 +142,8 @@ tracked_segments_for_path(const std::string& path) { } constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; +constexpr int k_flush_ok = 0; +constexpr int k_flush_stale_mapping = -2; int flush_range_chunked(const std::string& path, const void* addr, size_t len, @@ -162,10 +164,14 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const ssize_t nread = ::process_vm_readv(::getpid(), &local_iov, 1, &remote_iov, 1, 0); if(nread <= 0) { + const int read_errno = errno; LOG(WARNING, "{}() process_vm_readv failed path '{}' src {} len {} errno {}", __func__, path, remote_iov.iov_base, remote_iov.iov_len, - errno); + read_errno); + if(read_errno == EFAULT || read_errno == ENOMEM) { + return k_flush_stale_mapping; + } return -1; } copied += static_cast(nread); @@ -196,7 +202,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, } } - return static_cast(total); + return (total == len) ? k_flush_ok : static_cast(total); } } // namespace @@ -288,7 +294,15 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, continue; } - if(flush_range_chunked(path, addr, len, off) < 0) { + const int flush_rc = flush_range_chunked(path, addr, len, off); + if(flush_rc == k_flush_stale_mapping) { + LOG(WARNING, + "{}() stale mmap segment during flush path '{}' (addr {} len {} off {}), dropping segment", + __func__, path, addr, len, off); + it = mmap_set.erase(it); + continue; + } + if(flush_rc < 0) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", __func__, path, addr, len, off); -- GitLab From 417fce01815834b730bc61df239be6ffe9d9d311 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:41:26 +0100 Subject: [PATCH 26/68] remove vec --- src/client/gkfs_functions.cpp | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 07acfb176..bf28f317a 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -73,7 +73,6 @@ extern "C" { #include #include #include -#include } using namespace std; @@ -143,7 +142,6 @@ tracked_segments_for_path(const std::string& path) { constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; constexpr int k_flush_ok = 0; -constexpr int k_flush_stale_mapping = -2; int flush_range_chunked(const std::string& path, const void* addr, size_t len, @@ -156,26 +154,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); const off_t chunk_off = static_cast(off + static_cast(total)); - size_t copied = 0; - while(copied < chunk) { - struct iovec local_iov{bounce.data() + copied, chunk - copied}; - struct iovec remote_iov{const_cast(base + total + copied), - chunk - copied}; - const ssize_t nread = ::process_vm_readv(::getpid(), &local_iov, 1, - &remote_iov, 1, 0); - if(nread <= 0) { - const int read_errno = errno; - LOG(WARNING, - "{}() process_vm_readv failed path '{}' src {} len {} errno {}", - __func__, path, remote_iov.iov_base, remote_iov.iov_len, - read_errno); - if(read_errno == EFAULT || read_errno == ENOMEM) { - return k_flush_stale_mapping; - } - return -1; - } - copied += static_cast(nread); - } + std::memcpy(bounce.data(), base + total, chunk); auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), chunk_off, chunk, 0); @@ -295,13 +274,6 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } const int flush_rc = flush_range_chunked(path, addr, len, off); - if(flush_rc == k_flush_stale_mapping) { - LOG(WARNING, - "{}() stale mmap segment during flush path '{}' (addr {} len {} off {}), dropping segment", - __func__, path, addr, len, off); - it = mmap_set.erase(it); - continue; - } if(flush_rc < 0) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", -- GitLab From dc0186a014b382b3ddb53d35d6e5455115cbdcfd Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:44:59 +0100 Subject: [PATCH 27/68] f1 --- src/client/gkfs_functions.cpp | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index bf28f317a..53d5c4711 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -73,6 +73,7 @@ extern "C" { #include #include #include +#include } using namespace std; @@ -142,6 +143,7 @@ tracked_segments_for_path(const std::string& path) { constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; constexpr int k_flush_ok = 0; +constexpr int k_flush_unavailable = 1; int flush_range_chunked(const std::string& path, const void* addr, size_t len, @@ -154,7 +156,31 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); const off_t chunk_off = static_cast(off + static_cast(total)); - std::memcpy(bounce.data(), base + total, chunk); + size_t copied = 0; + while(copied < chunk) { + struct iovec local_iov{bounce.data() + copied, chunk - copied}; + struct iovec remote_iov{const_cast(base + total + copied), + chunk - copied}; + const ssize_t nread = ::process_vm_readv(::getpid(), &local_iov, 1, + &remote_iov, 1, 0); + if(nread <= 0) { + const int copy_errno = errno; + if(copy_errno == EFAULT || copy_errno == ENOMEM) { + LOG(WARNING, + "{}() source became unavailable path '{}' src {} len {}", + __func__, path, remote_iov.iov_base, + remote_iov.iov_len); + return k_flush_unavailable; + } + errno = copy_errno; + LOG(WARNING, + "{}() process_vm_readv failed path '{}' src {} len {} errno {}", + __func__, path, remote_iov.iov_base, remote_iov.iov_len, + copy_errno); + return -1; + } + copied += static_cast(nread); + } auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), chunk_off, chunk, 0); @@ -274,6 +300,12 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } const int flush_rc = flush_range_chunked(path, addr, len, off); + if(flush_rc == k_flush_unavailable) { + // Mapping raced with unmap; treat as best-effort flush miss and + // continue without surfacing EIO. + ++it; + continue; + } if(flush_rc < 0) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", -- GitLab From 1ec9fc8d8740fd6aa54eb126d5348d3c05e1a999 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:50:32 +0100 Subject: [PATCH 28/68] narrow --- src/client/gkfs_functions.cpp | 2 ++ src/client/gkfs_libc.cpp | 15 ++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 53d5c4711..03416672b 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -405,6 +405,7 @@ gkfs_msync(void* addr, size_t length, int flags) { entry.fd, entry.path}); } if(flush_ranges.empty()) { + errno = ENODEV; return -1; } @@ -493,6 +494,7 @@ gkfs_munmap(void* addr, size_t length) { } if(!handled) { + errno = ENODEV; return -1; } diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 8d45f3782..971504730 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -2614,11 +2614,11 @@ msync(void* addr, size_t length, int flags) { DEBUG_INFO("[GKFS] msync handled by GekkoFS for addr={}", addr); return 0; } - // If gkfs_ret indicates "not my memory" (e.g., -1 with - // errno=ENOMEM/ENODEV), then fallback. Assuming non-zero means GekkoFS - // didn't handle it or errored internally. The original just checked for - // 0. If gkfs_msync sets errno for "not my memory", we might need to - // clear it before fallback. + if(errno != ENODEV) { + // It was a GKFS mapping path but flush/sync failed: do not silently + // bypass to libc, propagate the failure. + return -1; + } DEBUG_INFO("[BYPASS] msync for addr={}", addr); GKFS_FALLBACK(msync, addr, length, flags); } @@ -2633,6 +2633,11 @@ munmap(void* addr, size_t length) { DEBUG_INFO("[GKFS] munmap handled by GekkoFS for addr={}", addr); return 0; } + if(errno != ENODEV) { + // It was a GKFS mapping path but flush/unmap failed: do not silently + // bypass and lose dirty mmap contents. + return -1; + } DEBUG_INFO("[BYPASS] munmap for addr={}", addr); GKFS_FALLBACK(munmap, addr, length); } -- GitLab From 2ac7400313ec86c390d331613cd8633b8ad33b79 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 14:56:39 +0100 Subject: [PATCH 29/68] debug --- src/client/gkfs_functions.cpp | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 03416672b..693db8c79 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -151,26 +151,35 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); + const long page_size_l = ::sysconf(_SC_PAGESIZE); + const size_t page_size = + page_size_l > 0 ? static_cast(page_size_l) : 4096U; while(total < len) { const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); const off_t chunk_off = static_cast(off + static_cast(total)); size_t copied = 0; + bool chunk_incomplete = false; while(copied < chunk) { - struct iovec local_iov{bounce.data() + copied, chunk - copied}; + const size_t to_copy = std::min(page_size, chunk - copied); + struct iovec local_iov{bounce.data() + copied, to_copy}; struct iovec remote_iov{const_cast(base + total + copied), - chunk - copied}; + to_copy}; const ssize_t nread = ::process_vm_readv(::getpid(), &local_iov, 1, &remote_iov, 1, 0); if(nread <= 0) { const int copy_errno = errno; if(copy_errno == EFAULT || copy_errno == ENOMEM) { - LOG(WARNING, - "{}() source became unavailable path '{}' src {} len {}", - __func__, path, remote_iov.iov_base, - remote_iov.iov_len); - return k_flush_unavailable; + if(copied == 0) { + LOG(WARNING, + "{}() source unavailable before copy path '{}' src {} len {}", + __func__, path, remote_iov.iov_base, + remote_iov.iov_len); + return k_flush_unavailable; + } + chunk_incomplete = true; + break; } errno = copy_errno; LOG(WARNING, @@ -182,27 +191,31 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, copied += static_cast(nread); } + if(copied == 0) { + return k_flush_unavailable; + } + auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), - chunk_off, chunk, 0); + chunk_off, copied, 0); if(werr) { errno = werr; LOG(WARNING, "{}() forward_write failed path '{}' off {} len {} err {}", - __func__, path, chunk_off, chunk, werr); + __func__, path, chunk_off, copied, werr); return -1; } if(wsize <= 0) { errno = EIO; LOG(WARNING, "{}() forward_write invalid size {} path '{}' off {} len {}", - __func__, wsize, path, chunk_off, chunk); + __func__, wsize, path, chunk_off, copied); return -1; } const size_t wrote = static_cast(wsize); gkfs::utils::update_file_size(path, wrote, chunk_off, false, false); total += wrote; - if(wrote < chunk) { + if(wrote < copied || chunk_incomplete) { break; } } -- GitLab From 84488b0aecf18d92716a4667c2681763c87f4aca Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 15:02:00 +0100 Subject: [PATCH 30/68] fi --- include/client/gkfs_functions.hpp | 3 +++ src/client/gkfs_functions.cpp | 35 +++++++++++++++++++++++++++---- src/client/gkfs_libc.cpp | 18 +++++++--------- src/client/gkfs_metadata.cpp | 9 ++++++++ 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index e7d1650a2..bb70a8769 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -149,6 +149,9 @@ int gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, size_t length); +int +gkfs_mmap_flush_all_for_path(const std::string& path); + ssize_t gkfs_read(int fd, void* buf, size_t count); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 693db8c79..d580a1ad9 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -269,8 +269,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } const auto entry_begin = static_cast(entry.offset); - const auto entry_end = - entry_begin + static_cast(entry.length); + const auto entry_end = entry_begin + static_cast(entry.length); if(entry_end <= read_begin || read_end <= entry_begin) { ++it; continue; @@ -282,8 +281,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, const size_t len = static_cast(flush_end - flush_begin); void* addr = addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin); off_t off = static_cast(flush_begin); - LOG(DEBUG, - "{}() flushing overlap path '{}' addr {} len {} file-off {}", + LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", __func__, path, addr, len, off); // Avoid raw page dereferences here: stale segments can exist when @@ -334,6 +332,35 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, return 0; } +int +gkfs_mmap_flush_all_for_path(const std::string& path) { + std::lock_guard lock(get_mmap_set_mutex()); + size_t flushed_segments = 0; + + for(const auto& entry : get_mmap_set()) { + if(entry.path != path || !(entry.prot & PROT_WRITE) || + entry.length == 0) { + continue; + } + + const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, + entry.offset); + if(flush_rc == k_flush_unavailable) { + // Mapping is no longer readable, likely already unmapped. + continue; + } + if(flush_rc < 0) { + errno = EIO; + return -1; + } + ++flushed_segments; + } + + LOG(DEBUG, "{}() path '{}' flushed all segments {}", __func__, path, + flushed_segments); + return 0; +} + void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 971504730..17da55b4a 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -878,11 +878,9 @@ read(int fd, void* buf, size_t nbyte) { DEBUG_INFO("read(fd={}, nbyte={})", fd, nbyte); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path(), - static_cast( - gkfs_fd->pos()), - nbyte) < 0) + if(gkfs_fd && gkfs::syscall::gkfs_mmap_flush_for_path( + gkfs_fd->path(), + static_cast(gkfs_fd->pos()), nbyte) < 0) return -1; } GKFS_OPERATION(read, fd, buf, nbyte); @@ -903,9 +901,8 @@ pread(int fd, void* buf, size_t count, off_t offset) { if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path(), - static_cast(offset), - count) < 0) + gkfs::syscall::gkfs_mmap_flush_for_path( + gkfs_fd->path(), static_cast(offset), count) < 0) return -1; } GKFS_OPERATION(pread, fd, buf, count, offset); @@ -925,9 +922,8 @@ pread64(int fd, void* buf, size_t count, off64_t offset) { DEBUG_INFO("pread64(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path(gkfs_fd->path(), offset, - count) < 0) + if(gkfs_fd && gkfs::syscall::gkfs_mmap_flush_for_path( + gkfs_fd->path(), offset, count) < 0) return -1; } GKFS_OPERATION(pread, fd, buf, count, diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 733a189b0..7e81624f6 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -1770,6 +1770,15 @@ int gkfs_close(unsigned int fd) { auto file = CTX->file_map()->get(fd); if(file) { + if(file->type() == gkfs::filemap::FileType::regular) { + auto flush_err = gkfs_mmap_flush_all_for_path(file->path()); + if(flush_err < 0) { + LOG(ERROR, "{}() mmap flush failed for path '{}'", __func__, + file->path()); + return -1; + } + } + if(file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending)) { gkfs_create(file->path(), file->mode()); file->set_flag(gkfs::filemap::OpenFile_flags::creation_pending, -- GitLab From 198fec36ea74d997444e2c07835b88341476bb6c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 15:08:07 +0100 Subject: [PATCH 31/68] debug --- src/client/gkfs_functions.cpp | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index d580a1ad9..3822eeb33 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -73,7 +73,6 @@ extern "C" { #include #include #include -#include } using namespace std; @@ -151,44 +150,36 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); - const long page_size_l = ::sysconf(_SC_PAGESIZE); - const size_t page_size = - page_size_l > 0 ? static_cast(page_size_l) : 4096U; while(total < len) { const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); const off_t chunk_off = static_cast(off + static_cast(total)); + const long page_size_l = ::sysconf(_SC_PAGESIZE); + const size_t page_size = + page_size_l > 0 ? static_cast(page_size_l) : 4096U; size_t copied = 0; - bool chunk_incomplete = false; while(copied < chunk) { const size_t to_copy = std::min(page_size, chunk - copied); - struct iovec local_iov{bounce.data() + copied, to_copy}; - struct iovec remote_iov{const_cast(base + total + copied), - to_copy}; - const ssize_t nread = ::process_vm_readv(::getpid(), &local_iov, 1, - &remote_iov, 1, 0); - if(nread <= 0) { - const int copy_errno = errno; - if(copy_errno == EFAULT || copy_errno == ENOMEM) { + void* src_ptr = const_cast(base + total + copied); + unsigned char mincore_vec = 0; + if(::mincore(src_ptr, page_size, &mincore_vec) != 0) { + if(errno == EFAULT || errno == ENOMEM) { if(copied == 0) { LOG(WARNING, "{}() source unavailable before copy path '{}' src {} len {}", - __func__, path, remote_iov.iov_base, - remote_iov.iov_len); + __func__, path, src_ptr, to_copy); return k_flush_unavailable; } - chunk_incomplete = true; break; } - errno = copy_errno; LOG(WARNING, - "{}() process_vm_readv failed path '{}' src {} len {} errno {}", - __func__, path, remote_iov.iov_base, remote_iov.iov_len, - copy_errno); + "{}() mincore failed path '{}' src {} len {} errno {}", + __func__, path, src_ptr, to_copy, errno); return -1; } - copied += static_cast(nread); + std::memcpy(bounce.data() + copied, src_ptr, to_copy); + copied += to_copy; } if(copied == 0) { @@ -215,7 +206,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const size_t wrote = static_cast(wsize); gkfs::utils::update_file_size(path, wrote, chunk_off, false, false); total += wrote; - if(wrote < copied || chunk_incomplete) { + if(wrote < copied) { break; } } -- GitLab From 8ced2a98b4598daa4ef5c3d0a4eda463322bdd6a Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 16:16:35 +0100 Subject: [PATCH 32/68] seg --- src/client/gkfs_functions.cpp | 39 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 3822eeb33..fa6346071 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -69,6 +69,7 @@ extern "C" { #include // used for file types in the getdents{,64}() functions #include // used for definition of alignment macros +#include #include #include #include @@ -144,12 +145,28 @@ constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; constexpr int k_flush_ok = 0; constexpr int k_flush_unavailable = 1; +int +self_mem_fd() { + static int fd = -1; + if(fd >= 0) { + return fd; + } + fd = static_cast(::syscall_no_intercept( + SYS_openat, AT_FDCWD, "/proc/self/mem", O_RDONLY | O_CLOEXEC, 0)); + return fd; +} + int flush_range_chunked(const std::string& path, const void* addr, size_t len, off_t off) { const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); + const int mem_fd = self_mem_fd(); + if(mem_fd < 0) { + errno = EIO; + return -1; + } while(total < len) { const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); @@ -161,25 +178,27 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, size_t copied = 0; while(copied < chunk) { const size_t to_copy = std::min(page_size, chunk - copied); - void* src_ptr = const_cast(base + total + copied); - unsigned char mincore_vec = 0; - if(::mincore(src_ptr, page_size, &mincore_vec) != 0) { - if(errno == EFAULT || errno == ENOMEM) { + const auto src_off = static_cast( + reinterpret_cast(base + total + copied)); + const auto nread = static_cast(::syscall_no_intercept( + SYS_pread64, mem_fd, bounce.data() + copied, to_copy, + src_off)); + if(nread <= 0) { + if(errno == EFAULT || errno == ENOMEM || errno == EIO) { if(copied == 0) { LOG(WARNING, - "{}() source unavailable before copy path '{}' src {} len {}", - __func__, path, src_ptr, to_copy); + "{}() source unavailable before copy path '{}' off {} len {}", + __func__, path, src_off, to_copy); return k_flush_unavailable; } break; } LOG(WARNING, - "{}() mincore failed path '{}' src {} len {} errno {}", - __func__, path, src_ptr, to_copy, errno); + "{}() self-mem read failed path '{}' off {} len {} errno {}", + __func__, path, src_off, to_copy, errno); return -1; } - std::memcpy(bounce.data() + copied, src_ptr, to_copy); - copied += to_copy; + copied += static_cast(nread); } if(copied == 0) { -- GitLab From 4d9f92ba2b9306194e3faced2024f6e5cc817f9e Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 18:32:52 +0100 Subject: [PATCH 33/68] close --- src/client/gkfs_metadata.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 7e81624f6..2a2cb15f3 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -1773,9 +1773,8 @@ gkfs_close(unsigned int fd) { if(file->type() == gkfs::filemap::FileType::regular) { auto flush_err = gkfs_mmap_flush_all_for_path(file->path()); if(flush_err < 0) { - LOG(ERROR, "{}() mmap flush failed for path '{}'", __func__, + LOG(WARNING, "{}() mmap flush failed for path '{}'", __func__, file->path()); - return -1; } } -- GitLab From 16985eb44e702a79fc874931b8ac0142f2cc7196 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Fri, 27 Feb 2026 21:52:37 +0100 Subject: [PATCH 34/68] debug --- src/client/gkfs_functions.cpp | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index fa6346071..f2701197b 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -146,14 +146,9 @@ constexpr int k_flush_ok = 0; constexpr int k_flush_unavailable = 1; int -self_mem_fd() { - static int fd = -1; - if(fd >= 0) { - return fd; - } - fd = static_cast(::syscall_no_intercept( +open_self_mem_fd() { + return static_cast(::syscall_no_intercept( SYS_openat, AT_FDCWD, "/proc/self/mem", O_RDONLY | O_CLOEXEC, 0)); - return fd; } int @@ -162,7 +157,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); - const int mem_fd = self_mem_fd(); + const int mem_fd = open_self_mem_fd(); if(mem_fd < 0) { errno = EIO; return -1; @@ -184,24 +179,28 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, SYS_pread64, mem_fd, bounce.data() + copied, to_copy, src_off)); if(nread <= 0) { - if(errno == EFAULT || errno == ENOMEM || errno == EIO) { + const auto saved_errno = errno; + if(saved_errno == EFAULT || saved_errno == ENOMEM) { if(copied == 0) { LOG(WARNING, "{}() source unavailable before copy path '{}' off {} len {}", __func__, path, src_off, to_copy); + (void) ::syscall_no_intercept(SYS_close, mem_fd); return k_flush_unavailable; } break; } LOG(WARNING, "{}() self-mem read failed path '{}' off {} len {} errno {}", - __func__, path, src_off, to_copy, errno); + __func__, path, src_off, to_copy, saved_errno); + (void) ::syscall_no_intercept(SYS_close, mem_fd); return -1; } copied += static_cast(nread); } if(copied == 0) { + (void) ::syscall_no_intercept(SYS_close, mem_fd); return k_flush_unavailable; } @@ -212,6 +211,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, LOG(WARNING, "{}() forward_write failed path '{}' off {} len {} err {}", __func__, path, chunk_off, copied, werr); + (void) ::syscall_no_intercept(SYS_close, mem_fd); return -1; } if(wsize <= 0) { @@ -219,6 +219,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, LOG(WARNING, "{}() forward_write invalid size {} path '{}' off {} len {}", __func__, wsize, path, chunk_off, copied); + (void) ::syscall_no_intercept(SYS_close, mem_fd); return -1; } @@ -230,6 +231,7 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, } } + (void) ::syscall_no_intercept(SYS_close, mem_fd); return (total == len) ? k_flush_ok : static_cast(total); } @@ -321,10 +323,16 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } const int flush_rc = flush_range_chunked(path, addr, len, off); + LOG(DEBUG, + "{}() flush result path '{}' addr {} len {} off {} rc {}", + __func__, path, addr, len, off, flush_rc); if(flush_rc == k_flush_unavailable) { // Mapping raced with unmap; treat as best-effort flush miss and // continue without surfacing EIO. - ++it; + LOG(DEBUG, + "{}() dropping unavailable overlap path '{}' addr {} len {} off {}", + __func__, path, addr, len, off); + it = mmap_set.erase(it); continue; } if(flush_rc < 0) { @@ -355,6 +363,9 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, entry.offset); + LOG(DEBUG, + "{}() flush-all result path '{}' addr {} len {} off {} rc {}", + __func__, path, entry.addr, entry.length, entry.offset, flush_rc); if(flush_rc == k_flush_unavailable) { // Mapping is no longer readable, likely already unmapped. continue; -- GitLab From 8e6c8ec1c3734759eeb0f3e2fef5b9a91185d359 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 08:57:41 +0100 Subject: [PATCH 35/68] extra debug --- src/client/gkfs_data.cpp | 18 ++++++++- src/client/gkfs_metadata.cpp | 71 ++++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 865fe6f82..dee185137 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -74,6 +74,13 @@ using namespace std; namespace gkfs::syscall { +namespace { +bool +is_offload_temp_path(const std::string& path) { + return path.rfind("/offload/t_", 0) == 0; +} +} // namespace + /** * Actual write function for all gkfs write operations * errno may be set @@ -331,14 +338,21 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, ssize_t gkfs_write_ws(gkfs::filemap::OpenFile& file, const char* buf, size_t count, off64_t offset, bool update_pos) { + const auto pos_before = file.pos(); #ifdef GKFS_ENABLE_CLIENT_METRICS auto start_t = std::chrono::high_resolution_clock::now(); auto written = gkfs_do_write(file, buf, count, offset, update_pos); CTX->write_metrics()->add_event(written, start_t); - return written; #else - return gkfs_do_write(file, buf, count, offset, update_pos); + auto written = gkfs_do_write(file, buf, count, offset, update_pos); #endif + if(is_offload_temp_path(file.path())) { + LOG(DEBUG, + "{}() offload path '{}' count {} offset {} update_pos {} written {} pos_before {} pos_after {}", + __func__, file.path(), count, offset, update_pos, written, + pos_before, file.pos()); + } + return written; } /** diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 2a2cb15f3..1c5731dcd 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -122,6 +122,11 @@ check_parent_dir(const std::string& path) { return 0; } +bool +is_offload_temp_path(const std::string& path) { + return path.rfind("/offload/t_", 0) == 0; +} + } // namespace namespace gkfs::syscall { @@ -224,6 +229,11 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } + if(is_offload_temp_path(path)) { + LOG(DEBUG, + "{}() offload open path '{}' fd {} flags {} create_pending 1", + __func__, path, fd, flags); + } return fd; } @@ -264,6 +274,10 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } + if(is_offload_temp_path(path)) { + LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", + __func__, path, fd, flags); + } // file was successfully created. Add to filemap return fd; } @@ -330,9 +344,14 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { md.inline_data(""); } // RENAMED OR SYMLINK NOT PROTECTED - return CTX->file_map()->add( + auto fd = CTX->file_map()->add( std::make_shared(new_path, flags)); + if(is_offload_temp_path(new_path)) { + LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", + __func__, new_path, fd, flags); + } + return fd; } } } @@ -367,6 +386,10 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } + if(is_offload_temp_path(path)) { + LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", __func__, + path, fd, flags); + } return fd; } @@ -1770,42 +1793,76 @@ int gkfs_close(unsigned int fd) { auto file = CTX->file_map()->get(fd); if(file) { + const auto path = file->path(); + const auto is_offload = is_offload_temp_path(path); + if(is_offload) { + LOG(DEBUG, + "{}() offload close-begin path '{}' fd {} pos {} type {} creation_pending {}", + __func__, path, fd, file->pos(), static_cast(file->type()), + file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending)); + } if(file->type() == gkfs::filemap::FileType::regular) { - auto flush_err = gkfs_mmap_flush_all_for_path(file->path()); + auto flush_err = gkfs_mmap_flush_all_for_path(path); if(flush_err < 0) { LOG(WARNING, "{}() mmap flush failed for path '{}'", __func__, - file->path()); + path); + } + if(is_offload) { + LOG(DEBUG, "{}() offload close mmap_flush path '{}' rc {}", + __func__, path, flush_err); } } if(file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending)) { - gkfs_create(file->path(), file->mode()); + gkfs_create(path, file->mode()); file->set_flag(gkfs::filemap::OpenFile_flags::creation_pending, false); + if(is_offload) { + LOG(DEBUG, + "{}() offload close created pending path '{}' mode {}", + __func__, path, file->mode()); + } } // flush write size cache to be server consistent if(CTX->use_write_size_cache() && CTX->write_size_cache()) { - auto err = CTX->write_size_cache()->flush(file->path(), true).first; + auto err = CTX->write_size_cache()->flush(path, true).first; if(err) { LOG(ERROR, "{}() write_size_cache() failed with err '{}'", __func__, err); errno = err; return -1; } + if(is_offload) { + auto [updates, cached_size] = CTX->write_size_cache()->get(path); + LOG(DEBUG, + "{}() offload close cache path '{}' updates {} cached_size {}", + __func__, path, updates, cached_size); + } } if(CTX->use_dentry_cache() && gkfs::config::cache::clear_dentry_cache_on_close) { // clear cache for directory if(file->type() == gkfs::filemap::FileType::directory) { - CTX->dentry_cache()->clear_dir(file->path()); + CTX->dentry_cache()->clear_dir(path); } } if(CTX->protect_files_generator()) { - auto path = file->path(); generate_lock_file(path, false); } + + if(is_offload) { + std::pair size_ret{}; + if(gkfs::config::proxy::fwd_get_size && CTX->use_proxy()) { + size_ret = gkfs::rpc::forward_get_metadentry_size_proxy(path); + } else { + size_ret = gkfs::rpc::forward_get_metadentry_size(path, 0); + } + LOG(DEBUG, + "{}() offload close-end path '{}' fd {} size_err {} server_size {}", + __func__, path, fd, size_ret.first, size_ret.second); + } // No call to the daemon is required CTX->file_map()->remove(fd); return 0; -- GitLab From 3745e9e66d6478e478e223f7df936e54f87a12a8 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 10:05:10 +0100 Subject: [PATCH 36/68] fix --- include/client/gkfs_functions.hpp | 3 +++ src/client/gkfs_functions.cpp | 8 ++++++++ src/client/gkfs_metadata.cpp | 31 +++++++++++++++++++++---------- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index bb70a8769..d28164f10 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -152,6 +152,9 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, int gkfs_mmap_flush_all_for_path(const std::string& path); +bool +gkfs_mmap_has_active_path(const std::string& path); + ssize_t gkfs_read(int fd, void* buf, size_t count); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index f2701197b..4073aadbb 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -382,6 +382,14 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { return 0; } +bool +gkfs_mmap_has_active_path(const std::string& path) { + std::lock_guard lock(get_mmap_set_mutex()); + const auto& mmap_set = get_mmap_set(); + return std::any_of(mmap_set.begin(), mmap_set.end(), + [&path](const auto& entry) { return entry.path == path; }); +} + void* gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 1c5731dcd..cf91932c2 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -336,12 +336,18 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(gkfs_truncate(new_path, md.size(), 0)) { - LOG(ERROR, "Error truncating file"); - return -1; + if(gkfs_mmap_has_active_path(new_path)) { + LOG(DEBUG, + "{}() deferring O_TRUNC for active mmap path '{}'", + __func__, new_path); + } else { + if(gkfs_truncate(new_path, md.size(), 0)) { + LOG(ERROR, "Error truncating file"); + return -1; + } + md.size(0); + md.inline_data(""); } - md.size(0); - md.inline_data(""); } // RENAMED OR SYMLINK NOT PROTECTED auto fd = CTX->file_map()->add( @@ -363,12 +369,17 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { assert(S_ISREG(md.mode())); if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(gkfs_truncate(path, md.size(), 0)) { - LOG(ERROR, "Error truncating file"); - return -1; + if(gkfs_mmap_has_active_path(path)) { + LOG(DEBUG, "{}() deferring O_TRUNC for active mmap path '{}'", + __func__, path); + } else { + if(gkfs_truncate(path, md.size(), 0)) { + LOG(ERROR, "Error truncating file"); + return -1; + } + md.size(0); + md.inline_data(""); } - md.size(0); - md.inline_data(""); } auto file = std::make_shared(path, flags); if(gkfs::config::metadata::read_inline_prefetch and -- GitLab From 4656c7d4e1355c1cd02aaeca406704c7162673a4 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 10:45:23 +0100 Subject: [PATCH 37/68] mitigation --- src/client/gkfs_data.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index dee185137..0dea38a8e 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -642,6 +642,23 @@ gkfs_read(int fd, void* buf, size_t count) { gkfs_fd->path(), count, gkfs_fd->pos()); auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, pos); + if(ret == 0 && pos == 0 && count >= 8 && is_offload_temp_path(gkfs_fd->path())) { + constexpr int k_offload_read_retry_count = 8; + constexpr auto k_offload_read_retry_sleep = std::chrono::milliseconds(2); + for(int attempt = 1; attempt <= k_offload_read_retry_count && ret == 0; + ++attempt) { + std::this_thread::sleep_for(k_offload_read_retry_sleep); + (void) gkfs_mmap_flush_for_path(gkfs_fd->path(), pos, count); + ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, + pos); + if(ret > 0) { + LOG(DEBUG, + "{}() offload read recovered path '{}' after retry {} bytes {}", + __func__, gkfs_fd->path(), attempt, ret); + break; + } + } + } // Update offset in file descriptor in the file map if(ret > 0) { gkfs_fd->pos(pos + ret); -- GitLab From 9b07438b4eb819433cf16972805ff1a1da0d70c3 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 10:57:17 +0100 Subject: [PATCH 38/68] offload fix --- src/client/gkfs_data.cpp | 52 ++++++++++++++++++++++++----- src/client/gkfs_functions.cpp | 14 ++++++++ src/client/gkfs_libc.cpp | 62 ++++++++++++++++++++++++++++------- 3 files changed, 109 insertions(+), 19 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 0dea38a8e..51622d02b 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -104,6 +104,7 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } int err; auto path = make_unique(file.path()); + const bool offload_temp = is_offload_temp_path(*path); auto is_append = file.get_flag(gkfs::filemap::OpenFile_flags::append); ssize_t write_size = 0; auto num_replicas = CTX->get_replicas(); @@ -152,6 +153,12 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, (offset + count) <= gkfs::config::metadata::inline_data_size)) { bool allow_inline = true; + if(offload_temp) { + // Offload temp files frequently do sparse writes right after small + // headers. Keep them in chunk storage to avoid inline/chunk + // visibility races under concurrent readers. + allow_inline = false; + } // Check if the file is actually larger than the inline header limits us // to. This can happen if we have a write size cache enabled and the // file is large but the metadata is not updated yet on the server. If @@ -222,8 +229,8 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } if(is_append) { - auto ret_offset = gkfs::utils::update_file_size(*path, count, offset, - is_append, migrated); + auto ret_offset = gkfs::utils::update_file_size( + *path, count, offset, is_append, migrated || offload_temp); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", err); @@ -264,7 +271,8 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, // extra flags. For now, if migrated is true, we force an update. if(migrated) { auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, migrated); + *path, count, offset, is_append, + migrated || offload_temp); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", @@ -275,8 +283,8 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } } else if(!is_append) { - auto ret_offset = gkfs::utils::update_file_size(*path, count, offset, - is_append, migrated); + auto ret_offset = gkfs::utils::update_file_size( + *path, count, offset, is_append, migrated || offload_temp); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", err); @@ -642,16 +650,44 @@ gkfs_read(int fd, void* buf, size_t count) { gkfs_fd->path(), count, gkfs_fd->pos()); auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, pos); - if(ret == 0 && pos == 0 && count >= 8 && is_offload_temp_path(gkfs_fd->path())) { + const bool offload_magic_guard = + (pos == 0 && count >= 8 && is_offload_temp_path(gkfs_fd->path())); + auto should_retry_offload = [&](ssize_t read_ret) { + if(!offload_magic_guard) { + return false; + } + if(read_ret == 0) { + return true; + } + if(read_ret < 8) { + return false; + } + const auto* cbuf = reinterpret_cast(buf); + bool all_zero_prefix = true; + for(size_t i = 0; i < 8; ++i) { + if(cbuf[i] != 0) { + all_zero_prefix = false; + break; + } + } + if(all_zero_prefix) { + LOG(DEBUG, + "{}() offload read suspicious zero-prefix path '{}' bytes {}", + __func__, gkfs_fd->path(), read_ret); + } + return all_zero_prefix; + }; + if(should_retry_offload(ret)) { constexpr int k_offload_read_retry_count = 8; constexpr auto k_offload_read_retry_sleep = std::chrono::milliseconds(2); - for(int attempt = 1; attempt <= k_offload_read_retry_count && ret == 0; + for(int attempt = 1; + attempt <= k_offload_read_retry_count && should_retry_offload(ret); ++attempt) { std::this_thread::sleep_for(k_offload_read_retry_sleep); (void) gkfs_mmap_flush_for_path(gkfs_fd->path(), pos, count); ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, pos); - if(ret > 0) { + if(!should_retry_offload(ret)) { LOG(DEBUG, "{}() offload read recovered path '{}' after retry {} bytes {}", __func__, gkfs_fd->path(), attempt, ret); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 4073aadbb..4f6ae8260 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -130,6 +130,11 @@ overlaps(addr_type a_begin, addr_type a_end, addr_type b_begin, return a_begin < b_end && b_begin < a_end; } +bool +is_offload_temp_path(const std::string& path) { + return path.rfind("/offload/t_", 0) == 0; +} + size_t tracked_segments_for_path(const std::string& path) { size_t segments = 0; @@ -267,6 +272,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); size_t flushed_segments = 0; + bool had_overlap = false; const long page_size = ::sysconf(_SC_PAGESIZE); LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", @@ -286,6 +292,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, ++it; continue; } + had_overlap = true; const auto flush_begin = std::max(entry_begin, read_begin); const auto flush_end = std::min(entry_end, read_end); @@ -347,6 +354,13 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, flushed_segments); + if(is_offload_temp_path(path) && had_overlap && flushed_segments == 0) { + LOG(DEBUG, + "{}() path '{}' overlap seen but no segments flushed; retry requested", + __func__, path); + errno = EAGAIN; + return -1; + } return 0; } diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 17da55b4a..076c779f8 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -60,6 +60,8 @@ #include // For std::remove_if in get_open_fds (commented out) #include // For strcpy, strchr, strlen #include // For malloc, free, qsort +#include +#include // Linux Specific (consider guarding if portability is a high concern) #include // For _STAT_VER (might be glibc specific, ensure availability) @@ -878,10 +880,23 @@ read(int fd, void* buf, size_t nbyte) { DEBUG_INFO("read(fd={}, nbyte={})", fd, nbyte); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd && gkfs::syscall::gkfs_mmap_flush_for_path( - gkfs_fd->path(), - static_cast(gkfs_fd->pos()), nbyte) < 0) - return -1; + if(gkfs_fd) { + constexpr int k_flush_retry_count = 8; + constexpr auto k_flush_retry_sleep = std::chrono::milliseconds(2); + int flush_rc = -1; + for(int attempt = 0; attempt <= k_flush_retry_count; ++attempt) { + flush_rc = gkfs::syscall::gkfs_mmap_flush_for_path( + gkfs_fd->path(), static_cast(gkfs_fd->pos()), + nbyte); + if(flush_rc == 0 || errno != EAGAIN) { + break; + } + std::this_thread::sleep_for(k_flush_retry_sleep); + } + if(flush_rc < 0) { + return -1; + } + } } GKFS_OPERATION(read, fd, buf, nbyte); GKFS_FALLBACK(read, fd, buf, nbyte); @@ -900,10 +915,22 @@ pread(int fd, void* buf, size_t count, off_t offset) { DEBUG_INFO("pread(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd && - gkfs::syscall::gkfs_mmap_flush_for_path( - gkfs_fd->path(), static_cast(offset), count) < 0) - return -1; + if(gkfs_fd) { + constexpr int k_flush_retry_count = 8; + constexpr auto k_flush_retry_sleep = std::chrono::milliseconds(2); + int flush_rc = -1; + for(int attempt = 0; attempt <= k_flush_retry_count; ++attempt) { + flush_rc = gkfs::syscall::gkfs_mmap_flush_for_path( + gkfs_fd->path(), static_cast(offset), count); + if(flush_rc == 0 || errno != EAGAIN) { + break; + } + std::this_thread::sleep_for(k_flush_retry_sleep); + } + if(flush_rc < 0) { + return -1; + } + } } GKFS_OPERATION(pread, fd, buf, count, offset); GKFS_FALLBACK(pread, fd, buf, count, offset); @@ -922,9 +949,22 @@ pread64(int fd, void* buf, size_t count, off64_t offset) { DEBUG_INFO("pread64(fd={}, count={}, offset={})", fd, count, offset); if(CTX->interception_enabled() && is_gkfs_fd(fd)) { auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd && gkfs::syscall::gkfs_mmap_flush_for_path( - gkfs_fd->path(), offset, count) < 0) - return -1; + if(gkfs_fd) { + constexpr int k_flush_retry_count = 8; + constexpr auto k_flush_retry_sleep = std::chrono::milliseconds(2); + int flush_rc = -1; + for(int attempt = 0; attempt <= k_flush_retry_count; ++attempt) { + flush_rc = gkfs::syscall::gkfs_mmap_flush_for_path( + gkfs_fd->path(), offset, count); + if(flush_rc == 0 || errno != EAGAIN) { + break; + } + std::this_thread::sleep_for(k_flush_retry_sleep); + } + if(flush_rc < 0) { + return -1; + } + } } GKFS_OPERATION(pread, fd, buf, count, offset); // GekkoFS pread likely handles large offsets -- GitLab From 1995f212af2289e12508844c4f5bd8557c1c5e7b Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 11:48:50 +0100 Subject: [PATCH 39/68] markers --- src/client/gkfs_functions.cpp | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 4f6ae8260..a069d4912 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -149,6 +149,7 @@ tracked_segments_for_path(const std::string& path) { constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; constexpr int k_flush_ok = 0; constexpr int k_flush_unavailable = 1; +constexpr int k_flush_retry = 2; int open_self_mem_fd() { @@ -209,6 +210,23 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, return k_flush_unavailable; } + if(is_offload_temp_path(path) && chunk_off == 0 && copied >= 8) { + bool all_zero_magic = true; + for(size_t i = 0; i < 8; ++i) { + if(static_cast(bounce[i]) != 0) { + all_zero_magic = false; + break; + } + } + if(all_zero_magic) { + LOG(DEBUG, + "{}() defer flush for path '{}' due zero magic prefix at off 0", + __func__, path); + (void) ::syscall_no_intercept(SYS_close, mem_fd); + return k_flush_retry; + } + } + auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), chunk_off, copied, 0); if(werr) { @@ -339,7 +357,18 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, LOG(DEBUG, "{}() dropping unavailable overlap path '{}' addr {} len {} off {}", __func__, path, addr, len, off); - it = mmap_set.erase(it); + if(is_offload_temp_path(path)) { + ++it; + } else { + it = mmap_set.erase(it); + } + continue; + } + if(flush_rc == k_flush_retry) { + LOG(DEBUG, + "{}() deferred overlap path '{}' addr {} len {} off {}", + __func__, path, addr, len, off); + ++it; continue; } if(flush_rc < 0) { -- GitLab From 2eb7f8dbfba72308a4200b9ef54f4f37c86bfcf5 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 12:33:30 +0100 Subject: [PATCH 40/68] fix egain --- src/client/gkfs_libc.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 076c779f8..13d2c42de 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -894,7 +894,13 @@ read(int fd, void* buf, size_t nbyte) { std::this_thread::sleep_for(k_flush_retry_sleep); } if(flush_rc < 0) { - return -1; + if(errno == EAGAIN) { + // Avoid surfacing transient mmap flush races to userspace + // reads (Python may translate this to None reads). + errno = 0; + } else { + return -1; + } } } } @@ -928,7 +934,11 @@ pread(int fd, void* buf, size_t count, off_t offset) { std::this_thread::sleep_for(k_flush_retry_sleep); } if(flush_rc < 0) { - return -1; + if(errno == EAGAIN) { + errno = 0; + } else { + return -1; + } } } } @@ -962,7 +972,11 @@ pread64(int fd, void* buf, size_t count, off64_t offset) { std::this_thread::sleep_for(k_flush_retry_sleep); } if(flush_rc < 0) { - return -1; + if(errno == EAGAIN) { + errno = 0; + } else { + return -1; + } } } } -- GitLab From 32bf6c0515fb74033615243599a93a7b33063858 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 12:37:38 +0100 Subject: [PATCH 41/68] retry --- src/client/gkfs_data.cpp | 2 +- src/client/gkfs_functions.cpp | 21 +++------------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 51622d02b..85f4283d1 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -678,7 +678,7 @@ gkfs_read(int fd, void* buf, size_t count) { return all_zero_prefix; }; if(should_retry_offload(ret)) { - constexpr int k_offload_read_retry_count = 8; + constexpr int k_offload_read_retry_count = 600; constexpr auto k_offload_read_retry_sleep = std::chrono::milliseconds(2); for(int attempt = 1; attempt <= k_offload_read_retry_count && should_retry_offload(ret); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index a069d4912..c32b85a3e 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -149,7 +149,6 @@ tracked_segments_for_path(const std::string& path) { constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; constexpr int k_flush_ok = 0; constexpr int k_flush_unavailable = 1; -constexpr int k_flush_retry = 2; int open_self_mem_fd() { @@ -223,7 +222,9 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, "{}() defer flush for path '{}' due zero magic prefix at off 0", __func__, path); (void) ::syscall_no_intercept(SYS_close, mem_fd); - return k_flush_retry; + // Treat as best-effort no-op: avoid propagating EAGAIN to + // userspace while writer is still preparing header bytes. + return k_flush_ok; } } @@ -290,7 +291,6 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); size_t flushed_segments = 0; - bool had_overlap = false; const long page_size = ::sysconf(_SC_PAGESIZE); LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", @@ -310,7 +310,6 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, ++it; continue; } - had_overlap = true; const auto flush_begin = std::max(entry_begin, read_begin); const auto flush_end = std::min(entry_end, read_end); @@ -364,13 +363,6 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } continue; } - if(flush_rc == k_flush_retry) { - LOG(DEBUG, - "{}() deferred overlap path '{}' addr {} len {} off {}", - __func__, path, addr, len, off); - ++it; - continue; - } if(flush_rc < 0) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", @@ -383,13 +375,6 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, flushed_segments); - if(is_offload_temp_path(path) && had_overlap && flushed_segments == 0) { - LOG(DEBUG, - "{}() path '{}' overlap seen but no segments flushed; retry requested", - __func__, path); - errno = EAGAIN; - return -1; - } return 0; } -- GitLab From 97c652833ee136b36f6319f557d5f7a3d311d1f4 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 12:45:51 +0100 Subject: [PATCH 42/68] fix --- src/client/gkfs_data.cpp | 137 +++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 62 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 85f4283d1..3ddff5517 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -559,34 +559,92 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, __func__); } - pair ret; - if(gkfs::config::proxy::fwd_io && CTX->use_proxy() && - count > gkfs::config::proxy::fwd_io_count_threshold) { - ret = gkfs::rpc::forward_read_proxy(file.path(), buf, offset, count); - } else { - std::set failed; // set with failed targets. - if(CTX->get_replicas() != 0) { + auto do_chunk_read = [&]() -> std::pair { + pair ret; + if(gkfs::config::proxy::fwd_io && CTX->use_proxy() && + count > gkfs::config::proxy::fwd_io_count_threshold) { + ret = + gkfs::rpc::forward_read_proxy(file.path(), buf, offset, count); + } else { + std::set failed; // set with failed targets. + if(CTX->get_replicas() != 0) { - ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, - CTX->get_replicas(), failed); - while(ret.first == EIO) { ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, CTX->get_replicas(), failed); - LOG(WARNING, "gkfs::rpc::forward_read() failed with ret '{}'", - ret.first); - } + while(ret.first == EIO) { + ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, + CTX->get_replicas(), failed); + LOG(WARNING, "gkfs::rpc::forward_read() failed with ret '{}'", + ret.first); + } - } else { - ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, 0, - failed); + } else { + ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, 0, + failed); + } } - } + return ret; + }; + + auto ret = do_chunk_read(); auto err = ret.first; if(err) { LOG(WARNING, "gkfs::rpc::forward_read() failed with ret '{}'", err); errno = err; return -1; } + const bool offload_magic_guard = + (is_offload_temp_path(file.path()) && offset == 0 && count >= 8); + auto needs_offload_retry = [&](long read_ret) { + if(!offload_magic_guard) { + return false; + } + if(read_ret == 0) { + return true; + } + if(read_ret < 8) { + return false; + } + const auto* cbuf = reinterpret_cast(buf); + for(size_t i = 0; i < 8; ++i) { + if(cbuf[i] != 0) { + return false; + } + } + LOG(DEBUG, + "{}() offload read suspicious zero-prefix path '{}' bytes {}", + __func__, file.path(), read_ret); + return true; + }; + if(needs_offload_retry(ret.second)) { + constexpr int k_offload_eof_retry_count = 2500; + constexpr auto k_offload_eof_retry_sleep = std::chrono::milliseconds(2); + for(int attempt = 1; + attempt <= k_offload_eof_retry_count && needs_offload_retry(ret.second); + ++attempt) { + // Trigger pending mmap-backed writes, then wait for visible size. + (void) gkfs_mmap_flush_for_path(file.path(), offset, count); + std::this_thread::sleep_for(k_offload_eof_retry_sleep); + auto [size_err, remote_size] = + gkfs::rpc::forward_get_metadentry_size(file.path(), 0); + if(size_err == 0 && remote_size > 0) { + LOG(DEBUG, + "{}() offload EOF guard path '{}' attempt {} remote_size {}", + __func__, file.path(), attempt, remote_size); + } + ret = do_chunk_read(); + if(ret.first) { + errno = ret.first; + return -1; + } + if(!needs_offload_retry(ret.second)) { + LOG(DEBUG, + "{}() offload EOF guard recovered path '{}' after retry {} bytes {}", + __func__, file.path(), attempt, ret.second); + break; + } + } + } // XXX check that we don't try to read past end of the file return ret.second; // return read size } @@ -650,51 +708,6 @@ gkfs_read(int fd, void* buf, size_t count) { gkfs_fd->path(), count, gkfs_fd->pos()); auto pos = gkfs_fd->pos(); // retrieve the current offset auto ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, pos); - const bool offload_magic_guard = - (pos == 0 && count >= 8 && is_offload_temp_path(gkfs_fd->path())); - auto should_retry_offload = [&](ssize_t read_ret) { - if(!offload_magic_guard) { - return false; - } - if(read_ret == 0) { - return true; - } - if(read_ret < 8) { - return false; - } - const auto* cbuf = reinterpret_cast(buf); - bool all_zero_prefix = true; - for(size_t i = 0; i < 8; ++i) { - if(cbuf[i] != 0) { - all_zero_prefix = false; - break; - } - } - if(all_zero_prefix) { - LOG(DEBUG, - "{}() offload read suspicious zero-prefix path '{}' bytes {}", - __func__, gkfs_fd->path(), read_ret); - } - return all_zero_prefix; - }; - if(should_retry_offload(ret)) { - constexpr int k_offload_read_retry_count = 600; - constexpr auto k_offload_read_retry_sleep = std::chrono::milliseconds(2); - for(int attempt = 1; - attempt <= k_offload_read_retry_count && should_retry_offload(ret); - ++attempt) { - std::this_thread::sleep_for(k_offload_read_retry_sleep); - (void) gkfs_mmap_flush_for_path(gkfs_fd->path(), pos, count); - ret = gkfs_read_ws(*gkfs_fd, reinterpret_cast(buf), count, - pos); - if(!should_retry_offload(ret)) { - LOG(DEBUG, - "{}() offload read recovered path '{}' after retry {} bytes {}", - __func__, gkfs_fd->path(), attempt, ret); - break; - } - } - } // Update offset in file descriptor in the file map if(ret > 0) { gkfs_fd->pos(pos + ret); -- GitLab From c32247410b885e1b7f7ec9d9d99c2e227ea301b4 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 12:51:51 +0100 Subject: [PATCH 43/68] fix --- src/client/gkfs_functions.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index c32b85a3e..84ad135ca 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -158,7 +158,7 @@ open_self_mem_fd() { int flush_range_chunked(const std::string& path, const void* addr, size_t len, - off_t off) { + off_t off, bool allow_zero_magic_defer = false) { const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); @@ -209,7 +209,8 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, return k_flush_unavailable; } - if(is_offload_temp_path(path) && chunk_off == 0 && copied >= 8) { + if(allow_zero_magic_defer && is_offload_temp_path(path) && + chunk_off == 0 && copied >= 8) { bool all_zero_magic = true; for(size_t i = 0; i < 8; ++i) { if(static_cast(bounce[i]) != 0) { @@ -346,7 +347,8 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, continue; } - const int flush_rc = flush_range_chunked(path, addr, len, off); + const int flush_rc = + flush_range_chunked(path, addr, len, off, true); LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", __func__, path, addr, len, off, flush_rc); @@ -390,7 +392,7 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { } const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, - entry.offset); + entry.offset, false); LOG(DEBUG, "{}() flush-all result path '{}' addr {} len {} off {} rc {}", __func__, path, entry.addr, entry.length, entry.offset, flush_rc); @@ -509,7 +511,7 @@ gkfs_msync(void* addr, size_t length, int flags) { for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, - range.offset) < 0) { + range.offset, false) < 0) { return -1; } } @@ -598,7 +600,7 @@ gkfs_munmap(void* addr, size_t length) { for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, - range.offset) < 0) { + range.offset, false) < 0) { return -1; } } -- GitLab From e3731a16066df80449c4dcaec21b9a55346a82f0 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 17:52:39 +0100 Subject: [PATCH 44/68] fix defered --- src/client/gkfs_metadata.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index cf91932c2..929065b60 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -127,6 +127,18 @@ is_offload_temp_path(const std::string& path) { return path.rfind("/offload/t_", 0) == 0; } +bool +should_defer_offload_trunc(const std::string& path, int flags) { + if(!is_offload_temp_path(path)) { + return false; + } + // Offload temp files are often reopened with O_CREAT|O_TRUNC and then + // closed without writes by user code. Truncating eagerly here can expose + // empty files to concurrent readers. + return (flags & O_CREAT) && (flags & O_TRUNC) && + ((flags & O_WRONLY) || (flags & O_RDWR)); +} + } // namespace namespace gkfs::syscall { @@ -336,7 +348,11 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(gkfs_mmap_has_active_path(new_path)) { + if(should_defer_offload_trunc(new_path, flags)) { + LOG(DEBUG, + "{}() deferring O_TRUNC for offload temp path '{}' flags {}", + __func__, new_path, flags); + } else if(gkfs_mmap_has_active_path(new_path)) { LOG(DEBUG, "{}() deferring O_TRUNC for active mmap path '{}'", __func__, new_path); @@ -369,7 +385,11 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { assert(S_ISREG(md.mode())); if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(gkfs_mmap_has_active_path(path)) { + if(should_defer_offload_trunc(path, flags)) { + LOG(DEBUG, + "{}() deferring O_TRUNC for offload temp path '{}' flags {}", + __func__, path, flags); + } else if(gkfs_mmap_has_active_path(path)) { LOG(DEBUG, "{}() deferring O_TRUNC for active mmap path '{}'", __func__, path); } else { -- GitLab From 287c0c8cadc76a2da2497788b0151a80004e829d Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 18:43:29 +0100 Subject: [PATCH 45/68] fix --- src/client/gkfs_data.cpp | 5 +---- src/client/gkfs_functions.cpp | 33 ++++++--------------------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 3ddff5517..6d14f26dc 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -611,9 +611,6 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, return false; } } - LOG(DEBUG, - "{}() offload read suspicious zero-prefix path '{}' bytes {}", - __func__, file.path(), read_ret); return true; }; if(needs_offload_retry(ret.second)) { @@ -627,7 +624,7 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, std::this_thread::sleep_for(k_offload_eof_retry_sleep); auto [size_err, remote_size] = gkfs::rpc::forward_get_metadentry_size(file.path(), 0); - if(size_err == 0 && remote_size > 0) { + if(size_err == 0 && remote_size > 0 && (attempt % 50) == 0) { LOG(DEBUG, "{}() offload EOF guard path '{}' attempt {} remote_size {}", __func__, file.path(), attempt, remote_size); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 84ad135ca..3cce27694 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -158,7 +158,7 @@ open_self_mem_fd() { int flush_range_chunked(const std::string& path, const void* addr, size_t len, - off_t off, bool allow_zero_magic_defer = false) { + off_t off) { const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); @@ -209,26 +209,6 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, return k_flush_unavailable; } - if(allow_zero_magic_defer && is_offload_temp_path(path) && - chunk_off == 0 && copied >= 8) { - bool all_zero_magic = true; - for(size_t i = 0; i < 8; ++i) { - if(static_cast(bounce[i]) != 0) { - all_zero_magic = false; - break; - } - } - if(all_zero_magic) { - LOG(DEBUG, - "{}() defer flush for path '{}' due zero magic prefix at off 0", - __func__, path); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - // Treat as best-effort no-op: avoid propagating EAGAIN to - // userspace while writer is still preparing header bytes. - return k_flush_ok; - } - } - auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), chunk_off, copied, 0); if(werr) { @@ -347,8 +327,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, continue; } - const int flush_rc = - flush_range_chunked(path, addr, len, off, true); + const int flush_rc = flush_range_chunked(path, addr, len, off); LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", __func__, path, addr, len, off, flush_rc); @@ -391,8 +370,8 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { continue; } - const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, - entry.offset, false); + const int flush_rc = + flush_range_chunked(path, entry.addr, entry.length, entry.offset); LOG(DEBUG, "{}() flush-all result path '{}' addr {} len {} off {} rc {}", __func__, path, entry.addr, entry.length, entry.offset, flush_rc); @@ -511,7 +490,7 @@ gkfs_msync(void* addr, size_t length, int flags) { for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, - range.offset, false) < 0) { + range.offset) < 0) { return -1; } } @@ -600,7 +579,7 @@ gkfs_munmap(void* addr, size_t length) { for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, - range.offset, false) < 0) { + range.offset) < 0) { return -1; } } -- GitLab From ed843b6af625b20b1e46109008ed99dda2cc5a11 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 18:54:30 +0100 Subject: [PATCH 46/68] fix --- src/client/gkfs_functions.cpp | 30 +++++++++++++++++++++++------- src/client/gkfs_metadata.cpp | 28 ++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 3cce27694..2f9f8db22 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -87,6 +87,7 @@ struct mmap_entry { size_t length; off_t offset; int prot; + int flags; // We use addr as the unique key for the set bool @@ -130,6 +131,22 @@ overlaps(addr_type a_begin, addr_type a_end, addr_type b_begin, return a_begin < b_end && b_begin < a_end; } +bool +is_shared_map_type(int flags) { + const int map_type = flags & MAP_TYPE; +#ifdef MAP_SHARED_VALIDATE + return map_type == MAP_SHARED || map_type == MAP_SHARED_VALIDATE; +#else + return map_type == MAP_SHARED; +#endif +} + +bool +should_flush_mapping(const mmap_entry& entry) { + return (entry.prot & PROT_WRITE) && is_shared_map_type(entry.flags) && + entry.length != 0; +} + bool is_offload_temp_path(const std::string& path) { return path.rfind("/offload/t_", 0) == 0; @@ -279,8 +296,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, for(auto it = mmap_set.begin(); it != mmap_set.end();) { const auto& entry = *it; - if(entry.path != path || !(entry.prot & PROT_WRITE) || - entry.length == 0) { + if(entry.path != path || !should_flush_mapping(entry)) { ++it; continue; } @@ -365,8 +381,7 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { size_t flushed_segments = 0; for(const auto& entry : get_mmap_set()) { - if(entry.path != path || !(entry.prot & PROT_WRITE) || - entry.length == 0) { + if(entry.path != path || !should_flush_mapping(entry)) { continue; } @@ -426,7 +441,8 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Register mapping under lock so concurrent threads don't race on mmap_set { std::lock_guard lock(get_mmap_set_mutex()); - get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot}); + get_mmap_set().insert( + mmap_entry{ptr, fd, path, length, offset, prot, flags}); } gkfs::syscall::gkfs_pread(fd, ptr, length, offset); @@ -464,7 +480,7 @@ gkfs_msync(void* addr, size_t length, int flags) { std::vector flush_ranges; std::unique_lock lock(get_mmap_set_mutex()); for(const auto& entry : get_mmap_set()) { - if(!(entry.prot & PROT_WRITE) || entry.length == 0) { + if(!should_flush_mapping(entry)) { continue; } const auto entry_begin = ptr_to_addr(entry.addr); @@ -544,7 +560,7 @@ gkfs_munmap(void* addr, size_t length) { static_cast(cut_begin), static_cast(cut_end)); - if((entry.prot & PROT_WRITE) && cut_end > cut_begin) { + if(should_flush_mapping(entry) && cut_end > cut_begin) { const auto rel = cut_begin - entry_begin; flush_ranges.push_back( flush_range{addr_to_ptr(cut_begin), diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 929065b60..8c232b54a 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -1833,14 +1833,26 @@ gkfs_close(unsigned int fd) { file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending)); } if(file->type() == gkfs::filemap::FileType::regular) { - auto flush_err = gkfs_mmap_flush_all_for_path(path); - if(flush_err < 0) { - LOG(WARNING, "{}() mmap flush failed for path '{}'", __func__, - path); - } - if(is_offload) { - LOG(DEBUG, "{}() offload close mmap_flush path '{}' rc {}", - __func__, path, flush_err); + const bool should_close_flush = + file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending) || + file->get_flag(gkfs::filemap::OpenFile_flags::wronly) || + file->get_flag(gkfs::filemap::OpenFile_flags::append) || + file->get_flag(gkfs::filemap::OpenFile_flags::trunc) || + file->get_flag(gkfs::filemap::OpenFile_flags::creat); + if(should_close_flush) { + auto flush_err = gkfs_mmap_flush_all_for_path(path); + if(flush_err < 0) { + LOG(WARNING, "{}() mmap flush failed for path '{}'", __func__, + path); + } + if(is_offload) { + LOG(DEBUG, "{}() offload close mmap_flush path '{}' rc {}", + __func__, path, flush_err); + } + } else if(is_offload) { + LOG(DEBUG, + "{}() offload close skip mmap_flush path '{}' (read-intent fd)", + __func__, path); } } -- GitLab From a6675c6a86cb97aa12c16d18ab96c95791984547 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sat, 28 Feb 2026 22:35:29 +0100 Subject: [PATCH 47/68] fix --- src/client/gkfs_functions.cpp | 15 ++++++++++++--- src/client/gkfs_metadata.cpp | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 2f9f8db22..86b478e03 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -88,6 +88,7 @@ struct mmap_entry { off_t offset; int prot; int flags; + bool write_intent; // We use addr as the unique key for the set bool @@ -143,8 +144,8 @@ is_shared_map_type(int flags) { bool should_flush_mapping(const mmap_entry& entry) { - return (entry.prot & PROT_WRITE) && is_shared_map_type(entry.flags) && - entry.length != 0; + return entry.write_intent && (entry.prot & PROT_WRITE) && + is_shared_map_type(entry.flags) && entry.length != 0; } bool @@ -431,6 +432,13 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, return MAP_FAILED; } std::string path = gkfs_fd->path(); + const bool write_intent = + gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::wronly) || + gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::rdwr) || + gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::append) || + gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::trunc) || + gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::creat) || + gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::creation_pending); void* ptr = ::mmap(addr, length, prot | PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); @@ -442,7 +450,8 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, { std::lock_guard lock(get_mmap_set_mutex()); get_mmap_set().insert( - mmap_entry{ptr, fd, path, length, offset, prot, flags}); + mmap_entry{ptr, fd, path, length, offset, prot, flags, + write_intent}); } gkfs::syscall::gkfs_pread(fd, ptr, length, offset); diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 8c232b54a..f20600fe7 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -1836,6 +1836,7 @@ gkfs_close(unsigned int fd) { const bool should_close_flush = file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending) || file->get_flag(gkfs::filemap::OpenFile_flags::wronly) || + file->get_flag(gkfs::filemap::OpenFile_flags::rdwr) || file->get_flag(gkfs::filemap::OpenFile_flags::append) || file->get_flag(gkfs::filemap::OpenFile_flags::trunc) || file->get_flag(gkfs::filemap::OpenFile_flags::creat); -- GitLab From 66c82c9acb5b4baf13a98d78f69241dd3786098a Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sun, 1 Mar 2026 07:18:39 +0100 Subject: [PATCH 48/68] fix --- src/client/gkfs_functions.cpp | 1 - src/client/gkfs_metadata.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 86b478e03..3cf9e120e 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -434,7 +434,6 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, std::string path = gkfs_fd->path(); const bool write_intent = gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::wronly) || - gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::rdwr) || gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::append) || gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::trunc) || gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::creat) || diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index f20600fe7..8c232b54a 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -1836,7 +1836,6 @@ gkfs_close(unsigned int fd) { const bool should_close_flush = file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending) || file->get_flag(gkfs::filemap::OpenFile_flags::wronly) || - file->get_flag(gkfs::filemap::OpenFile_flags::rdwr) || file->get_flag(gkfs::filemap::OpenFile_flags::append) || file->get_flag(gkfs::filemap::OpenFile_flags::trunc) || file->get_flag(gkfs::filemap::OpenFile_flags::creat); -- GitLab From 0e18dacfdc527c0c77a4e7dad5b0e9801e378f2f Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sun, 1 Mar 2026 11:03:17 +0100 Subject: [PATCH 49/68] fix mmap --- include/client/gkfs_functions.hpp | 11 +++++++++ src/client/gkfs_data.cpp | 23 ++++++++--------- src/client/gkfs_functions.cpp | 41 ++++++++++++++++++++----------- src/client/gkfs_metadata.cpp | 22 ++++++++--------- 4 files changed, 60 insertions(+), 37 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index d28164f10..3e61c38a5 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -202,6 +202,17 @@ gkfs_munmap(void* addr, size_t length); int gkfs_msync(void* addr, size_t length, int flags); +/** + * Returns true only for file descriptors that carry explicit write intent for + * mmap-backed persistence decisions. + * + * Note: O_RDWR by itself is not considered sufficient write intent here; many + * reader paths use read/write descriptors without intending to publish mmap + * modifications. + */ +bool +gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file); + } // namespace gkfs::syscall // gkfs_getsingleserverdir is using extern "C" to demangle it for C usage diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 6d14f26dc..2bb410020 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -271,8 +271,7 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, // extra flags. For now, if migrated is true, we force an update. if(migrated) { auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, - migrated || offload_temp); + *path, count, offset, is_append, migrated || offload_temp); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", @@ -563,8 +562,8 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, pair ret; if(gkfs::config::proxy::fwd_io && CTX->use_proxy() && count > gkfs::config::proxy::fwd_io_count_threshold) { - ret = - gkfs::rpc::forward_read_proxy(file.path(), buf, offset, count); + ret = gkfs::rpc::forward_read_proxy(file.path(), buf, offset, + count); } else { std::set failed; // set with failed targets. if(CTX->get_replicas() != 0) { @@ -572,15 +571,17 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, CTX->get_replicas(), failed); while(ret.first == EIO) { - ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, - CTX->get_replicas(), failed); - LOG(WARNING, "gkfs::rpc::forward_read() failed with ret '{}'", + ret = gkfs::rpc::forward_read(file.path(), buf, offset, + count, CTX->get_replicas(), + failed); + LOG(WARNING, + "gkfs::rpc::forward_read() failed with ret '{}'", ret.first); } } else { - ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, 0, - failed); + ret = gkfs::rpc::forward_read(file.path(), buf, offset, count, + 0, failed); } } return ret; @@ -616,8 +617,8 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, if(needs_offload_retry(ret.second)) { constexpr int k_offload_eof_retry_count = 2500; constexpr auto k_offload_eof_retry_sleep = std::chrono::milliseconds(2); - for(int attempt = 1; - attempt <= k_offload_eof_retry_count && needs_offload_retry(ret.second); + for(int attempt = 1; attempt <= k_offload_eof_retry_count && + needs_offload_retry(ret.second); ++attempt) { // Trigger pending mmap-backed writes, then wait for visible size. (void) gkfs_mmap_flush_for_path(file.path(), offset, count); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 3cf9e120e..481b4180d 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -144,6 +144,8 @@ is_shared_map_type(int flags) { bool should_flush_mapping(const mmap_entry& entry) { + // Only shared + writable maps with explicit write intent are eligible for + // daemon flush. This avoids publishing reader-side anonymous snapshots. return entry.write_intent && (entry.prot & PROT_WRITE) && is_shared_map_type(entry.flags) && entry.length != 0; } @@ -262,6 +264,19 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, namespace gkfs::syscall { +bool +gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file) { + // Keep this predicate as the single policy source for mmap-backed flush + // decisions. O_RDWR alone is intentionally excluded: some reader-side + // workflows open files as read/write but never intend to publish mmap + // modifications to the daemon. + return file.get_flag(gkfs::filemap::OpenFile_flags::wronly) || + file.get_flag(gkfs::filemap::OpenFile_flags::append) || + file.get_flag(gkfs::filemap::OpenFile_flags::trunc) || + file.get_flag(gkfs::filemap::OpenFile_flags::creat) || + file.get_flag(gkfs::filemap::OpenFile_flags::creation_pending); +} + /** * Flush any pending write-mmap for `path` directly to the GekkoFS daemon. * @@ -345,8 +360,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } const int flush_rc = flush_range_chunked(path, addr, len, off); - LOG(DEBUG, - "{}() flush result path '{}' addr {} len {} off {} rc {}", + LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", __func__, path, addr, len, off, flush_rc); if(flush_rc == k_flush_unavailable) { // Mapping raced with unmap; treat as best-effort flush miss and @@ -386,8 +400,8 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { continue; } - const int flush_rc = - flush_range_chunked(path, entry.addr, entry.length, entry.offset); + const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, + entry.offset); LOG(DEBUG, "{}() flush-all result path '{}' addr {} len {} off {} rc {}", __func__, path, entry.addr, entry.length, entry.offset, flush_rc); @@ -411,8 +425,9 @@ bool gkfs_mmap_has_active_path(const std::string& path) { std::lock_guard lock(get_mmap_set_mutex()); const auto& mmap_set = get_mmap_set(); - return std::any_of(mmap_set.begin(), mmap_set.end(), - [&path](const auto& entry) { return entry.path == path; }); + return std::any_of( + mmap_set.begin(), mmap_set.end(), + [&path](const auto& entry) { return entry.path == path; }); } void* @@ -432,12 +447,9 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, return MAP_FAILED; } std::string path = gkfs_fd->path(); - const bool write_intent = - gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::wronly) || - gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::append) || - gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::trunc) || - gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::creat) || - gkfs_fd->get_flag(gkfs::filemap::OpenFile_flags::creation_pending); + // Persist the open-time write intent with the mapping so read-triggered + // flushes and close-time flushes follow exactly the same policy. + const bool write_intent = gkfs_has_explicit_mmap_write_intent(*gkfs_fd); void* ptr = ::mmap(addr, length, prot | PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); @@ -448,9 +460,8 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Register mapping under lock so concurrent threads don't race on mmap_set { std::lock_guard lock(get_mmap_set_mutex()); - get_mmap_set().insert( - mmap_entry{ptr, fd, path, length, offset, prot, flags, - write_intent}); + get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot, + flags, write_intent}); } gkfs::syscall::gkfs_pread(fd, ptr, length, offset); diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 8c232b54a..1e85043bd 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -418,8 +418,8 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { generate_lock_file(path, true); } if(is_offload_temp_path(path)) { - LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", __func__, - path, fd, flags); + LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", __func__, path, + fd, flags); } return fd; } @@ -1830,20 +1830,19 @@ gkfs_close(unsigned int fd) { LOG(DEBUG, "{}() offload close-begin path '{}' fd {} pos {} type {} creation_pending {}", __func__, path, fd, file->pos(), static_cast(file->type()), - file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending)); + file->get_flag( + gkfs::filemap::OpenFile_flags::creation_pending)); } if(file->type() == gkfs::filemap::FileType::regular) { + // Reuse the same write-intent predicate as mmap registration to + // keep close-time and read-triggered flush behavior consistent. const bool should_close_flush = - file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending) || - file->get_flag(gkfs::filemap::OpenFile_flags::wronly) || - file->get_flag(gkfs::filemap::OpenFile_flags::append) || - file->get_flag(gkfs::filemap::OpenFile_flags::trunc) || - file->get_flag(gkfs::filemap::OpenFile_flags::creat); + gkfs_has_explicit_mmap_write_intent(*file); if(should_close_flush) { auto flush_err = gkfs_mmap_flush_all_for_path(path); if(flush_err < 0) { - LOG(WARNING, "{}() mmap flush failed for path '{}'", __func__, - path); + LOG(WARNING, "{}() mmap flush failed for path '{}'", + __func__, path); } if(is_offload) { LOG(DEBUG, "{}() offload close mmap_flush path '{}' rc {}", @@ -1877,7 +1876,8 @@ gkfs_close(unsigned int fd) { return -1; } if(is_offload) { - auto [updates, cached_size] = CTX->write_size_cache()->get(path); + auto [updates, cached_size] = + CTX->write_size_cache()->get(path); LOG(DEBUG, "{}() offload close cache path '{}' updates {} cached_size {}", __func__, path, updates, cached_size); -- GitLab From 78f83447f8a16e49b23518f982dd62abc803db58 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sun, 1 Mar 2026 22:20:56 +0100 Subject: [PATCH 50/68] fix s3d --- include/client/gkfs_functions.hpp | 9 +++-- src/client/gkfs_functions.cpp | 15 ++++---- src/client/hooks.cpp | 60 ++++++++++++++++++++++--------- 3 files changed, 56 insertions(+), 28 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index 3e61c38a5..0f585b1ab 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -203,12 +203,11 @@ int gkfs_msync(void* addr, size_t length, int flags); /** - * Returns true only for file descriptors that carry explicit write intent for - * mmap-backed persistence decisions. + * Returns true for file descriptors that should publish mmap-backed writes to + * the daemon. * - * Note: O_RDWR by itself is not considered sufficient write intent here; many - * reader paths use read/write descriptors without intending to publish mmap - * modifications. + * This predicate is path-agnostic on purpose: mmap flush policy should depend + * on open intent, not on workload-specific pathname conventions. */ bool gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 481b4180d..ce5e63df4 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -267,14 +267,15 @@ namespace gkfs::syscall { bool gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file) { // Keep this predicate as the single policy source for mmap-backed flush - // decisions. O_RDWR alone is intentionally excluded: some reader-side - // workflows open files as read/write but never intend to publish mmap - // modifications to the daemon. + // decisions. + // Path-specific behavior is intentionally avoided to keep policy consistent + // across syscall/libc interception modes and applications. return file.get_flag(gkfs::filemap::OpenFile_flags::wronly) || file.get_flag(gkfs::filemap::OpenFile_flags::append) || file.get_flag(gkfs::filemap::OpenFile_flags::trunc) || file.get_flag(gkfs::filemap::OpenFile_flags::creat) || - file.get_flag(gkfs::filemap::OpenFile_flags::creation_pending); + file.get_flag(gkfs::filemap::OpenFile_flags::creation_pending) || + file.get_flag(gkfs::filemap::OpenFile_flags::rdwr); } /** @@ -443,8 +444,10 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // storing the path lets munmap/msync flush data back even when fd is gone. auto gkfs_fd = CTX->file_map()->get(fd); if(!gkfs_fd) { - errno = EBADF; - return MAP_FAILED; + // Descriptor tracking can race with concurrent close/reuse in + // syscall-intercepted multi-threaded runtimes (e.g., MPI internals). + // In that case, gracefully defer to the kernel mmap path. + return ::mmap(addr, length, prot, flags, fd, offset); } std::string path = gkfs_fd->path(); // Persist the open-time write intent with the mapping so read-triggered diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index a0a68188a..406b699ea 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -277,8 +277,11 @@ hook_read(unsigned int fd, void* buf, size_t count) { return -EFAULT; } - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_read(fd, buf, count)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_read(fd, buf, count); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_read, fd, buf, count); } @@ -289,8 +292,11 @@ hook_pread(unsigned int fd, char* buf, size_t count, loff_t pos) { LOG(DEBUG, "{}() called with fd: {}, buf: {}, count: {}, pos: {}", __func__, fd, fmt::ptr(buf), count, pos); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_pread(fd, buf, count, pos)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_pread(fd, buf, count, pos); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } /* Since kernel 2.6: pread() became pread64(), and pwrite() became * pwrite64(). */ @@ -303,8 +309,11 @@ hook_readv(unsigned long fd, const struct iovec* iov, unsigned long iovcnt) { LOG(DEBUG, "{}() called with fd: {}, iov: {}, iovcnt: {}", __func__, fd, fmt::ptr(iov), iovcnt); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_readv(fd, iov, iovcnt)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_readv(fd, iov, iovcnt); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_readv, fd, iov, iovcnt); } @@ -319,8 +328,11 @@ hook_preadv(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, // "pos_h: {}", // __func__, fd, fmt::ptr(iov), iovcnt, pos_l, pos_h); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_preadv(fd, iov, iovcnt, pos_l)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_preadv(fd, iov, iovcnt, pos_l); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_preadv, fd, iov, iovcnt, pos_l); } @@ -334,8 +346,11 @@ hook_write(unsigned int fd, const char* buf, size_t count) { return -EFAULT; } - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_write(fd, buf, count)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_write(fd, buf, count); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_write, fd, buf, count); } @@ -346,8 +361,11 @@ hook_pwrite(unsigned int fd, const char* buf, size_t count, loff_t pos) { LOG(DEBUG, "{}() called with fd: {}, buf: {}, count: {}, pos: {}", __func__, fd, fmt::ptr(buf), count, pos); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_pwrite(fd, buf, count, pos)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_pwrite(fd, buf, count, pos); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } /* Since kernel 2.6: pread() became pread64(), and pwrite() became * pwrite64(). */ @@ -360,8 +378,11 @@ hook_writev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt) { LOG(DEBUG, "{}() called with fd: {}, iov: {}, iovcnt: {}", __func__, fd, fmt::ptr(iov), iovcnt); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_writev(fd, iov, iovcnt)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_writev(fd, iov, iovcnt); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_writev, fd, iov, iovcnt); } @@ -376,8 +397,11 @@ hook_pwritev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, "pos_h: {}", __func__, fd, fmt::ptr(iov), iovcnt, pos_l, pos_h); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_pwritev(fd, iov, iovcnt, pos_l)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_pwritev(fd, iov, iovcnt, pos_l); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_pwritev, fd, iov, iovcnt, pos_l); } @@ -1346,7 +1370,9 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); - if(CTX->file_map()->exist(fd)) { + // Single-lookup check to avoid TOCTTOU races with concurrent close/reuse. + // If the descriptor is no longer tracked, fall back to the real syscall. + if(CTX->file_map()->get(fd)) { return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast(syscall_no_intercept_wrapper( -- GitLab From 4a5291c334efa49fbf37f9ad857007056a581d02 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sun, 1 Mar 2026 22:48:23 +0100 Subject: [PATCH 51/68] s3d+flex --- include/client/gkfs_functions.hpp | 15 +++ include/client/open_file_map.hpp | 1 + src/client/gkfs_data.cpp | 95 ++++++++-------- src/client/gkfs_functions.cpp | 119 ++++++++++++++++---- src/client/gkfs_metadata.cpp | 173 ++++++++++++++++-------------- src/client/hooks.cpp | 49 ++++----- 6 files changed, 280 insertions(+), 172 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index 0f585b1ab..6ed26beb2 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -155,6 +155,21 @@ gkfs_mmap_flush_all_for_path(const std::string& path); bool gkfs_mmap_has_active_path(const std::string& path); +bool +gkfs_mmap_has_active_write_path(const std::string& path); + +void +gkfs_register_deferred_trunc(const std::string& path); + +void +gkfs_unregister_deferred_trunc(const std::string& path); + +bool +gkfs_has_deferred_trunc(const std::string& path); + +int +gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file); + ssize_t gkfs_read(int fd, void* buf, size_t count); diff --git a/include/client/open_file_map.hpp b/include/client/open_file_map.hpp index 8dbed3c38..e4b6bd609 100644 --- a/include/client/open_file_map.hpp +++ b/include/client/open_file_map.hpp @@ -63,6 +63,7 @@ enum class OpenFile_flags { cloexec, created, // indicates if the file was created during open creation_pending, // indicates if the file creation is delayed + trunc_pending, // indicates O_TRUNC has been deferred flag_count // this is purely used as a size variable of this enum class }; diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 2bb410020..0f651a3e7 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -76,8 +76,31 @@ namespace gkfs::syscall { namespace { bool -is_offload_temp_path(const std::string& path) { - return path.rfind("/offload/t_", 0) == 0; +is_suspicious_zero_prefix(const void* buf, long read_ret, off64_t offset, + size_t count) { + if(offset != 0 || count < 8) { + return false; + } + if(read_ret == 0) { + return true; + } + if(read_ret < 8) { + return false; + } + const auto* cbuf = reinterpret_cast(buf); + for(size_t i = 0; i < 8; ++i) { + if(cbuf[i] != 0U) { + return false; + } + } + return true; +} + +bool +should_retry_read_after_flush(const gkfs::filemap::OpenFile& file, long read_ret, + const void* buf, off64_t offset, size_t count) { + return gkfs_mmap_has_active_write_path(file.path()) && + is_suspicious_zero_prefix(buf, read_ret, offset, count); } } // namespace @@ -104,7 +127,6 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } int err; auto path = make_unique(file.path()); - const bool offload_temp = is_offload_temp_path(*path); auto is_append = file.get_flag(gkfs::filemap::OpenFile_flags::append); ssize_t write_size = 0; auto num_replicas = CTX->get_replicas(); @@ -141,6 +163,9 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } file.set_flag(gkfs::filemap::OpenFile_flags::creation_pending, false); } + if(gkfs_publish_deferred_trunc(file) < 0) { + return -1; + } // clear inline data cache as it is stale if(!file.inline_data().empty()) @@ -153,12 +178,6 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, (offset + count) <= gkfs::config::metadata::inline_data_size)) { bool allow_inline = true; - if(offload_temp) { - // Offload temp files frequently do sparse writes right after small - // headers. Keep them in chunk storage to avoid inline/chunk - // visibility races under concurrent readers. - allow_inline = false; - } // Check if the file is actually larger than the inline header limits us // to. This can happen if we have a write size cache enabled and the // file is large but the metadata is not updated yet on the server. If @@ -230,7 +249,7 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, if(is_append) { auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, migrated || offload_temp); + *path, count, offset, is_append, migrated); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", err); @@ -271,7 +290,7 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, // extra flags. For now, if migrated is true, we force an update. if(migrated) { auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, migrated || offload_temp); + *path, count, offset, is_append, migrated); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", @@ -283,7 +302,7 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } else if(!is_append) { auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, migrated || offload_temp); + *path, count, offset, is_append, migrated); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", err); @@ -353,12 +372,10 @@ gkfs_write_ws(gkfs::filemap::OpenFile& file, const char* buf, size_t count, #else auto written = gkfs_do_write(file, buf, count, offset, update_pos); #endif - if(is_offload_temp_path(file.path())) { - LOG(DEBUG, - "{}() offload path '{}' count {} offset {} update_pos {} written {} pos_before {} pos_after {}", - __func__, file.path(), count, offset, update_pos, written, - pos_before, file.pos()); - } + LOG(DEBUG, + "{}() path '{}' count {} offset {} update_pos {} written {} pos_before {} pos_after {}", + __func__, file.path(), count, offset, update_pos, written, pos_before, + file.pos()); return written; } @@ -594,40 +611,21 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, errno = err; return -1; } - const bool offload_magic_guard = - (is_offload_temp_path(file.path()) && offset == 0 && count >= 8); - auto needs_offload_retry = [&](long read_ret) { - if(!offload_magic_guard) { - return false; - } - if(read_ret == 0) { - return true; - } - if(read_ret < 8) { - return false; - } - const auto* cbuf = reinterpret_cast(buf); - for(size_t i = 0; i < 8; ++i) { - if(cbuf[i] != 0) { - return false; - } - } - return true; - }; - if(needs_offload_retry(ret.second)) { - constexpr int k_offload_eof_retry_count = 2500; - constexpr auto k_offload_eof_retry_sleep = std::chrono::milliseconds(2); - for(int attempt = 1; attempt <= k_offload_eof_retry_count && - needs_offload_retry(ret.second); + if(should_retry_read_after_flush(file, ret.second, buf, offset, count)) { + constexpr int k_read_retry_count = 2500; + constexpr auto k_read_retry_sleep = std::chrono::milliseconds(2); + for(int attempt = 1; + attempt <= k_read_retry_count && + should_retry_read_after_flush(file, ret.second, buf, offset, count); ++attempt) { - // Trigger pending mmap-backed writes, then wait for visible size. + // Trigger pending mmap-backed publication, then retry the read. (void) gkfs_mmap_flush_for_path(file.path(), offset, count); - std::this_thread::sleep_for(k_offload_eof_retry_sleep); + std::this_thread::sleep_for(k_read_retry_sleep); auto [size_err, remote_size] = gkfs::rpc::forward_get_metadentry_size(file.path(), 0); if(size_err == 0 && remote_size > 0 && (attempt % 50) == 0) { LOG(DEBUG, - "{}() offload EOF guard path '{}' attempt {} remote_size {}", + "{}() read retry path '{}' attempt {} remote_size {}", __func__, file.path(), attempt, remote_size); } ret = do_chunk_read(); @@ -635,9 +633,10 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, errno = ret.first; return -1; } - if(!needs_offload_retry(ret.second)) { + if(!should_retry_read_after_flush(file, ret.second, buf, offset, + count)) { LOG(DEBUG, - "{}() offload EOF guard recovered path '{}' after retry {} bytes {}", + "{}() read retry recovered path '{}' after retry {} bytes {}", __func__, file.path(), attempt, ret.second); break; } diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index ce5e63df4..ccdce441d 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -59,6 +59,8 @@ #include #include #include +#include +#include #include #ifdef GKFS_ENABLE_CLIENT_METRICS @@ -89,6 +91,8 @@ struct mmap_entry { int prot; int flags; bool write_intent; + size_t seeded_bytes; + uint64_t flush_epoch; // We use addr as the unique key for the set bool @@ -150,11 +154,6 @@ should_flush_mapping(const mmap_entry& entry) { is_shared_map_type(entry.flags) && entry.length != 0; } -bool -is_offload_temp_path(const std::string& path) { - return path.rfind("/offload/t_", 0) == 0; -} - size_t tracked_segments_for_path(const std::string& path) { size_t segments = 0; @@ -169,6 +168,40 @@ tracked_segments_for_path(const std::string& path) { constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; constexpr int k_flush_ok = 0; constexpr int k_flush_unavailable = 1; +constexpr int k_flush_deferred = 2; + +std::atomic& +flush_epoch_counter() { + static std::atomic counter{0}; + return counter; +} + +enum class flush_mode { read_triggered, explicit_sync }; + +bool +has_non_zero_prefix(const char* buf, size_t size) { + const size_t prefix = std::min(size, 8); + for(size_t i = 0; i < prefix; ++i) { + if(static_cast(buf[i]) != 0U) { + return true; + } + } + return false; +} + +bool +remote_prefix_is_non_zero(const std::string& path) { + std::array remote_prefix{}; + std::set failed; + auto [rerr, rsize] = + gkfs::rpc::forward_read(path, remote_prefix.data(), 0, + remote_prefix.size(), 0, failed); + if(rerr != 0 || rsize <= 0) { + return false; + } + return has_non_zero_prefix(remote_prefix.data(), + static_cast(rsize)); +} int open_self_mem_fd() { @@ -178,7 +211,7 @@ open_self_mem_fd() { int flush_range_chunked(const std::string& path, const void* addr, size_t len, - off_t off) { + off_t off, flush_mode mode) { const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); @@ -229,6 +262,18 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, return k_flush_unavailable; } + // Read-triggered publication is best-effort and should not overwrite + // already-visible non-zero headers with a stale all-zero snapshot. + if(mode == flush_mode::read_triggered && chunk_off == 0 && + !has_non_zero_prefix(bounce.data(), copied) && + remote_prefix_is_non_zero(path)) { + LOG(DEBUG, + "{}() skipping stale zero-prefix publication path '{}' off {} len {}", + __func__, path, chunk_off, copied); + (void) ::syscall_no_intercept(SYS_close, mem_fd); + return k_flush_deferred; + } + auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), chunk_off, copied, 0); if(werr) { @@ -278,6 +323,17 @@ gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file) { file.get_flag(gkfs::filemap::OpenFile_flags::rdwr); } +bool +gkfs_mmap_has_active_write_path(const std::string& path) { + std::lock_guard lock(get_mmap_set_mutex()); + const auto& mmap_set = get_mmap_set(); + return std::any_of(mmap_set.begin(), mmap_set.end(), + [&path](const auto& entry) { + return entry.path == path && + should_flush_mapping(entry); + }); +} + /** * Flush any pending write-mmap for `path` directly to the GekkoFS daemon. * @@ -334,6 +390,14 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", __func__, path, addr, len, off); + if(gkfs_has_deferred_trunc(path)) { + LOG(DEBUG, + "{}() deferring read-triggered flush path '{}' while truncate is pending", + __func__, path); + ++it; + continue; + } + // Avoid raw page dereferences here: stale segments can exist when // munmap happens outside the interceptor path. mincore() lets us detect // unmapped pages safely and drop stale tracking entries. @@ -360,7 +424,9 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, continue; } - const int flush_rc = flush_range_chunked(path, addr, len, off); + const int flush_rc = + flush_range_chunked(path, addr, len, off, + flush_mode::read_triggered); LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", __func__, path, addr, len, off, flush_rc); if(flush_rc == k_flush_unavailable) { @@ -369,11 +435,11 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, LOG(DEBUG, "{}() dropping unavailable overlap path '{}' addr {} len {} off {}", __func__, path, addr, len, off); - if(is_offload_temp_path(path)) { - ++it; - } else { - it = mmap_set.erase(it); - } + it = mmap_set.erase(it); + continue; + } + if(flush_rc == k_flush_deferred) { + ++it; continue; } if(flush_rc < 0) { @@ -383,6 +449,11 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, errno = EIO; return -1; } + const auto next_epoch = flush_epoch_counter().fetch_add(1) + 1; + auto updated = entry; + updated.flush_epoch = next_epoch; + it = mmap_set.erase(it); + it = mmap_set.insert(updated).first; ++flushed_segments; ++it; } @@ -401,8 +472,9 @@ gkfs_mmap_flush_all_for_path(const std::string& path) { continue; } - const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, - entry.offset); + const int flush_rc = + flush_range_chunked(path, entry.addr, entry.length, + entry.offset, flush_mode::explicit_sync); LOG(DEBUG, "{}() flush-all result path '{}' addr {} len {} off {} rc {}", __func__, path, entry.addr, entry.length, entry.offset, flush_rc); @@ -464,10 +536,21 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, { std::lock_guard lock(get_mmap_set_mutex()); get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot, - flags, write_intent}); + flags, write_intent, 0, 0}); } - gkfs::syscall::gkfs_pread(fd, ptr, length, offset); + const auto seeded = gkfs::syscall::gkfs_pread(fd, ptr, length, offset); + if(seeded > 0) { + std::lock_guard lock(get_mmap_set_mutex()); + auto it = std::find_if(get_mmap_set().begin(), get_mmap_set().end(), + [ptr](const auto& e) { return e.addr == ptr; }); + if(it != get_mmap_set().end()) { + auto updated = *it; + get_mmap_set().erase(it); + updated.seeded_bytes = static_cast(seeded); + get_mmap_set().insert(std::move(updated)); + } + } if(!(prot & PROT_WRITE) && prot != (prot | PROT_READ | PROT_WRITE)) { ::mprotect(ptr, length, prot); @@ -528,7 +611,7 @@ gkfs_msync(void* addr, size_t length, int flags) { for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, - range.offset) < 0) { + range.offset, flush_mode::explicit_sync) < 0) { return -1; } } @@ -617,7 +700,7 @@ gkfs_munmap(void* addr, size_t length) { for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, - range.offset) < 0) { + range.offset, flush_mode::explicit_sync) < 0) { return -1; } } diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 1e85043bd..9f53b7950 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #ifdef GKFS_ENABLE_CLIENT_METRICS @@ -122,27 +123,77 @@ check_parent_dir(const std::string& path) { return 0; } -bool -is_offload_temp_path(const std::string& path) { - return path.rfind("/offload/t_", 0) == 0; +std::mutex& +deferred_trunc_mutex() { + static std::mutex mtx; + return mtx; +} + +std::unordered_map& +deferred_trunc_paths() { + static std::unordered_map paths; + return paths; } bool -should_defer_offload_trunc(const std::string& path, int flags) { - if(!is_offload_temp_path(path)) { +should_defer_trunc_for_consistency(const std::string& path, int flags) { + if(!(flags & O_TRUNC) || !((flags & O_WRONLY) || (flags & O_RDWR))) { return false; } - // Offload temp files are often reopened with O_CREAT|O_TRUNC and then - // closed without writes by user code. Truncating eagerly here can expose - // empty files to concurrent readers. - return (flags & O_CREAT) && (flags & O_TRUNC) && - ((flags & O_WRONLY) || (flags & O_RDWR)); + // Keep truncate publication ordered with mmap-backed publication to avoid + // exposing mixed old/new views to concurrent readers. + return gkfs::syscall::gkfs_mmap_has_active_path(path) || + gkfs::syscall::gkfs_has_deferred_trunc(path); } } // namespace namespace gkfs::syscall { +void +gkfs_register_deferred_trunc(const std::string& path) { + std::lock_guard lock(deferred_trunc_mutex()); + auto& deferred = deferred_trunc_paths(); + deferred[path]++; +} + +void +gkfs_unregister_deferred_trunc(const std::string& path) { + std::lock_guard lock(deferred_trunc_mutex()); + auto& deferred = deferred_trunc_paths(); + auto it = deferred.find(path); + if(it == deferred.end()) { + return; + } + if(it->second <= 1) { + deferred.erase(it); + } else { + --(it->second); + } +} + +bool +gkfs_has_deferred_trunc(const std::string& path) { + std::lock_guard lock(deferred_trunc_mutex()); + const auto& deferred = deferred_trunc_paths(); + return deferred.find(path) != deferred.end(); +} + +int +gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file) { + if(!file.get_flag(gkfs::filemap::OpenFile_flags::trunc_pending)) { + return 0; + } + const auto path = file.path(); + auto md = gkfs::utils::get_metadata(path); + if(md && gkfs_truncate(path, md->size(), 0)) { + return -1; + } + file.set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, false); + gkfs_unregister_deferred_trunc(path); + return 0; +} + /** * @brief generate_lock_file @@ -238,14 +289,16 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { file->mode(mode); file->set_flag(gkfs::filemap::OpenFile_flags::creation_pending, true); + if(should_defer_trunc_for_consistency(path, flags)) { + file->set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, + true); + file->inline_data(""); + file->inline_data_size(0); + gkfs_register_deferred_trunc(path); + } if(CTX->protect_files_generator()) { generate_lock_file(path, true); } - if(is_offload_temp_path(path)) { - LOG(DEBUG, - "{}() offload open path '{}' fd {} flags {} create_pending 1", - __func__, path, fd, flags); - } return fd; } @@ -286,10 +339,6 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } - if(is_offload_temp_path(path)) { - LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", - __func__, path, fd, flags); - } // file was successfully created. Add to filemap return fd; } @@ -348,13 +397,9 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(should_defer_offload_trunc(new_path, flags)) { + if(should_defer_trunc_for_consistency(new_path, flags)) { LOG(DEBUG, - "{}() deferring O_TRUNC for offload temp path '{}' flags {}", - __func__, new_path, flags); - } else if(gkfs_mmap_has_active_path(new_path)) { - LOG(DEBUG, - "{}() deferring O_TRUNC for active mmap path '{}'", + "{}() deferring O_TRUNC for consistency path '{}'", __func__, new_path); } else { if(gkfs_truncate(new_path, md.size(), 0)) { @@ -366,13 +411,19 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { } } // RENAMED OR SYMLINK NOT PROTECTED - auto fd = CTX->file_map()->add( + auto file = std::make_shared(new_path, - flags)); - if(is_offload_temp_path(new_path)) { - LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", - __func__, new_path, fd, flags); + flags); + if((flags & O_TRUNC) && + ((flags & O_RDWR) || (flags & O_WRONLY)) && + should_defer_trunc_for_consistency(new_path, flags)) { + file->set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, + true); + file->inline_data(""); + file->inline_data_size(0); + gkfs_register_deferred_trunc(new_path); } + auto fd = CTX->file_map()->add(file); return fd; } } @@ -385,12 +436,8 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { assert(S_ISREG(md.mode())); if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(should_defer_offload_trunc(path, flags)) { - LOG(DEBUG, - "{}() deferring O_TRUNC for offload temp path '{}' flags {}", - __func__, path, flags); - } else if(gkfs_mmap_has_active_path(path)) { - LOG(DEBUG, "{}() deferring O_TRUNC for active mmap path '{}'", + if(should_defer_trunc_for_consistency(path, flags)) { + LOG(DEBUG, "{}() deferring O_TRUNC for consistency path '{}'", __func__, path); } else { if(gkfs_truncate(path, md.size(), 0)) { @@ -407,6 +454,13 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { file->inline_data(md.inline_data()); file->inline_data_size(md.size()); // Store the actual file size } + if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY)) && + should_defer_trunc_for_consistency(path, flags)) { + file->set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, true); + file->inline_data(""); + file->inline_data_size(0); + gkfs_register_deferred_trunc(path); + } auto fd = CTX->file_map()->add(file); @@ -417,10 +471,6 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } - if(is_offload_temp_path(path)) { - LOG(DEBUG, "{}() offload open path '{}' fd {} flags {}", __func__, path, - fd, flags); - } return fd; } @@ -1825,14 +1875,6 @@ gkfs_close(unsigned int fd) { auto file = CTX->file_map()->get(fd); if(file) { const auto path = file->path(); - const auto is_offload = is_offload_temp_path(path); - if(is_offload) { - LOG(DEBUG, - "{}() offload close-begin path '{}' fd {} pos {} type {} creation_pending {}", - __func__, path, fd, file->pos(), static_cast(file->type()), - file->get_flag( - gkfs::filemap::OpenFile_flags::creation_pending)); - } if(file->type() == gkfs::filemap::FileType::regular) { // Reuse the same write-intent predicate as mmap registration to // keep close-time and read-triggered flush behavior consistent. @@ -1844,14 +1886,6 @@ gkfs_close(unsigned int fd) { LOG(WARNING, "{}() mmap flush failed for path '{}'", __func__, path); } - if(is_offload) { - LOG(DEBUG, "{}() offload close mmap_flush path '{}' rc {}", - __func__, path, flush_err); - } - } else if(is_offload) { - LOG(DEBUG, - "{}() offload close skip mmap_flush path '{}' (read-intent fd)", - __func__, path); } } @@ -1859,11 +1893,10 @@ gkfs_close(unsigned int fd) { gkfs_create(path, file->mode()); file->set_flag(gkfs::filemap::OpenFile_flags::creation_pending, false); - if(is_offload) { - LOG(DEBUG, - "{}() offload close created pending path '{}' mode {}", - __func__, path, file->mode()); - } + } + + if(gkfs_publish_deferred_trunc(*file) < 0) { + return -1; } // flush write size cache to be server consistent @@ -1875,13 +1908,6 @@ gkfs_close(unsigned int fd) { errno = err; return -1; } - if(is_offload) { - auto [updates, cached_size] = - CTX->write_size_cache()->get(path); - LOG(DEBUG, - "{}() offload close cache path '{}' updates {} cached_size {}", - __func__, path, updates, cached_size); - } } if(CTX->use_dentry_cache() && gkfs::config::cache::clear_dentry_cache_on_close) { @@ -1895,17 +1921,6 @@ gkfs_close(unsigned int fd) { generate_lock_file(path, false); } - if(is_offload) { - std::pair size_ret{}; - if(gkfs::config::proxy::fwd_get_size && CTX->use_proxy()) { - size_ret = gkfs::rpc::forward_get_metadentry_size_proxy(path); - } else { - size_ret = gkfs::rpc::forward_get_metadentry_size(path, 0); - } - LOG(DEBUG, - "{}() offload close-end path '{}' fd {} size_err {} server_size {}", - __func__, path, fd, size_ret.first, size_ret.second); - } // No call to the daemon is required CTX->file_map()->remove(fd); return 0; diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 406b699ea..b2e8477c3 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -602,7 +602,7 @@ hook_lseek(unsigned int fd, off_t offset, unsigned int whence) { LOG(DEBUG, "{}() called with fd: {}, offset: {}, whence: {}", __func__, fd, offset, whence); - if(CTX->file_map()->exist(fd)) { + if(CTX->file_map()->get(fd)) { auto off_ret = gkfs::syscall::gkfs_lseek( fd, static_cast(offset), whence); if(off_ret > std::numeric_limits::max()) { @@ -649,7 +649,7 @@ hook_dup(unsigned int fd) { LOG(DEBUG, "{}() called with oldfd: {}", __func__, fd); - if(CTX->file_map()->exist(fd)) { + if(CTX->file_map()->get(fd)) { return with_errno(gkfs::syscall::gkfs_dup(fd)); } return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_dup, fd)); @@ -660,7 +660,7 @@ hook_dup2(unsigned int oldfd, unsigned int newfd) { LOG(DEBUG, "{}() called with oldfd: {}, newfd: {}", __func__, oldfd, newfd); - if(CTX->file_map()->exist(oldfd)) { + if(CTX->file_map()->get(oldfd)) { return with_errno(gkfs::syscall::gkfs_dup2(oldfd, newfd)); } return gsl::narrow_cast( @@ -673,7 +673,7 @@ hook_dup3(unsigned int oldfd, unsigned int newfd, int flags) { LOG(DEBUG, "{}() called with oldfd: {}, newfd: {}, flags: {}", __func__, oldfd, newfd, flags); - if(CTX->file_map()->exist(oldfd)) { + if(CTX->file_map()->get(oldfd)) { if(flags & O_CLOEXEC) { LOG(WARNING, "{}() Not supported", __func__); return -ENOTSUP; @@ -693,7 +693,7 @@ hook_getdents(unsigned int fd, struct linux_dirent* dirp, unsigned int count) { return -EFAULT; } - if(CTX->file_map()->exist(fd)) { + if(CTX->file_map()->get(fd)) { return with_errno(gkfs::syscall::gkfs_getdents(fd, dirp, count)); } return syscall_no_intercept_wrapper(SYS_getdents, fd, dirp, count); @@ -710,7 +710,7 @@ hook_getdents64(unsigned int fd, struct linux_dirent64* dirp, return -EFAULT; } - if(CTX->file_map()->exist(fd)) { + if(CTX->file_map()->get(fd)) { LOG(WARNING, "DEBUG: hook_getdents64 fd {} exists in filemap", fd); return with_errno(gkfs::syscall::gkfs_getdents64(fd, dirp, count)); } @@ -760,7 +760,7 @@ int hook_fchmod(unsigned int fd, mode_t mode) { LOG(DEBUG, "{}() called with fd: {}, mode: {}", __func__, fd, mode); - if(CTX->file_map()->exist(fd)) { + if(CTX->file_map()->get(fd)) { LOG(WARNING, "{}() operation not supported, returning success", __func__); return 0; @@ -827,7 +827,7 @@ hook_fchown(unsigned int fd, uid_t owner, gid_t group) { LOG(DEBUG, "{}() called with fd: {}, owner: {}, group: {}", __func__, fd, owner, group); - if(CTX->file_map()->exist(fd)) { + if(CTX->file_map()->get(fd)) { LOG(WARNING, "{}() operation not supported, returning success", __func__); return 0; @@ -1060,7 +1060,8 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { // cmd, // arg); - if(!CTX->file_map()->exist(fd)) { + auto file = CTX->file_map()->get(fd); + if(!file) { return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_fcntl, fd, cmd, arg)); } @@ -1077,13 +1078,12 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { if(ret == -1) { return -errno; } - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::cloexec, true); + file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, true); return ret; case F_GETFD: LOG(DEBUG, "{}() F_GETFD on fd {}", __func__, fd); - if(CTX->file_map()->get(fd)->get_flag( + if(file->get_flag( gkfs::filemap::OpenFile_flags::cloexec)) { return FD_CLOEXEC; } @@ -1092,15 +1092,15 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { case F_GETFL: LOG(DEBUG, "{}() F_GETFL on fd {}", __func__, fd); ret = 0; - if(CTX->file_map()->get(fd)->get_flag( + if(file->get_flag( gkfs::filemap::OpenFile_flags::rdonly)) { ret |= O_RDONLY; } - if(CTX->file_map()->get(fd)->get_flag( + if(file->get_flag( gkfs::filemap::OpenFile_flags::wronly)) { ret |= O_WRONLY; } - if(CTX->file_map()->get(fd)->get_flag( + if(file->get_flag( gkfs::filemap::OpenFile_flags::rdwr)) { ret |= O_RDWR; } @@ -1109,20 +1109,16 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { LOG(DEBUG, "{}() F_SETFL on fd {}", __func__, fd); // get flags from arg and setup if(arg & O_RDONLY) { - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::rdonly, true); + file->set_flag(gkfs::filemap::OpenFile_flags::rdonly, true); } if(arg & O_WRONLY) { - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::wronly, true); + file->set_flag(gkfs::filemap::OpenFile_flags::wronly, true); } if(arg & O_RDWR) { - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::rdwr, true); + file->set_flag(gkfs::filemap::OpenFile_flags::rdwr, true); } if(arg & O_APPEND) { - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::append, true); + file->set_flag(gkfs::filemap::OpenFile_flags::append, true); } if(arg & O_NONBLOCK) { LOG(DEBUG, "[GKFS] F_SETFL {} NONBLOCK", fd); @@ -1132,16 +1128,15 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { } if(arg & O_CLOEXEC) { LOG(DEBUG, "[GKFS] F_SETFL {} CLOEXEC", fd); - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::cloexec, true); + file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, true); } return 0; case F_SETFD: LOG(DEBUG, "{}() [fd: {}, cmd: F_SETFD, FD_CLOEXEC: {}]", __func__, fd, (arg & FD_CLOEXEC)); - CTX->file_map()->get(fd)->set_flag( - gkfs::filemap::OpenFile_flags::cloexec, (arg & FD_CLOEXEC)); + file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, + (arg & FD_CLOEXEC)); return 0; default: LOG(ERROR, "{}() unrecognized command {} on fd {}", __func__, cmd, -- GitLab From 0df5e947ce4457cddc34bdac3e9862456fed9fdb Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sun, 1 Mar 2026 22:54:26 +0100 Subject: [PATCH 52/68] fix --- include/client/gkfs_functions.hpp | 3 +++ src/client/gkfs_functions.cpp | 15 +++++++++------ src/client/gkfs_metadata.cpp | 26 ++++++++++++++++++++++---- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index 6ed26beb2..f7ee815dc 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -170,6 +170,9 @@ gkfs_has_deferred_trunc(const std::string& path); int gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file); +int +gkfs_publish_deferred_trunc_for_path(const std::string& path); + ssize_t gkfs_read(int fd, void* buf, size_t count); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index ccdce441d..25610b0d5 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -363,6 +363,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, auto& mmap_set = get_mmap_set(); size_t flushed_segments = 0; const long page_size = ::sysconf(_SC_PAGESIZE); + bool trunc_published = false; LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", __func__, path, tracked_segments_for_path(path), read_begin, read_end); @@ -390,12 +391,14 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", __func__, path, addr, len, off); - if(gkfs_has_deferred_trunc(path)) { - LOG(DEBUG, - "{}() deferring read-triggered flush path '{}' while truncate is pending", - __func__, path); - ++it; - continue; + if(!trunc_published && gkfs_has_deferred_trunc(path)) { + // Defer-truncation must be published before the first mmap-backed + // write publication to preserve O_TRUNC semantics without dropping + // concurrent reader visibility. + if(gkfs_publish_deferred_trunc_for_path(path) < 0) { + return -1; + } + trunc_published = true; } // Avoid raw page dereferences here: stale segments can exist when diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 9f53b7950..648e87007 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -185,6 +185,12 @@ gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file) { return 0; } const auto path = file.path(); + if(!gkfs_has_deferred_trunc(path)) { + // Truncation was already published via another publication path + // (e.g., mmap read-triggered flush). Avoid truncating again. + file.set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, false); + return 0; + } auto md = gkfs::utils::get_metadata(path); if(md && gkfs_truncate(path, md->size(), 0)) { return -1; @@ -194,6 +200,19 @@ gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file) { return 0; } +int +gkfs_publish_deferred_trunc_for_path(const std::string& path) { + if(!gkfs_has_deferred_trunc(path)) { + return 0; + } + auto md = gkfs::utils::get_metadata(path); + if(md && gkfs_truncate(path, md->size(), 0)) { + return -1; + } + gkfs_unregister_deferred_trunc(path); + return 0; +} + /** * @brief generate_lock_file @@ -1875,6 +1894,9 @@ gkfs_close(unsigned int fd) { auto file = CTX->file_map()->get(fd); if(file) { const auto path = file->path(); + if(gkfs_publish_deferred_trunc(*file) < 0) { + return -1; + } if(file->type() == gkfs::filemap::FileType::regular) { // Reuse the same write-intent predicate as mmap registration to // keep close-time and read-triggered flush behavior consistent. @@ -1895,10 +1917,6 @@ gkfs_close(unsigned int fd) { false); } - if(gkfs_publish_deferred_trunc(*file) < 0) { - return -1; - } - // flush write size cache to be server consistent if(CTX->use_write_size_cache() && CTX->write_size_cache()) { auto err = CTX->write_size_cache()->flush(path, true).first; -- GitLab From 4b0b64187a6025752f41194e3fa2d1921540e085 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Sun, 1 Mar 2026 22:59:50 +0100 Subject: [PATCH 53/68] fix --- src/client/gkfs_functions.cpp | 39 +++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 25610b0d5..7af17b0c1 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -211,7 +211,7 @@ open_self_mem_fd() { int flush_range_chunked(const std::string& path, const void* addr, size_t len, - off_t off, flush_mode mode) { + off_t off, flush_mode mode, size_t seeded_bytes = 0) { const auto* base = static_cast(addr); size_t total = 0; std::vector bounce(k_mmap_flush_chunk_size); @@ -262,16 +262,29 @@ flush_range_chunked(const std::string& path, const void* addr, size_t len, return k_flush_unavailable; } - // Read-triggered publication is best-effort and should not overwrite - // already-visible non-zero headers with a stale all-zero snapshot. if(mode == flush_mode::read_triggered && chunk_off == 0 && - !has_non_zero_prefix(bounce.data(), copied) && - remote_prefix_is_non_zero(path)) { - LOG(DEBUG, - "{}() skipping stale zero-prefix publication path '{}' off {} len {}", - __func__, path, chunk_off, copied); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return k_flush_deferred; + !has_non_zero_prefix(bounce.data(), copied)) { + // Read-triggered publication must not synthesize a brand-new + // all-zero header at file offset 0 from an mmap view that was never + // seeded from existing file bytes. That can race with concurrent + // writers and make readers observe a permanent invalid magic. + if(seeded_bytes < 8) { + LOG(DEBUG, + "{}() deferring unseeded zero-prefix publication path '{}' off {} len {} seeded {}", + __func__, path, chunk_off, copied, seeded_bytes); + (void) ::syscall_no_intercept(SYS_close, mem_fd); + return k_flush_deferred; + } + + // Also avoid clobbering an already-visible non-zero header with a + // stale all-zero snapshot. + if(remote_prefix_is_non_zero(path)) { + LOG(DEBUG, + "{}() skipping stale zero-prefix publication path '{}' off {} len {}", + __func__, path, chunk_off, copied); + (void) ::syscall_no_intercept(SYS_close, mem_fd); + return k_flush_deferred; + } } auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), @@ -427,9 +440,9 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, continue; } - const int flush_rc = - flush_range_chunked(path, addr, len, off, - flush_mode::read_triggered); + const int flush_rc = flush_range_chunked( + path, addr, len, off, flush_mode::read_triggered, + entry.seeded_bytes); LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", __func__, path, addr, len, off, flush_rc); if(flush_rc == k_flush_unavailable) { -- GitLab From 9f2e1c2bb80b6acbc9bf6e83adda98b7243edcb6 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 11:04:30 +0100 Subject: [PATCH 54/68] locks3d --- src/client/hooks.cpp | 219 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 188 insertions(+), 31 deletions(-) diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index b2e8477c3..cbff3fd44 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -48,6 +48,8 @@ #include #include +#include +#include #include @@ -67,6 +69,30 @@ with_errno(T ret) { return (ret < 0) ? -errno : ret; } +int +get_lock_shim_fd(const std::string& path) { + static std::mutex shim_mutex; + static std::unordered_map shim_fds; + + std::lock_guard lock(shim_mutex); + const auto it = shim_fds.find(path); + if(it != shim_fds.end()) { + return it->second; + } + + const auto shim_name = + std::string{"/tmp/gkfs_fcntl_lock_"} + + std::to_string(std::hash{}(path)); + + const auto fd = gsl::narrow_cast(syscall_no_intercept_wrapper( + SYS_openat, AT_FDCWD, shim_name.c_str(), O_RDWR | O_CREAT, 0600)); + if(fd < 0) { + return -1; + } + shim_fds.emplace(path, fd); + return fd; +} + } // namespace namespace gkfs::hook { @@ -119,12 +145,24 @@ hook_close(int fd) { LOG(DEBUG, "{}() called with fd: {}", __func__, fd); + // Prefer closing a real kernel fd first. This prevents stale GKFS fd-map + // entries from hijacking close() after descriptor reuse. + const auto kret = syscall_no_intercept_wrapper(SYS_close, fd); + if(kret == 0) { + // Best-effort stale-entry cleanup if this fd was still tracked. + (void) gkfs::syscall::gkfs_close(fd); + return 0; + } + if(errno != EBADF) { + return gsl::narrow_cast(kret); + } + auto ret = gkfs::syscall::gkfs_close(fd); if(ret == 0) return 0; - return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_close, fd)); + return gsl::narrow_cast(kret); } #ifdef SYS_stat int @@ -144,7 +182,7 @@ hook_stat(const char* path, struct stat* buf) { } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_stat, rel_path.c_str(), buf)); + syscall_no_intercept_wrapper(SYS_stat, path, buf)); } #endif @@ -203,7 +241,7 @@ hook_lstat(const char* path, struct stat* buf) { return with_errno(gkfs::syscall::gkfs_stat(rel_path, buf, false)); } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_lstat, rel_path.c_str(), buf)); + syscall_no_intercept_wrapper(SYS_lstat, path, buf)); } #endif @@ -525,7 +563,7 @@ hook_access(const char* path, int mask) { return ret; } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_access, rel_path.c_str(), mask)); + syscall_no_intercept_wrapper(SYS_access, path, mask)); } #endif @@ -607,11 +645,12 @@ hook_lseek(unsigned int fd, off_t offset, unsigned int whence) { fd, static_cast(offset), whence); if(off_ret > std::numeric_limits::max()) { return -EOVERFLOW; - } else if(off_ret < 0) { + } else if(off_ret < 0 && errno != EBADF) { return -errno; + } else if(off_ret >= 0) { + LOG(DEBUG, "{}() returning {}", __func__, off_ret); + return off_ret; } - LOG(DEBUG, "{}() returning {}", __func__, off_ret); - return off_ret; } return syscall_no_intercept_wrapper(SYS_lseek, fd, offset, whence); } @@ -628,7 +667,7 @@ hook_truncate(const char* path, long length) { return with_errno(gkfs::syscall::gkfs_truncate(rel_path, length)); } return gsl::narrow_cast(syscall_no_intercept_wrapper( - SYS_truncate, rel_path.c_str(), length)); + SYS_truncate, path, length)); } int @@ -638,7 +677,10 @@ hook_ftruncate(unsigned int fd, unsigned long length) { if(auto file = CTX->file_map()->get(fd)) { auto path = file->path(); - return with_errno(gkfs::syscall::gkfs_truncate(path, length)); + auto rv = gkfs::syscall::gkfs_truncate(path, length); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_ftruncate, fd, length)); @@ -650,7 +692,10 @@ hook_dup(unsigned int fd) { LOG(DEBUG, "{}() called with oldfd: {}", __func__, fd); if(CTX->file_map()->get(fd)) { - return with_errno(gkfs::syscall::gkfs_dup(fd)); + auto rv = gkfs::syscall::gkfs_dup(fd); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_dup, fd)); } @@ -661,7 +706,10 @@ hook_dup2(unsigned int oldfd, unsigned int newfd) { LOG(DEBUG, "{}() called with oldfd: {}, newfd: {}", __func__, oldfd, newfd); if(CTX->file_map()->get(oldfd)) { - return with_errno(gkfs::syscall::gkfs_dup2(oldfd, newfd)); + auto rv = gkfs::syscall::gkfs_dup2(oldfd, newfd); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_dup2, oldfd, newfd)); @@ -677,8 +725,12 @@ hook_dup3(unsigned int oldfd, unsigned int newfd, int flags) { if(flags & O_CLOEXEC) { LOG(WARNING, "{}() Not supported", __func__); return -ENOTSUP; - } else - return with_errno(gkfs::syscall::gkfs_dup2(oldfd, newfd)); + } else { + auto rv = gkfs::syscall::gkfs_dup2(oldfd, newfd); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } + } } return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_dup3, oldfd, newfd, flags)); @@ -694,7 +746,10 @@ hook_getdents(unsigned int fd, struct linux_dirent* dirp, unsigned int count) { } if(CTX->file_map()->get(fd)) { - return with_errno(gkfs::syscall::gkfs_getdents(fd, dirp, count)); + auto rv = gkfs::syscall::gkfs_getdents(fd, dirp, count); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return syscall_no_intercept_wrapper(SYS_getdents, fd, dirp, count); } @@ -711,10 +766,11 @@ hook_getdents64(unsigned int fd, struct linux_dirent64* dirp, } if(CTX->file_map()->get(fd)) { - LOG(WARNING, "DEBUG: hook_getdents64 fd {} exists in filemap", fd); - return with_errno(gkfs::syscall::gkfs_getdents64(fd, dirp, count)); + auto rv = gkfs::syscall::gkfs_getdents64(fd, dirp, count); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } - LOG(WARNING, "DEBUG: hook_getdents64 fd {} does NOT exist in filemap", fd); return syscall_no_intercept_wrapper(SYS_getdents64, fd, dirp, count); } @@ -760,12 +816,17 @@ int hook_fchmod(unsigned int fd, mode_t mode) { LOG(DEBUG, "{}() called with fd: {}, mode: {}", __func__, fd, mode); + auto rv = syscall_no_intercept_wrapper(SYS_fchmod, fd, mode); + if(rv >= 0 || errno != EBADF) { + return rv; + } + if(CTX->file_map()->get(fd)) { LOG(WARNING, "{}() operation not supported, returning success", __func__); return 0; } - return syscall_no_intercept_wrapper(SYS_fchmod, fd, mode); + return rv; } @@ -827,12 +888,17 @@ hook_fchown(unsigned int fd, uid_t owner, gid_t group) { LOG(DEBUG, "{}() called with fd: {}, owner: {}, group: {}", __func__, fd, owner, group); + auto rv = syscall_no_intercept_wrapper(SYS_fchown, fd, owner, group); + if(rv >= 0 || errno != EBADF) { + return rv; + } + if(CTX->file_map()->get(fd)) { LOG(WARNING, "{}() operation not supported, returning success", __func__); return 0; } - return syscall_no_intercept_wrapper(SYS_fchown, fd, owner, group); + return rv; } int @@ -1060,10 +1126,14 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { // cmd, // arg); + auto kernel_ret = syscall_no_intercept_wrapper(SYS_fcntl, fd, cmd, arg); + if(kernel_ret != -1 || errno != EBADF) { + return gsl::narrow_cast(kernel_ret); + } + auto file = CTX->file_map()->get(fd); if(!file) { - return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_fcntl, fd, cmd, arg)); + return gsl::narrow_cast(kernel_ret); } int ret; switch(cmd) { @@ -1138,6 +1208,74 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, (arg & FD_CLOEXEC)); return 0; + + case F_GETLK: { + auto* lk = reinterpret_cast(arg); + if(lk == nullptr) { + return -EINVAL; + } + + const auto lock_fd = get_lock_shim_fd(file->path()); + if(lock_fd < 0) { + return -errno; + } + + const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, + lock_fd, cmd, + arg); + return with_errno(gsl::narrow_cast(lock_ret)); + } + + case F_SETLK: + case F_SETLKW: { + const auto lock_fd = get_lock_shim_fd(file->path()); + if(lock_fd < 0) { + return -errno; + } + + const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, + lock_fd, cmd, + arg); + return with_errno(gsl::narrow_cast(lock_ret)); + } + +#if defined(F_GETLK64) && (!defined(F_GETLK) || (F_GETLK64 != F_GETLK)) + case F_GETLK64: { + auto* lk = reinterpret_cast(arg); + if(lk == nullptr) { + return -EINVAL; + } + + const auto lock_fd = get_lock_shim_fd(file->path()); + if(lock_fd < 0) { + return -errno; + } + + const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, + lock_fd, cmd, + arg); + return with_errno(gsl::narrow_cast(lock_ret)); + } +#endif + +#if defined(F_SETLK64) && (!defined(F_SETLK) || (F_SETLK64 != F_SETLK)) + case F_SETLK64: +#endif +#if defined(F_SETLKW64) && (!defined(F_SETLKW) || (F_SETLKW64 != F_SETLKW)) + case F_SETLKW64: +#endif + { + const auto lock_fd = get_lock_shim_fd(file->path()); + if(lock_fd < 0) { + return -errno; + } + + const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, + lock_fd, cmd, + arg); + return with_errno(gsl::narrow_cast(lock_ret)); + } + default: LOG(ERROR, "{}() unrecognized command {} on fd {}", __func__, cmd, fd); @@ -1229,10 +1367,15 @@ hook_statfs(const char* path, struct statfs* buf) { std::string rel_path; if(CTX->relativize_path(path, rel_path)) { + // Keep POSIX statfs semantics for internal paths: the referenced path + // must exist. MPI-IO coordination logic relies on ENOENT here. + if(rel_path != "/" && !gkfs::utils::get_metadata(rel_path, false)) { + return -ENOENT; + } return with_errno(gkfs::syscall::gkfs_statfs(buf)); } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_statfs, rel_path.c_str(), buf)); + syscall_no_intercept_wrapper(SYS_statfs, path, buf)); } int @@ -1243,11 +1386,15 @@ hook_fstatfs(unsigned int fd, struct statfs* buf) { return -EFAULT; } - if(CTX->file_map()->exist(fd)) { + auto rv = syscall_no_intercept_wrapper(SYS_fstatfs, fd, buf); + if(rv >= 0 || errno != EBADF) { + return rv; + } + + if(CTX->file_map()->get(fd)) { return with_errno(gkfs::syscall::gkfs_statfs(buf)); } - return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_fstatfs, fd, buf)); + return rv; } /* The function should broadcast a flush message (pmem_persist i.e.) if the @@ -1256,8 +1403,11 @@ int hook_fsync(unsigned int fd) { LOG(DEBUG, "{}() called with fd: {}", __func__, fd); - if(CTX->file_map()->exist(fd)) { - return with_errno(gkfs::syscall::gkfs_fsync(fd)); + if(CTX->file_map()->get(fd)) { + auto rv = gkfs::syscall::gkfs_fsync(fd); + if(rv >= 0 || errno != EBADF) { + return with_errno(rv); + } } return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_fsync, fd)); @@ -1365,10 +1515,17 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); - // Single-lookup check to avoid TOCTTOU races with concurrent close/reuse. - // If the descriptor is no longer tracked, fall back to the real syscall. - if(CTX->file_map()->get(fd)) { - return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); + // If fd is tracked as GKFS, verify that it is not a reused kernel fd. + if(auto file = CTX->file_map()->get(fd)) { + struct stat st {}; + const auto kret = syscall_no_intercept_wrapper(SYS_fstat, fd, &st); + if(kret == -1 && errno == EBADF) { + return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, + offset); + } + LOG(DEBUG, + "{}() detected kernel-backed fd reuse for fd {} (tracked path '{}'), bypassing GKFS mmap", + __func__, fd, file->path()); } return reinterpret_cast(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); -- GitLab From 6860f8da861147f0d8f2f2d2d81c94a30747142b Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 19:20:23 +0100 Subject: [PATCH 55/68] fix s3d and flex --- src/client/gkfs_data.cpp | 13 +- src/client/gkfs_functions.cpp | 239 +++++++++++++++--------- src/client/gkfs_libc.cpp | 15 +- src/client/gkfs_metadata.cpp | 5 +- src/client/hooks.cpp | 334 +++++++++------------------------- 5 files changed, 244 insertions(+), 362 deletions(-) diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 0f651a3e7..5b1aea829 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -97,8 +97,9 @@ is_suspicious_zero_prefix(const void* buf, long read_ret, off64_t offset, } bool -should_retry_read_after_flush(const gkfs::filemap::OpenFile& file, long read_ret, - const void* buf, off64_t offset, size_t count) { +should_retry_read_after_flush(const gkfs::filemap::OpenFile& file, + long read_ret, const void* buf, off64_t offset, + size_t count) { return gkfs_mmap_has_active_write_path(file.path()) && is_suspicious_zero_prefix(buf, read_ret, offset, count); } @@ -248,8 +249,8 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } if(is_append) { - auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, migrated); + auto ret_offset = gkfs::utils::update_file_size(*path, count, offset, + is_append, migrated); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", err); @@ -301,8 +302,8 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } } else if(!is_append) { - auto ret_offset = gkfs::utils::update_file_size( - *path, count, offset, is_append, migrated); + auto ret_offset = gkfs::utils::update_file_size(*path, count, offset, + is_append, migrated); err = ret_offset.first; if(err) { LOG(ERROR, "update_metadentry_size() failed with err '{}'", err); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 7af17b0c1..a9dc4138f 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -193,9 +193,8 @@ bool remote_prefix_is_non_zero(const std::string& path) { std::array remote_prefix{}; std::set failed; - auto [rerr, rsize] = - gkfs::rpc::forward_read(path, remote_prefix.data(), 0, - remote_prefix.size(), 0, failed); + auto [rerr, rsize] = gkfs::rpc::forward_read( + path, remote_prefix.data(), 0, remote_prefix.size(), 0, failed); if(rerr != 0 || rsize <= 0) { return false; } @@ -340,11 +339,10 @@ bool gkfs_mmap_has_active_write_path(const std::string& path) { std::lock_guard lock(get_mmap_set_mutex()); const auto& mmap_set = get_mmap_set(); - return std::any_of(mmap_set.begin(), mmap_set.end(), - [&path](const auto& entry) { - return entry.path == path && - should_flush_mapping(entry); - }); + return std::any_of( + mmap_set.begin(), mmap_set.end(), [&path](const auto& entry) { + return entry.path == path && should_flush_mapping(entry); + }); } /** @@ -372,107 +370,161 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, return -1; } - std::lock_guard lock(get_mmap_set_mutex()); - auto& mmap_set = get_mmap_set(); - size_t flushed_segments = 0; - const long page_size = ::sysconf(_SC_PAGESIZE); - bool trunc_published = false; + struct flush_candidate { + void* addr; + size_t len; + off_t off; + size_t seeded_bytes; + void* map_base; + size_t map_len; + off_t map_off; + }; - LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", - __func__, path, tracked_segments_for_path(path), read_begin, read_end); + std::vector candidates; + { + std::lock_guard lock(get_mmap_set_mutex()); + LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", + __func__, path, tracked_segments_for_path(path), read_begin, + read_end); - for(auto it = mmap_set.begin(); it != mmap_set.end();) { - const auto& entry = *it; - if(entry.path != path || !should_flush_mapping(entry)) { - ++it; - continue; - } + for(const auto& entry : get_mmap_set()) { + if(entry.path != path || !should_flush_mapping(entry)) { + continue; + } - const auto entry_begin = static_cast(entry.offset); - const auto entry_end = entry_begin + static_cast(entry.length); - if(entry_end <= read_begin || read_end <= entry_begin) { - ++it; - continue; + const auto entry_begin = static_cast(entry.offset); + const auto entry_end = + entry_begin + static_cast(entry.length); + if(entry_end <= read_begin || read_end <= entry_begin) { + continue; + } + + const auto flush_begin = std::max(entry_begin, read_begin); + const auto flush_end = std::min(entry_end, read_end); + const auto rel_begin = + static_cast(flush_begin - entry_begin); + const size_t len = static_cast(flush_end - flush_begin); + candidates.push_back(flush_candidate{ + addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin), len, + static_cast(flush_begin), entry.seeded_bytes, + entry.addr, entry.length, entry.offset}); } + } - const auto flush_begin = std::max(entry_begin, read_begin); - const auto flush_end = std::min(entry_end, read_end); - const auto rel_begin = static_cast(flush_begin - entry_begin); - const size_t len = static_cast(flush_end - flush_begin); - void* addr = addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin); - off_t off = static_cast(flush_begin); - LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", - __func__, path, addr, len, off); + if(candidates.empty()) { + LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, 0); + return 0; + } - if(!trunc_published && gkfs_has_deferred_trunc(path)) { - // Defer-truncation must be published before the first mmap-backed - // write publication to preserve O_TRUNC semantics without dropping - // concurrent reader visibility. - if(gkfs_publish_deferred_trunc_for_path(path) < 0) { - return -1; - } - trunc_published = true; - } + // Publish deferred truncation once before first mmap-backed write + // publication. This keeps O_TRUNC ordering consistent while avoiding + // holding the mmap-tracking lock across RPC. + if(gkfs_has_deferred_trunc(path) && + gkfs_publish_deferred_trunc_for_path(path) < 0) { + return -1; + } - // Avoid raw page dereferences here: stale segments can exist when - // munmap happens outside the interceptor path. mincore() lets us detect - // unmapped pages safely and drop stale tracking entries. - bool mapped = true; + const long page_size = ::sysconf(_SC_PAGESIZE); + size_t flushed_segments = 0; + for(const auto& c : candidates) { + LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", + __func__, path, c.addr, c.len, c.off); + + bool definitely_unmapped = false; if(page_size > 0) { - const auto start = ptr_to_addr(addr); - const auto end = start + len; + const auto page_size_u = static_cast(page_size); + const auto start = ptr_to_addr(c.addr); + const auto end = start + c.len; + const auto aligned_start = start & ~(page_size_u - 1U); + const auto aligned_end = + (end + page_size_u - 1U) & ~(page_size_u - 1U); unsigned char mincore_vec = 0; - for(auto p = start; p < end; - p += static_cast(page_size)) { - if(::mincore(addr_to_ptr(p), static_cast(page_size), + for(auto p = aligned_start; p < aligned_end; p += page_size_u) { + if(::mincore(addr_to_ptr(p), static_cast(page_size_u), &mincore_vec) != 0) { - mapped = false; + if(errno == ENOMEM) { + definitely_unmapped = true; + } else { + LOG(DEBUG, + "{}() mincore indeterminate for path '{}' addr {} errno {}, continuing with self-mem copy", + __func__, path, addr_to_ptr(p), errno); + } break; } } } - if(!mapped) { - LOG(WARNING, - "{}() dropping stale mmap segment path '{}' addr {} len {}", - __func__, path, addr, len); - it = mmap_set.erase(it); + if(definitely_unmapped) { + std::lock_guard lock(get_mmap_set_mutex()); + auto& mmap_set = get_mmap_set(); + auto it = std::find_if( + mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { + return entry.path == path && entry.addr == c.map_base && + entry.length == c.map_len && + entry.offset == c.map_off; + }); + if(it != mmap_set.end()) { + LOG(WARNING, + "{}() dropping stale mmap segment path '{}' addr {} len {}", + __func__, path, c.addr, c.len); + mmap_set.erase(it); + } continue; } - const int flush_rc = flush_range_chunked( - path, addr, len, off, flush_mode::read_triggered, - entry.seeded_bytes); + const int flush_rc = + flush_range_chunked(path, c.addr, c.len, c.off, + flush_mode::read_triggered, c.seeded_bytes); LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", - __func__, path, addr, len, off, flush_rc); + __func__, path, c.addr, c.len, c.off, flush_rc); if(flush_rc == k_flush_unavailable) { - // Mapping raced with unmap; treat as best-effort flush miss and - // continue without surfacing EIO. - LOG(DEBUG, - "{}() dropping unavailable overlap path '{}' addr {} len {} off {}", - __func__, path, addr, len, off); - it = mmap_set.erase(it); + std::lock_guard lock(get_mmap_set_mutex()); + auto& mmap_set = get_mmap_set(); + auto it = std::find_if( + mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { + return entry.path == path && entry.addr == c.map_base && + entry.length == c.map_len && + entry.offset == c.map_off; + }); + if(it != mmap_set.end()) { + LOG(DEBUG, + "{}() dropping unavailable overlap path '{}' addr {} len {} off {}", + __func__, path, c.addr, c.len, c.off); + mmap_set.erase(it); + } continue; } if(flush_rc == k_flush_deferred) { - ++it; continue; } if(flush_rc < 0) { LOG(WARNING, "{}() chunked flush failed for path '{}' (addr {} len {} off {})", - __func__, path, addr, len, off); + __func__, path, c.addr, c.len, c.off); errno = EIO; return -1; } - const auto next_epoch = flush_epoch_counter().fetch_add(1) + 1; - auto updated = entry; - updated.flush_epoch = next_epoch; - it = mmap_set.erase(it); - it = mmap_set.insert(updated).first; + + { + std::lock_guard lock(get_mmap_set_mutex()); + auto& mmap_set = get_mmap_set(); + auto it = std::find_if( + mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { + return entry.path == path && entry.addr == c.map_base && + entry.length == c.map_len && + entry.offset == c.map_off; + }); + if(it != mmap_set.end()) { + auto updated = *it; + const auto next_epoch = flush_epoch_counter().fetch_add(1) + 1; + updated.flush_epoch = next_epoch; + mmap_set.erase(it); + mmap_set.insert(std::move(updated)); + } + } ++flushed_segments; - ++it; } + LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, flushed_segments); return 0; @@ -480,14 +532,18 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, int gkfs_mmap_flush_all_for_path(const std::string& path) { - std::lock_guard lock(get_mmap_set_mutex()); - size_t flushed_segments = 0; - - for(const auto& entry : get_mmap_set()) { - if(entry.path != path || !should_flush_mapping(entry)) { - continue; + std::vector entries; + { + std::lock_guard lock(get_mmap_set_mutex()); + for(const auto& entry : get_mmap_set()) { + if(entry.path == path && should_flush_mapping(entry)) { + entries.push_back(entry); + } } + } + size_t flushed_segments = 0; + for(const auto& entry : entries) { const int flush_rc = flush_range_chunked(path, entry.addr, entry.length, entry.offset, flush_mode::explicit_sync); @@ -620,9 +676,9 @@ gkfs_msync(void* addr, size_t length, int flags) { entry.fd, entry.path}); } if(flush_ranges.empty()) { - errno = ENODEV; return -1; } + lock.unlock(); for(const auto& range : flush_ranges) { (void) range.fd; @@ -709,10 +765,18 @@ gkfs_munmap(void* addr, size_t length) { } if(!handled) { - errno = ENODEV; return -1; } + // Publish post-unmap interval state before performing expensive I/O/RPC. + // This avoids holding the mmap tracking lock while lower layers may trigger + // their own mmap/munmap activity. + get_mmap_set().clear(); + for(const auto& entry : new_entries) { + get_mmap_set().insert(entry); + } + lock.unlock(); + for(const auto& range : flush_ranges) { (void) range.fd; if(flush_range_chunked(range.path, range.addr, range.length, @@ -725,14 +789,9 @@ gkfs_munmap(void* addr, size_t length) { return -1; } - get_mmap_set().clear(); - for(const auto& entry : new_entries) { - get_mmap_set().insert(entry); - } - LOG(DEBUG, "{}() unmap [{}..{}) updated tracked segments {}", __func__, static_cast(unmap_begin), - static_cast(unmap_end), get_mmap_set().size()); + static_cast(unmap_end), new_entries.size()); return 0; } diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 13d2c42de..c84d57e72 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -2664,11 +2664,11 @@ msync(void* addr, size_t length, int flags) { DEBUG_INFO("[GKFS] msync handled by GekkoFS for addr={}", addr); return 0; } - if(errno != ENODEV) { - // It was a GKFS mapping path but flush/sync failed: do not silently - // bypass to libc, propagate the failure. - return -1; - } + // If gkfs_ret indicates "not my memory" (e.g., -1 with + // errno=ENOMEM/ENODEV), then fallback. Assuming non-zero means GekkoFS + // didn't handle it or errored internally. The original just checked for + // 0. If gkfs_msync sets errno for "not my memory", we might need to + // clear it before fallback. DEBUG_INFO("[BYPASS] msync for addr={}", addr); GKFS_FALLBACK(msync, addr, length, flags); } @@ -2683,11 +2683,6 @@ munmap(void* addr, size_t length) { DEBUG_INFO("[GKFS] munmap handled by GekkoFS for addr={}", addr); return 0; } - if(errno != ENODEV) { - // It was a GKFS mapping path but flush/unmap failed: do not silently - // bypass and lose dirty mmap contents. - return -1; - } DEBUG_INFO("[BYPASS] munmap for addr={}", addr); GKFS_FALLBACK(munmap, addr, length); } diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 648e87007..badb1c849 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -430,9 +430,8 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { } } // RENAMED OR SYMLINK NOT PROTECTED - auto file = - std::make_shared(new_path, - flags); + auto file = std::make_shared(new_path, + flags); if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY)) && should_defer_trunc_for_consistency(new_path, flags)) { diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index cbff3fd44..1cc249830 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -48,8 +48,6 @@ #include #include -#include -#include #include @@ -69,30 +67,6 @@ with_errno(T ret) { return (ret < 0) ? -errno : ret; } -int -get_lock_shim_fd(const std::string& path) { - static std::mutex shim_mutex; - static std::unordered_map shim_fds; - - std::lock_guard lock(shim_mutex); - const auto it = shim_fds.find(path); - if(it != shim_fds.end()) { - return it->second; - } - - const auto shim_name = - std::string{"/tmp/gkfs_fcntl_lock_"} + - std::to_string(std::hash{}(path)); - - const auto fd = gsl::narrow_cast(syscall_no_intercept_wrapper( - SYS_openat, AT_FDCWD, shim_name.c_str(), O_RDWR | O_CREAT, 0600)); - if(fd < 0) { - return -1; - } - shim_fds.emplace(path, fd); - return fd; -} - } // namespace namespace gkfs::hook { @@ -145,24 +119,12 @@ hook_close(int fd) { LOG(DEBUG, "{}() called with fd: {}", __func__, fd); - // Prefer closing a real kernel fd first. This prevents stale GKFS fd-map - // entries from hijacking close() after descriptor reuse. - const auto kret = syscall_no_intercept_wrapper(SYS_close, fd); - if(kret == 0) { - // Best-effort stale-entry cleanup if this fd was still tracked. - (void) gkfs::syscall::gkfs_close(fd); - return 0; - } - if(errno != EBADF) { - return gsl::narrow_cast(kret); - } - auto ret = gkfs::syscall::gkfs_close(fd); if(ret == 0) return 0; - return gsl::narrow_cast(kret); + return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_close, fd)); } #ifdef SYS_stat int @@ -182,7 +144,7 @@ hook_stat(const char* path, struct stat* buf) { } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_stat, path, buf)); + syscall_no_intercept_wrapper(SYS_stat, rel_path.c_str(), buf)); } #endif @@ -241,7 +203,7 @@ hook_lstat(const char* path, struct stat* buf) { return with_errno(gkfs::syscall::gkfs_stat(rel_path, buf, false)); } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_lstat, path, buf)); + syscall_no_intercept_wrapper(SYS_lstat, rel_path.c_str(), buf)); } #endif @@ -315,11 +277,10 @@ hook_read(unsigned int fd, void* buf, size_t count) { return -EFAULT; } - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_read(fd, buf, count); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(auto file = CTX->file_map()->get(fd)) { + LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, + file->path()); + return with_errno(gkfs::syscall::gkfs_read(fd, buf, count)); } return syscall_no_intercept_wrapper(SYS_read, fd, buf, count); } @@ -330,11 +291,8 @@ hook_pread(unsigned int fd, char* buf, size_t count, loff_t pos) { LOG(DEBUG, "{}() called with fd: {}, buf: {}, count: {}, pos: {}", __func__, fd, fmt::ptr(buf), count, pos); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_pread(fd, buf, count, pos); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_pread(fd, buf, count, pos)); } /* Since kernel 2.6: pread() became pread64(), and pwrite() became * pwrite64(). */ @@ -347,11 +305,8 @@ hook_readv(unsigned long fd, const struct iovec* iov, unsigned long iovcnt) { LOG(DEBUG, "{}() called with fd: {}, iov: {}, iovcnt: {}", __func__, fd, fmt::ptr(iov), iovcnt); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_readv(fd, iov, iovcnt); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_readv(fd, iov, iovcnt)); } return syscall_no_intercept_wrapper(SYS_readv, fd, iov, iovcnt); } @@ -366,11 +321,8 @@ hook_preadv(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, // "pos_h: {}", // __func__, fd, fmt::ptr(iov), iovcnt, pos_l, pos_h); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_preadv(fd, iov, iovcnt, pos_l); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_preadv(fd, iov, iovcnt, pos_l)); } return syscall_no_intercept_wrapper(SYS_preadv, fd, iov, iovcnt, pos_l); } @@ -384,11 +336,10 @@ hook_write(unsigned int fd, const char* buf, size_t count) { return -EFAULT; } - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_write(fd, buf, count); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(auto file = CTX->file_map()->get(fd)) { + LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, + file->path()); + return with_errno(gkfs::syscall::gkfs_write(fd, buf, count)); } return syscall_no_intercept_wrapper(SYS_write, fd, buf, count); } @@ -399,11 +350,8 @@ hook_pwrite(unsigned int fd, const char* buf, size_t count, loff_t pos) { LOG(DEBUG, "{}() called with fd: {}, buf: {}, count: {}, pos: {}", __func__, fd, fmt::ptr(buf), count, pos); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_pwrite(fd, buf, count, pos); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_pwrite(fd, buf, count, pos)); } /* Since kernel 2.6: pread() became pread64(), and pwrite() became * pwrite64(). */ @@ -416,11 +364,8 @@ hook_writev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt) { LOG(DEBUG, "{}() called with fd: {}, iov: {}, iovcnt: {}", __func__, fd, fmt::ptr(iov), iovcnt); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_writev(fd, iov, iovcnt); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_writev(fd, iov, iovcnt)); } return syscall_no_intercept_wrapper(SYS_writev, fd, iov, iovcnt); } @@ -435,11 +380,8 @@ hook_pwritev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, "pos_h: {}", __func__, fd, fmt::ptr(iov), iovcnt, pos_l, pos_h); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_pwritev(fd, iov, iovcnt, pos_l); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_pwritev(fd, iov, iovcnt, pos_l)); } return syscall_no_intercept_wrapper(SYS_pwritev, fd, iov, iovcnt, pos_l); } @@ -563,7 +505,7 @@ hook_access(const char* path, int mask) { return ret; } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_access, path, mask)); + syscall_no_intercept_wrapper(SYS_access, rel_path.c_str(), mask)); } #endif @@ -640,17 +582,16 @@ hook_lseek(unsigned int fd, off_t offset, unsigned int whence) { LOG(DEBUG, "{}() called with fd: {}, offset: {}, whence: {}", __func__, fd, offset, whence); - if(CTX->file_map()->get(fd)) { + if(CTX->file_map()->exist(fd)) { auto off_ret = gkfs::syscall::gkfs_lseek( fd, static_cast(offset), whence); if(off_ret > std::numeric_limits::max()) { return -EOVERFLOW; - } else if(off_ret < 0 && errno != EBADF) { + } else if(off_ret < 0) { return -errno; - } else if(off_ret >= 0) { - LOG(DEBUG, "{}() returning {}", __func__, off_ret); - return off_ret; } + LOG(DEBUG, "{}() returning {}", __func__, off_ret); + return off_ret; } return syscall_no_intercept_wrapper(SYS_lseek, fd, offset, whence); } @@ -667,7 +608,7 @@ hook_truncate(const char* path, long length) { return with_errno(gkfs::syscall::gkfs_truncate(rel_path, length)); } return gsl::narrow_cast(syscall_no_intercept_wrapper( - SYS_truncate, path, length)); + SYS_truncate, rel_path.c_str(), length)); } int @@ -675,12 +616,9 @@ hook_ftruncate(unsigned int fd, unsigned long length) { LOG(DEBUG, "{}() called with fd: {}, offset: {}", __func__, fd, length); - if(auto file = CTX->file_map()->get(fd)) { - auto path = file->path(); - auto rv = gkfs::syscall::gkfs_truncate(path, length); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + auto path = CTX->file_map()->get(fd)->path(); + return with_errno(gkfs::syscall::gkfs_truncate(path, length)); } return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_ftruncate, fd, length)); @@ -691,11 +629,8 @@ hook_dup(unsigned int fd) { LOG(DEBUG, "{}() called with oldfd: {}", __func__, fd); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_dup(fd); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_dup(fd)); } return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_dup, fd)); } @@ -705,11 +640,8 @@ hook_dup2(unsigned int oldfd, unsigned int newfd) { LOG(DEBUG, "{}() called with oldfd: {}, newfd: {}", __func__, oldfd, newfd); - if(CTX->file_map()->get(oldfd)) { - auto rv = gkfs::syscall::gkfs_dup2(oldfd, newfd); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(oldfd)) { + return with_errno(gkfs::syscall::gkfs_dup2(oldfd, newfd)); } return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_dup2, oldfd, newfd)); @@ -721,16 +653,12 @@ hook_dup3(unsigned int oldfd, unsigned int newfd, int flags) { LOG(DEBUG, "{}() called with oldfd: {}, newfd: {}, flags: {}", __func__, oldfd, newfd, flags); - if(CTX->file_map()->get(oldfd)) { + if(CTX->file_map()->exist(oldfd)) { if(flags & O_CLOEXEC) { LOG(WARNING, "{}() Not supported", __func__); return -ENOTSUP; - } else { - auto rv = gkfs::syscall::gkfs_dup2(oldfd, newfd); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } - } + } else + return with_errno(gkfs::syscall::gkfs_dup2(oldfd, newfd)); } return gsl::narrow_cast( syscall_no_intercept_wrapper(SYS_dup3, oldfd, newfd, flags)); @@ -745,11 +673,8 @@ hook_getdents(unsigned int fd, struct linux_dirent* dirp, unsigned int count) { return -EFAULT; } - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_getdents(fd, dirp, count); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_getdents(fd, dirp, count)); } return syscall_no_intercept_wrapper(SYS_getdents, fd, dirp, count); } @@ -765,12 +690,11 @@ hook_getdents64(unsigned int fd, struct linux_dirent64* dirp, return -EFAULT; } - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_getdents64(fd, dirp, count); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + LOG(WARNING, "DEBUG: hook_getdents64 fd {} exists in filemap", fd); + return with_errno(gkfs::syscall::gkfs_getdents64(fd, dirp, count)); } + LOG(WARNING, "DEBUG: hook_getdents64 fd {} does NOT exist in filemap", fd); return syscall_no_intercept_wrapper(SYS_getdents64, fd, dirp, count); } @@ -816,17 +740,12 @@ int hook_fchmod(unsigned int fd, mode_t mode) { LOG(DEBUG, "{}() called with fd: {}, mode: {}", __func__, fd, mode); - auto rv = syscall_no_intercept_wrapper(SYS_fchmod, fd, mode); - if(rv >= 0 || errno != EBADF) { - return rv; - } - - if(CTX->file_map()->get(fd)) { + if(CTX->file_map()->exist(fd)) { LOG(WARNING, "{}() operation not supported, returning success", __func__); return 0; } - return rv; + return syscall_no_intercept_wrapper(SYS_fchmod, fd, mode); } @@ -888,17 +807,12 @@ hook_fchown(unsigned int fd, uid_t owner, gid_t group) { LOG(DEBUG, "{}() called with fd: {}, owner: {}, group: {}", __func__, fd, owner, group); - auto rv = syscall_no_intercept_wrapper(SYS_fchown, fd, owner, group); - if(rv >= 0 || errno != EBADF) { - return rv; - } - - if(CTX->file_map()->get(fd)) { + if(CTX->file_map()->exist(fd)) { LOG(WARNING, "{}() operation not supported, returning success", __func__); return 0; } - return rv; + return syscall_no_intercept_wrapper(SYS_fchown, fd, owner, group); } int @@ -1073,8 +987,8 @@ hook_futimens(unsigned int fd, const struct timespec times[2]) { LOG(DEBUG, "{}() called with fd: {}, times: {}", __func__, fd, fmt::ptr(times)); - if(auto file = CTX->file_map()->get(fd)) { - auto path = file->path(); + if(CTX->file_map()->exist(fd)) { + auto path = CTX->file_map()->get(fd)->path(); return with_errno(gkfs::syscall::gkfs_utimensat(path, times)); } return gsl::narrow_cast( @@ -1126,14 +1040,9 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { // cmd, // arg); - auto kernel_ret = syscall_no_intercept_wrapper(SYS_fcntl, fd, cmd, arg); - if(kernel_ret != -1 || errno != EBADF) { - return gsl::narrow_cast(kernel_ret); - } - - auto file = CTX->file_map()->get(fd); - if(!file) { - return gsl::narrow_cast(kernel_ret); + if(!CTX->file_map()->exist(fd)) { + return gsl::narrow_cast( + syscall_no_intercept_wrapper(SYS_fcntl, fd, cmd, arg)); } int ret; switch(cmd) { @@ -1148,12 +1057,13 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { if(ret == -1) { return -errno; } - file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, true); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::cloexec, true); return ret; case F_GETFD: LOG(DEBUG, "{}() F_GETFD on fd {}", __func__, fd); - if(file->get_flag( + if(CTX->file_map()->get(fd)->get_flag( gkfs::filemap::OpenFile_flags::cloexec)) { return FD_CLOEXEC; } @@ -1162,15 +1072,15 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { case F_GETFL: LOG(DEBUG, "{}() F_GETFL on fd {}", __func__, fd); ret = 0; - if(file->get_flag( + if(CTX->file_map()->get(fd)->get_flag( gkfs::filemap::OpenFile_flags::rdonly)) { ret |= O_RDONLY; } - if(file->get_flag( + if(CTX->file_map()->get(fd)->get_flag( gkfs::filemap::OpenFile_flags::wronly)) { ret |= O_WRONLY; } - if(file->get_flag( + if(CTX->file_map()->get(fd)->get_flag( gkfs::filemap::OpenFile_flags::rdwr)) { ret |= O_RDWR; } @@ -1179,16 +1089,20 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { LOG(DEBUG, "{}() F_SETFL on fd {}", __func__, fd); // get flags from arg and setup if(arg & O_RDONLY) { - file->set_flag(gkfs::filemap::OpenFile_flags::rdonly, true); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::rdonly, true); } if(arg & O_WRONLY) { - file->set_flag(gkfs::filemap::OpenFile_flags::wronly, true); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::wronly, true); } if(arg & O_RDWR) { - file->set_flag(gkfs::filemap::OpenFile_flags::rdwr, true); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::rdwr, true); } if(arg & O_APPEND) { - file->set_flag(gkfs::filemap::OpenFile_flags::append, true); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::append, true); } if(arg & O_NONBLOCK) { LOG(DEBUG, "[GKFS] F_SETFL {} NONBLOCK", fd); @@ -1198,84 +1112,17 @@ hook_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { } if(arg & O_CLOEXEC) { LOG(DEBUG, "[GKFS] F_SETFL {} CLOEXEC", fd); - file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, true); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::cloexec, true); } return 0; case F_SETFD: LOG(DEBUG, "{}() [fd: {}, cmd: F_SETFD, FD_CLOEXEC: {}]", __func__, fd, (arg & FD_CLOEXEC)); - file->set_flag(gkfs::filemap::OpenFile_flags::cloexec, - (arg & FD_CLOEXEC)); + CTX->file_map()->get(fd)->set_flag( + gkfs::filemap::OpenFile_flags::cloexec, (arg & FD_CLOEXEC)); return 0; - - case F_GETLK: { - auto* lk = reinterpret_cast(arg); - if(lk == nullptr) { - return -EINVAL; - } - - const auto lock_fd = get_lock_shim_fd(file->path()); - if(lock_fd < 0) { - return -errno; - } - - const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, - lock_fd, cmd, - arg); - return with_errno(gsl::narrow_cast(lock_ret)); - } - - case F_SETLK: - case F_SETLKW: { - const auto lock_fd = get_lock_shim_fd(file->path()); - if(lock_fd < 0) { - return -errno; - } - - const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, - lock_fd, cmd, - arg); - return with_errno(gsl::narrow_cast(lock_ret)); - } - -#if defined(F_GETLK64) && (!defined(F_GETLK) || (F_GETLK64 != F_GETLK)) - case F_GETLK64: { - auto* lk = reinterpret_cast(arg); - if(lk == nullptr) { - return -EINVAL; - } - - const auto lock_fd = get_lock_shim_fd(file->path()); - if(lock_fd < 0) { - return -errno; - } - - const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, - lock_fd, cmd, - arg); - return with_errno(gsl::narrow_cast(lock_ret)); - } -#endif - -#if defined(F_SETLK64) && (!defined(F_SETLK) || (F_SETLK64 != F_SETLK)) - case F_SETLK64: -#endif -#if defined(F_SETLKW64) && (!defined(F_SETLKW) || (F_SETLKW64 != F_SETLKW)) - case F_SETLKW64: -#endif - { - const auto lock_fd = get_lock_shim_fd(file->path()); - if(lock_fd < 0) { - return -errno; - } - - const auto lock_ret = syscall_no_intercept_wrapper(SYS_fcntl, - lock_fd, cmd, - arg); - return with_errno(gsl::narrow_cast(lock_ret)); - } - default: LOG(ERROR, "{}() unrecognized command {} on fd {}", __func__, cmd, fd); @@ -1367,15 +1214,10 @@ hook_statfs(const char* path, struct statfs* buf) { std::string rel_path; if(CTX->relativize_path(path, rel_path)) { - // Keep POSIX statfs semantics for internal paths: the referenced path - // must exist. MPI-IO coordination logic relies on ENOENT here. - if(rel_path != "/" && !gkfs::utils::get_metadata(rel_path, false)) { - return -ENOENT; - } return with_errno(gkfs::syscall::gkfs_statfs(buf)); } return gsl::narrow_cast( - syscall_no_intercept_wrapper(SYS_statfs, path, buf)); + syscall_no_intercept_wrapper(SYS_statfs, rel_path.c_str(), buf)); } int @@ -1386,15 +1228,11 @@ hook_fstatfs(unsigned int fd, struct statfs* buf) { return -EFAULT; } - auto rv = syscall_no_intercept_wrapper(SYS_fstatfs, fd, buf); - if(rv >= 0 || errno != EBADF) { - return rv; - } - - if(CTX->file_map()->get(fd)) { + if(CTX->file_map()->exist(fd)) { return with_errno(gkfs::syscall::gkfs_statfs(buf)); } - return rv; + return gsl::narrow_cast( + syscall_no_intercept_wrapper(SYS_fstatfs, fd, buf)); } /* The function should broadcast a flush message (pmem_persist i.e.) if the @@ -1403,11 +1241,8 @@ int hook_fsync(unsigned int fd) { LOG(DEBUG, "{}() called with fd: {}", __func__, fd); - if(CTX->file_map()->get(fd)) { - auto rv = gkfs::syscall::gkfs_fsync(fd); - if(rv >= 0 || errno != EBADF) { - return with_errno(rv); - } + if(CTX->file_map()->exist(fd)) { + return with_errno(gkfs::syscall::gkfs_fsync(fd)); } return gsl::narrow_cast(syscall_no_intercept_wrapper(SYS_fsync, fd)); @@ -1515,17 +1350,10 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd, "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'", __func__, fmt::ptr(addr), length, prot, flags, fd, offset); - // If fd is tracked as GKFS, verify that it is not a reused kernel fd. if(auto file = CTX->file_map()->get(fd)) { - struct stat st {}; - const auto kret = syscall_no_intercept_wrapper(SYS_fstat, fd, &st); - if(kret == -1 && errno == EBADF) { - return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, - offset); - } - LOG(DEBUG, - "{}() detected kernel-backed fd reuse for fd {} (tracked path '{}'), bypassing GKFS mmap", - __func__, fd, file->path()); + LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd, + file->path()); + return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset); } return reinterpret_cast(syscall_no_intercept_wrapper( SYS_mmap, addr, length, prot, flags, fd, offset)); -- GitLab From 359e94300698eb206b5a12c60bf7c6779861c22c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 19:57:24 +0100 Subject: [PATCH 56/68] fix flex --- src/client/gkfs_functions.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index a9dc4138f..918280715 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -165,7 +165,9 @@ tracked_segments_for_path(const std::string& path) { return segments; } -constexpr size_t k_mmap_flush_chunk_size = 8UL * 1024UL * 1024UL; +// Keep RPC payloads conservative for mmap publication. Very large chunks can +// trigger transport-side registration failures and stall close/munmap paths. +constexpr size_t k_mmap_flush_chunk_size = 512UL * 1024UL; constexpr int k_flush_ok = 0; constexpr int k_flush_unavailable = 1; constexpr int k_flush_deferred = 2; -- GitLab From ddcd527d68721db7fd37014249a19ad94d1021b6 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 20:09:41 +0100 Subject: [PATCH 57/68] add sendfile / copy file range --- include/client/hooks.hpp | 9 +++ src/client/gkfs_libc.cpp | 138 ++++++++++++++++++++++++++++++++++ src/client/hooks.cpp | 155 +++++++++++++++++++++++++++++++++++++++ src/client/intercept.cpp | 15 ++++ 4 files changed, 317 insertions(+) diff --git a/include/client/hooks.hpp b/include/client/hooks.hpp index 1b15b0bd5..a2951478f 100644 --- a/include/client/hooks.hpp +++ b/include/client/hooks.hpp @@ -135,6 +135,15 @@ ssize_t hook_pwritev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, unsigned long pos_l, unsigned long pos_h); +ssize_t +hook_sendfile(int out_fd, int in_fd, off_t* offset, size_t count); + +#ifdef SYS_copy_file_range +ssize_t +hook_copy_file_range(int fd_in, loff_t* off_in, int fd_out, loff_t* off_out, + size_t len, unsigned int flags); +#endif + int hook_unlinkat(int dirfd, const char* cpath, int flags); diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index c84d57e72..81fab8885 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -45,6 +45,7 @@ #include // For AT_FDCWD, O_* flags etc. #include // For DIR, struct dirent #include // For unshare, close, etc. +#include #include // For mode_t, struct stat #include // For off_t, ssize_t, etc. @@ -386,6 +387,69 @@ is_gkfs_fd(int fd) { return CTX->file_map()->exist(fd); } +constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; + +static ssize_t +copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, + size_t count) { + if(count == 0) { + return 0; + } + + std::vector buffer(std::min(count, k_fd_copy_chunk_size)); + size_t total = 0; + off64_t in_pos = in_off ? *in_off : 0; + off64_t out_pos = out_off ? *out_off : 0; + + while(total < count) { + const auto chunk = std::min(buffer.size(), count - total); + const ssize_t nread = + in_off ? pread64(in_fd, buffer.data(), chunk, in_pos) + : read(in_fd, buffer.data(), chunk); + if(nread < 0) { + return total > 0 ? static_cast(total) : -1; + } + if(nread == 0) { + break; + } + + if(in_off) { + in_pos += static_cast(nread); + } + + size_t consumed = 0; + while(consumed < static_cast(nread)) { + const ssize_t nwrite = + out_off + ? pwrite64(out_fd, buffer.data() + consumed, + static_cast(nread) - consumed, + out_pos) + : write(out_fd, buffer.data() + consumed, + static_cast(nread) - consumed); + if(nwrite < 0) { + return total > 0 ? static_cast(total) : -1; + } + if(nwrite == 0) { + errno = EIO; + return total > 0 ? static_cast(total) : -1; + } + consumed += static_cast(nwrite); + if(out_off) { + out_pos += static_cast(nwrite); + } + } + total += consumed; + } + + if(in_off) { + *in_off = in_pos; + } + if(out_off) { + *out_off = out_pos; + } + return static_cast(total); +} + struct GkfsDir { // Hypothetical structure that might be used if DIR is cast int fd; char* path; @@ -521,6 +585,20 @@ DLSYM_WRAPPER(ssize_t, pwritev2, (int fd, const struct iovec* iov, int iovcnt, off_t offset, int flags), (fd, iov, iovcnt, offset, flags), "pwritev2") +DLSYM_WRAPPER(ssize_t, sendfile, + (int out_fd, int in_fd, off_t* offset, size_t count), + (out_fd, in_fd, offset, count), "sendfile") +#if defined(__USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE) || \ + defined(__linux__) +DLSYM_WRAPPER(ssize_t, sendfile64, + (int out_fd, int in_fd, off64_t* offset, size_t count), + (out_fd, in_fd, offset, count), "sendfile64") +#endif +DLSYM_WRAPPER(ssize_t, copy_file_range, + (int fd_in, off64_t* off_in, int fd_out, off64_t* off_out, + size_t len, unsigned int flags), + (fd_in, off_in, fd_out, off_out, len, flags), + "copy_file_range") DLSYM_WRAPPER(off_t, lseek, (int fd, off_t offset, int whence), (fd, offset, whence), "lseek") DLSYM_WRAPPER(off64_t, lseek64, (int fd, off64_t offset, int whence), @@ -1039,6 +1117,66 @@ pwritev2(int fd, const struct iovec* iov, int iovcnt, off_t offset, int flags) { GKFS_FALLBACK(pwritev2, fd, iov, iovcnt, offset, flags) } +ssize_t +sendfile(int out_fd, int in_fd, off_t* offset, size_t count) { + gkfs_init_routine_placeholder(); + if(CTX->interception_enabled() && + (is_gkfs_fd(in_fd) || is_gkfs_fd(out_fd))) { + off64_t in_off = offset ? static_cast(*offset) : 0; + const auto ret = copy_between_fds(out_fd, in_fd, + offset ? &in_off : nullptr, nullptr, + count); + if(ret >= 0 && offset) { + *offset = static_cast(in_off); + } + return ret; + } + GKFS_FALLBACK(sendfile, out_fd, in_fd, offset, count); +} + +#if defined(__USE_LARGEFILE64) || defined(_LARGEFILE64_SOURCE) || \ + defined(__linux__) +ssize_t +sendfile64(int out_fd, int in_fd, off64_t* offset, size_t count) { + gkfs_init_routine_placeholder(); + if(CTX->interception_enabled() && + (is_gkfs_fd(in_fd) || is_gkfs_fd(out_fd))) { + const auto ret = copy_between_fds(out_fd, in_fd, offset, nullptr, + count); + return ret; + } + GKFS_FALLBACK(sendfile64, out_fd, in_fd, offset, count); +} +#endif + +ssize_t +copy_file_range(int fd_in, off64_t* off_in, int fd_out, off64_t* off_out, + size_t len, unsigned int flags) { + gkfs_init_routine_placeholder(); + if(CTX->interception_enabled() && + (is_gkfs_fd(fd_in) || is_gkfs_fd(fd_out))) { + if(flags != 0) { + errno = EINVAL; + return -1; + } + off64_t in_off = off_in ? *off_in : 0; + off64_t out_off = off_out ? *off_out : 0; + const auto ret = + copy_between_fds(fd_out, fd_in, off_in ? &in_off : nullptr, + off_out ? &out_off : nullptr, len); + if(ret >= 0) { + if(off_in) { + *off_in = in_off; + } + if(off_out) { + *off_out = out_off; + } + } + return ret; + } + GKFS_FALLBACK(copy_file_range, fd_in, off_in, fd_out, off_out, len, flags); +} + off_t lseek(int fd, off_t offset, int whence) { gkfs_init_routine_placeholder(); diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index 1cc249830..f891ae43f 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -47,7 +47,9 @@ #include +#include #include +#include #include @@ -67,6 +69,98 @@ with_errno(T ret) { return (ret < 0) ? -errno : ret; } +constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL; + +ssize_t +copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, + size_t count) { + auto in_file = CTX->file_map()->get(in_fd); + auto out_file = CTX->file_map()->get(out_fd); + + std::vector buffer(std::min(count, k_fd_copy_chunk_size)); + size_t total = 0; + off64_t in_pos = in_off ? *in_off : 0; + off64_t out_pos = out_off ? *out_off : 0; + + auto read_once = [&](char* dst, size_t len, off64_t pos) -> ssize_t { + if(in_off) { + if(in_file) { + auto ret = gkfs::syscall::gkfs_pread(in_fd, dst, len, pos); + return ret < 0 ? -errno : ret; + } + return syscall_no_intercept_wrapper(SYS_pread64, in_fd, dst, len, + pos); + } + + if(in_file) { + auto ret = gkfs::syscall::gkfs_read(in_fd, dst, len); + return ret < 0 ? -errno : ret; + } + return syscall_no_intercept_wrapper(SYS_read, in_fd, dst, len); + }; + + auto write_once = [&](const char* src, size_t len, off64_t pos) -> ssize_t { + if(out_off) { + if(out_file) { + auto ret = gkfs::syscall::gkfs_pwrite(out_fd, src, len, pos); + return ret < 0 ? -errno : ret; + } + return syscall_no_intercept_wrapper(SYS_pwrite64, out_fd, src, len, + pos); + } + + if(out_file) { + auto ret = gkfs::syscall::gkfs_write(out_fd, src, len); + return ret < 0 ? -errno : ret; + } + return syscall_no_intercept_wrapper(SYS_write, out_fd, src, len); + }; + + while(total < count) { + const auto chunk = std::min(buffer.size(), count - total); + const auto nread = read_once(buffer.data(), chunk, in_pos); + if(nread < 0) { + return total > 0 ? static_cast(total) : nread; + } + if(nread == 0) { + break; // EOF + } + + if(in_off) { + in_pos += static_cast(nread); + } + + size_t consumed = 0; + while(consumed < static_cast(nread)) { + const auto nwrite = write_once(buffer.data() + consumed, + static_cast(nread) - consumed, + out_pos); + if(nwrite < 0) { + return total > 0 ? static_cast(total) : nwrite; + } + if(nwrite == 0) { + errno = EIO; + return total > 0 ? static_cast(total) : -EIO; + } + consumed += static_cast(nwrite); + if(out_off) { + out_pos += static_cast(nwrite); + } + } + + total += consumed; + } + + if(in_off) { + *in_off = in_pos; + } + if(out_off) { + *out_off = out_pos; + } + + return static_cast(total); +} + } // namespace namespace gkfs::hook { @@ -386,6 +480,67 @@ hook_pwritev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, return syscall_no_intercept_wrapper(SYS_pwritev, fd, iov, iovcnt, pos_l); } +ssize_t +hook_sendfile(int out_fd, int in_fd, off_t* offset, size_t count) { + LOG(DEBUG, + "{}() called with out_fd: {}, in_fd: {}, offset: {}, count: {}", + __func__, out_fd, in_fd, fmt::ptr(offset), count); + + auto in_file = CTX->file_map()->get(in_fd); + auto out_file = CTX->file_map()->get(out_fd); + if(!in_file && !out_file) { +#ifdef SYS_sendfile + return syscall_no_intercept_wrapper(SYS_sendfile, out_fd, in_fd, offset, + count); +#else + return -ENOSYS; +#endif + } + + off64_t in_off = offset ? static_cast(*offset) : 0; + auto ret = copy_between_fds(out_fd, in_fd, offset ? &in_off : nullptr, + nullptr, count); + if(ret >= 0 && offset) { + *offset = static_cast(in_off); + } + return ret; +} + +#ifdef SYS_copy_file_range +ssize_t +hook_copy_file_range(int fd_in, loff_t* off_in, int fd_out, loff_t* off_out, + size_t len, unsigned int flags) { + LOG(DEBUG, + "{}() called with fd_in: {}, off_in: {}, fd_out: {}, off_out: {}, len: {}, flags: {}", + __func__, fd_in, fmt::ptr(off_in), fd_out, fmt::ptr(off_out), len, + flags); + + auto in_file = CTX->file_map()->get(fd_in); + auto out_file = CTX->file_map()->get(fd_out); + if(!in_file && !out_file) { + return syscall_no_intercept_wrapper(SYS_copy_file_range, fd_in, off_in, + fd_out, off_out, len, flags); + } + if(flags != 0) { + return -EINVAL; + } + + off64_t in_off = off_in ? static_cast(*off_in) : 0; + off64_t out_off = off_out ? static_cast(*off_out) : 0; + auto ret = copy_between_fds(fd_out, fd_in, off_in ? &in_off : nullptr, + off_out ? &out_off : nullptr, len); + if(ret >= 0) { + if(off_in) { + *off_in = static_cast(in_off); + } + if(off_out) { + *off_out = static_cast(out_off); + } + } + return ret; +} +#endif + int hook_unlinkat(int dirfd, const char* cpath, int flags) { diff --git a/src/client/intercept.cpp b/src/client/intercept.cpp index ee6797ca6..0cf78d905 100644 --- a/src/client/intercept.cpp +++ b/src/client/intercept.cpp @@ -695,6 +695,21 @@ hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, static_cast(arg3), static_cast(arg4)); break; +#ifdef SYS_sendfile + case SYS_sendfile: + *result = gkfs::hook::hook_sendfile( + static_cast(arg0), static_cast(arg1), + reinterpret_cast(arg2), static_cast(arg3)); + break; +#endif +#ifdef SYS_copy_file_range + case SYS_copy_file_range: + *result = gkfs::hook::hook_copy_file_range( + static_cast(arg0), reinterpret_cast(arg1), + static_cast(arg2), reinterpret_cast(arg3), + static_cast(arg4), static_cast(arg5)); + break; +#endif #ifdef SYS_unlink case SYS_unlink: *result = gkfs::hook::hook_unlinkat( -- GitLab From 246f1aadef4dcbaf3bcf6279a0bb54f97e42e441 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 20:16:14 +0100 Subject: [PATCH 58/68] fix overlapping --- src/client/gkfs_functions.cpp | 59 ++++++++++++++++++++++++++++++++--- src/client/gkfs_libc.cpp | 21 ++++++------- src/client/hooks.cpp | 9 +++--- 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 918280715..a7dc44ab0 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -93,6 +93,7 @@ struct mmap_entry { bool write_intent; size_t seeded_bytes; uint64_t flush_epoch; + uint64_t map_epoch; // We use addr as the unique key for the set bool @@ -178,6 +179,12 @@ flush_epoch_counter() { return counter; } +std::atomic& +map_epoch_counter() { + static std::atomic counter{0}; + return counter; +} + enum class flush_mode { read_triggered, explicit_sync }; bool @@ -377,11 +384,27 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, size_t len; off_t off; size_t seeded_bytes; + uint64_t flush_epoch; + uint64_t map_epoch; void* map_base; size_t map_len; off_t map_off; }; + const auto prefer_candidate = [](const flush_candidate& newer, + const flush_candidate& older) { + // Read-triggered publication is best-effort. If multiple tracked mmaps + // target the same file range, prefer the newest mapping snapshot to + // avoid stale generations clobbering current data. + if(newer.map_epoch != older.map_epoch) { + return newer.map_epoch > older.map_epoch; + } + if(newer.seeded_bytes != older.seeded_bytes) { + return newer.seeded_bytes > older.seeded_bytes; + } + return newer.flush_epoch > older.flush_epoch; + }; + std::vector candidates; { std::lock_guard lock(get_mmap_set_mutex()); @@ -406,10 +429,35 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, const auto rel_begin = static_cast(flush_begin - entry_begin); const size_t len = static_cast(flush_end - flush_begin); - candidates.push_back(flush_candidate{ - addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin), len, - static_cast(flush_begin), entry.seeded_bytes, - entry.addr, entry.length, entry.offset}); + auto candidate = flush_candidate{ + addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin), + len, + static_cast(flush_begin), + entry.seeded_bytes, + entry.flush_epoch, + entry.map_epoch, + entry.addr, + entry.length, + entry.offset}; + + bool merged = false; + for(auto& selected : candidates) { + if(selected.off == candidate.off && + selected.len == candidate.len) { + if(prefer_candidate(candidate, selected)) { + LOG(DEBUG, + "{}() replacing candidate path '{}' off {} len {} map_epoch {} -> {}", + __func__, path, candidate.off, candidate.len, + selected.map_epoch, candidate.map_epoch); + selected = candidate; + } + merged = true; + break; + } + } + if(!merged) { + candidates.push_back(std::move(candidate)); + } } } @@ -608,9 +656,10 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Register mapping under lock so concurrent threads don't race on mmap_set { + const auto map_epoch = map_epoch_counter().fetch_add(1) + 1; std::lock_guard lock(get_mmap_set_mutex()); get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot, - flags, write_intent, 0, 0}); + flags, write_intent, 0, 0, map_epoch}); } const auto seeded = gkfs::syscall::gkfs_pread(fd, ptr, length, offset); diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 81fab8885..05d7444c0 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -42,9 +42,9 @@ #include #include #include -#include // For AT_FDCWD, O_* flags etc. -#include // For DIR, struct dirent -#include // For unshare, close, etc. +#include // For AT_FDCWD, O_* flags etc. +#include // For DIR, struct dirent +#include // For unshare, close, etc. #include #include // For mode_t, struct stat #include // For off_t, ssize_t, etc. @@ -420,8 +420,7 @@ copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, size_t consumed = 0; while(consumed < static_cast(nread)) { const ssize_t nwrite = - out_off - ? pwrite64(out_fd, buffer.data() + consumed, + out_off ? pwrite64(out_fd, buffer.data() + consumed, static_cast(nread) - consumed, out_pos) : write(out_fd, buffer.data() + consumed, @@ -597,8 +596,7 @@ DLSYM_WRAPPER(ssize_t, sendfile64, DLSYM_WRAPPER(ssize_t, copy_file_range, (int fd_in, off64_t* off_in, int fd_out, off64_t* off_out, size_t len, unsigned int flags), - (fd_in, off_in, fd_out, off_out, len, flags), - "copy_file_range") + (fd_in, off_in, fd_out, off_out, len, flags), "copy_file_range") DLSYM_WRAPPER(off_t, lseek, (int fd, off_t offset, int whence), (fd, offset, whence), "lseek") DLSYM_WRAPPER(off64_t, lseek64, (int fd, off64_t offset, int whence), @@ -1123,9 +1121,8 @@ sendfile(int out_fd, int in_fd, off_t* offset, size_t count) { if(CTX->interception_enabled() && (is_gkfs_fd(in_fd) || is_gkfs_fd(out_fd))) { off64_t in_off = offset ? static_cast(*offset) : 0; - const auto ret = copy_between_fds(out_fd, in_fd, - offset ? &in_off : nullptr, nullptr, - count); + const auto ret = copy_between_fds( + out_fd, in_fd, offset ? &in_off : nullptr, nullptr, count); if(ret >= 0 && offset) { *offset = static_cast(in_off); } @@ -1141,8 +1138,8 @@ sendfile64(int out_fd, int in_fd, off64_t* offset, size_t count) { gkfs_init_routine_placeholder(); if(CTX->interception_enabled() && (is_gkfs_fd(in_fd) || is_gkfs_fd(out_fd))) { - const auto ret = copy_between_fds(out_fd, in_fd, offset, nullptr, - count); + const auto ret = + copy_between_fds(out_fd, in_fd, offset, nullptr, count); return ret; } GKFS_FALLBACK(sendfile64, out_fd, in_fd, offset, count); diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index f891ae43f..fa44e60a1 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -132,9 +132,9 @@ copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off, size_t consumed = 0; while(consumed < static_cast(nread)) { - const auto nwrite = write_once(buffer.data() + consumed, - static_cast(nread) - consumed, - out_pos); + const auto nwrite = + write_once(buffer.data() + consumed, + static_cast(nread) - consumed, out_pos); if(nwrite < 0) { return total > 0 ? static_cast(total) : nwrite; } @@ -482,8 +482,7 @@ hook_pwritev(unsigned long fd, const struct iovec* iov, unsigned long iovcnt, ssize_t hook_sendfile(int out_fd, int in_fd, off_t* offset, size_t count) { - LOG(DEBUG, - "{}() called with out_fd: {}, in_fd: {}, offset: {}, count: {}", + LOG(DEBUG, "{}() called with out_fd: {}, in_fd: {}, offset: {}, count: {}", __func__, out_fd, in_fd, fmt::ptr(offset), count); auto in_file = CTX->file_map()->get(in_fd); -- GitLab From cac5cfb2787319e60f5cb8ea71067c1eb9c9f37f Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 20:27:23 +0100 Subject: [PATCH 59/68] deferred --- include/client/gkfs_functions.hpp | 3 +++ src/client/gkfs_functions.cpp | 27 +++++++++++++++++++++++++++ src/client/gkfs_metadata.cpp | 4 ++++ 3 files changed, 34 insertions(+) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index f7ee815dc..91190d936 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -158,6 +158,9 @@ gkfs_mmap_has_active_path(const std::string& path); bool gkfs_mmap_has_active_write_path(const std::string& path); +void +gkfs_mmap_invalidate_writeback_for_path(const std::string& path); + void gkfs_register_deferred_trunc(const std::string& path); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index a7dc44ab0..3d206cf67 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -354,6 +354,33 @@ gkfs_mmap_has_active_write_path(const std::string& path) { }); } +void +gkfs_mmap_invalidate_writeback_for_path(const std::string& path) { + std::lock_guard lock(get_mmap_set_mutex()); + auto& mmap_set = get_mmap_set(); + size_t disabled = 0; + for(auto it = mmap_set.begin(); it != mmap_set.end();) { + if(it->path != path || !it->write_intent) { + ++it; + continue; + } + auto updated = *it; + it = mmap_set.erase(it); + // A deferred O_TRUNC starts a new logical file generation for this + // path. Existing mappings must not publish stale bytes during + // read-triggered flush. + updated.write_intent = false; + updated.seeded_bytes = 0; + updated.flush_epoch = 0; + mmap_set.insert(std::move(updated)); + ++disabled; + } + if(disabled != 0) { + LOG(DEBUG, "{}() path '{}' disabled stale writeback segments {}", + __func__, path, disabled); + } +} + /** * Flush any pending write-mmap for `path` directly to the GekkoFS daemon. * diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index badb1c849..84543909d 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -155,6 +155,10 @@ gkfs_register_deferred_trunc(const std::string& path) { std::lock_guard lock(deferred_trunc_mutex()); auto& deferred = deferred_trunc_paths(); deferred[path]++; + // A deferred truncate starts a new publication phase for this pathname. + // Prevent pre-truncate mappings from clobbering offset 0 while the new + // writer is still publishing. + gkfs_mmap_invalidate_writeback_for_path(path); } void -- GitLab From 091d74864e7eadeba1719d6fa5839d1930794657 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 20:29:53 +0100 Subject: [PATCH 60/68] def2 --- src/client/gkfs_functions.cpp | 33 +++++++++++++++++++++++++-------- src/client/gkfs_metadata.cpp | 6 ++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 3d206cf67..b603b07b5 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -432,6 +432,14 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, return newer.flush_epoch > older.flush_epoch; }; + // Publish deferred truncation before selecting mmap candidates. If we + // snapshot first, stale pre-truncate mappings can still be flushed after + // truncate publication. + if(gkfs_has_deferred_trunc(path) && + gkfs_publish_deferred_trunc_for_path(path) < 0) { + return -1; + } + std::vector candidates; { std::lock_guard lock(get_mmap_set_mutex()); @@ -493,17 +501,26 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, return 0; } - // Publish deferred truncation once before first mmap-backed write - // publication. This keeps O_TRUNC ordering consistent while avoiding - // holding the mmap-tracking lock across RPC. - if(gkfs_has_deferred_trunc(path) && - gkfs_publish_deferred_trunc_for_path(path) < 0) { - return -1; - } - const long page_size = ::sysconf(_SC_PAGESIZE); size_t flushed_segments = 0; for(const auto& c : candidates) { + { + std::lock_guard lock(get_mmap_set_mutex()); + const auto& mmap_set = get_mmap_set(); + auto it = std::find_if( + mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { + return entry.path == path && entry.addr == c.map_base && + entry.length == c.map_len && + entry.offset == c.map_off; + }); + if(it == mmap_set.end() || !should_flush_mapping(*it)) { + LOG(DEBUG, + "{}() skipping stale/invalidated overlap path '{}' addr {} len {} off {}", + __func__, path, c.addr, c.len, c.off); + continue; + } + } + LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", __func__, path, c.addr, c.len, c.off); diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 84543909d..77e0ae124 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -199,6 +199,9 @@ gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file) { if(md && gkfs_truncate(path, md->size(), 0)) { return -1; } + // Mappings created while truncate was deferred can still carry pre-truncate + // bytes. Prevent them from publishing stale data after truncate is visible. + gkfs_mmap_invalidate_writeback_for_path(path); file.set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, false); gkfs_unregister_deferred_trunc(path); return 0; @@ -213,6 +216,9 @@ gkfs_publish_deferred_trunc_for_path(const std::string& path) { if(md && gkfs_truncate(path, md->size(), 0)) { return -1; } + // Keep read-triggered mmap publication from re-introducing pre-truncate + // contents right after truncate publication. + gkfs_mmap_invalidate_writeback_for_path(path); gkfs_unregister_deferred_trunc(path); return 0; } -- GitLab From 4183d2949557ef7ffccb93ced0f3959f007749dc Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 20:34:18 +0100 Subject: [PATCH 61/68] fix --- src/client/gkfs_metadata.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index 77e0ae124..d3fd76de5 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -328,6 +328,9 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } + // Re-creating a path starts a fresh publication phase. Drop stale + // mmap writeback candidates from a previous file incarnation. + gkfs_mmap_invalidate_writeback_for_path(path); return fd; } @@ -368,6 +371,9 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } + // Successful create on an existing pathname (after delete/reuse) + // must not allow stale mmap segments from an older incarnation. + gkfs_mmap_invalidate_writeback_for_path(path); // file was successfully created. Add to filemap return fd; } @@ -649,6 +655,10 @@ gkfs_remove(const std::string& path) { errno = err; return -1; } + // A removed pathname may be re-created quickly; prevent stale mappings of + // the previous file incarnation from publishing data into the new one. + gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_unregister_deferred_trunc(path); return 0; } -- GitLab From 9d7a41d46cfb36cc25bcd7c0d536ecf46a167f63 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 20:38:44 +0100 Subject: [PATCH 62/68] unmap --- include/client/gkfs_functions.hpp | 3 ++ src/client/gkfs_functions.cpp | 59 ++++++++++++++++++++++++++++++- src/client/gkfs_metadata.cpp | 12 +++---- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index 91190d936..733970582 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -161,6 +161,9 @@ gkfs_mmap_has_active_write_path(const std::string& path); void gkfs_mmap_invalidate_writeback_for_path(const std::string& path); +void +gkfs_mmap_advance_writeback_epoch_for_path(const std::string& path); + void gkfs_register_deferred_trunc(const std::string& path); diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index b603b07b5..86c716e47 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #ifdef GKFS_ENABLE_CLIENT_METRICS @@ -94,6 +95,7 @@ struct mmap_entry { size_t seeded_bytes; uint64_t flush_epoch; uint64_t map_epoch; + uint64_t path_epoch; // We use addr as the unique key for the set bool @@ -119,6 +121,12 @@ get_mmap_set_mutex() { return mmap_set_mutex; } +std::unordered_map& +get_mmap_path_epoch_map() { + static std::unordered_map epoch_map; + return epoch_map; +} + using addr_type = uintptr_t; addr_type @@ -185,6 +193,13 @@ map_epoch_counter() { return counter; } +uint64_t +current_path_epoch_nolock(const std::string& path) { + auto& epoch_map = get_mmap_path_epoch_map(); + const auto it = epoch_map.find(path); + return (it == epoch_map.end()) ? 0 : it->second; +} + enum class flush_mode { read_triggered, explicit_sync }; bool @@ -381,6 +396,34 @@ gkfs_mmap_invalidate_writeback_for_path(const std::string& path) { } } +void +gkfs_mmap_advance_writeback_epoch_for_path(const std::string& path) { + std::lock_guard lock(get_mmap_set_mutex()); + auto& epoch_map = get_mmap_path_epoch_map(); + const auto next_epoch = ++epoch_map[path]; + + auto& mmap_set = get_mmap_set(); + size_t disabled = 0; + for(auto it = mmap_set.begin(); it != mmap_set.end();) { + if(it->path != path || !it->write_intent) { + ++it; + continue; + } + auto updated = *it; + it = mmap_set.erase(it); + // Old write-intent mappings must not publish into the next path epoch. + updated.write_intent = false; + updated.seeded_bytes = 0; + updated.flush_epoch = 0; + mmap_set.insert(std::move(updated)); + ++disabled; + } + + LOG(DEBUG, + "{}() path '{}' advanced writeback epoch {} disabled segments {}", + __func__, path, next_epoch, disabled); +} + /** * Flush any pending write-mmap for `path` directly to the GekkoFS daemon. * @@ -440,9 +483,11 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, return -1; } + uint64_t active_path_epoch = 0; std::vector candidates; { std::lock_guard lock(get_mmap_set_mutex()); + active_path_epoch = current_path_epoch_nolock(path); LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", __func__, path, tracked_segments_for_path(path), read_begin, read_end); @@ -451,6 +496,9 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, if(entry.path != path || !should_flush_mapping(entry)) { continue; } + if(entry.path_epoch != active_path_epoch) { + continue; + } const auto entry_begin = static_cast(entry.offset); const auto entry_end = @@ -519,6 +567,13 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, __func__, path, c.addr, c.len, c.off); continue; } + if(it->path_epoch != active_path_epoch) { + LOG(DEBUG, + "{}() skipping old-epoch overlap path '{}' addr {} len {} off {} entry-epoch {} active-epoch {}", + __func__, path, c.addr, c.len, c.off, it->path_epoch, + active_path_epoch); + continue; + } } LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", @@ -702,8 +757,10 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, { const auto map_epoch = map_epoch_counter().fetch_add(1) + 1; std::lock_guard lock(get_mmap_set_mutex()); + const auto path_epoch = current_path_epoch_nolock(path); get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot, - flags, write_intent, 0, 0, map_epoch}); + flags, write_intent, 0, 0, map_epoch, + path_epoch}); } const auto seeded = gkfs::syscall::gkfs_pread(fd, ptr, length, offset); diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index d3fd76de5..fb89d317a 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -158,7 +158,7 @@ gkfs_register_deferred_trunc(const std::string& path) { // A deferred truncate starts a new publication phase for this pathname. // Prevent pre-truncate mappings from clobbering offset 0 while the new // writer is still publishing. - gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_mmap_advance_writeback_epoch_for_path(path); } void @@ -201,7 +201,7 @@ gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file) { } // Mappings created while truncate was deferred can still carry pre-truncate // bytes. Prevent them from publishing stale data after truncate is visible. - gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_mmap_advance_writeback_epoch_for_path(path); file.set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, false); gkfs_unregister_deferred_trunc(path); return 0; @@ -218,7 +218,7 @@ gkfs_publish_deferred_trunc_for_path(const std::string& path) { } // Keep read-triggered mmap publication from re-introducing pre-truncate // contents right after truncate publication. - gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_mmap_advance_writeback_epoch_for_path(path); gkfs_unregister_deferred_trunc(path); return 0; } @@ -330,7 +330,7 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { } // Re-creating a path starts a fresh publication phase. Drop stale // mmap writeback candidates from a previous file incarnation. - gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_mmap_advance_writeback_epoch_for_path(path); return fd; } @@ -373,7 +373,7 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { } // Successful create on an existing pathname (after delete/reuse) // must not allow stale mmap segments from an older incarnation. - gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_mmap_advance_writeback_epoch_for_path(path); // file was successfully created. Add to filemap return fd; } @@ -657,7 +657,7 @@ gkfs_remove(const std::string& path) { } // A removed pathname may be re-created quickly; prevent stale mappings of // the previous file incarnation from publishing data into the new one. - gkfs_mmap_invalidate_writeback_for_path(path); + gkfs_mmap_advance_writeback_epoch_for_path(path); gkfs_unregister_deferred_trunc(path); return 0; } -- GitLab From b3b6cdec2ea14c9b15deb84d7e50d5355fe75306 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 2 Mar 2026 21:06:09 +0100 Subject: [PATCH 63/68] fix2 --- src/client/gkfs_functions.cpp | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 86c716e47..96da04080 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -485,6 +485,7 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, uint64_t active_path_epoch = 0; std::vector candidates; + uint64_t newest_map_epoch = 0; { std::lock_guard lock(get_mmap_set_mutex()); active_path_epoch = current_path_epoch_nolock(path); @@ -539,6 +540,8 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } } if(!merged) { + newest_map_epoch = std::max(newest_map_epoch, + candidate.map_epoch); candidates.push_back(std::move(candidate)); } } @@ -549,6 +552,24 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, return 0; } + if(newest_map_epoch != 0) { + const size_t before = candidates.size(); + candidates.erase(std::remove_if(candidates.begin(), candidates.end(), + [newest_map_epoch](const auto& c) { + return c.map_epoch != + newest_map_epoch; + }), + candidates.end()); + if(candidates.size() != before) { + // Read-triggered publication is advisory. When multiple mmap + // generations overlap the same path, only publish the newest one + // to avoid stale generations clobbering file offset 0. + LOG(DEBUG, + "{}() path '{}' dropping {} stale overlap candidates (keeping map_epoch {})", + __func__, path, before - candidates.size(), newest_map_epoch); + } + } + const long page_size = ::sysconf(_SC_PAGESIZE); size_t flushed_segments = 0; for(const auto& c : candidates) { @@ -576,8 +597,9 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, } } - LOG(DEBUG, "{}() flushing overlap path '{}' addr {} len {} file-off {}", - __func__, path, c.addr, c.len, c.off); + LOG(DEBUG, + "{}() flushing overlap path '{}' addr {} len {} file-off {} map_epoch {}", + __func__, path, c.addr, c.len, c.off, c.map_epoch); bool definitely_unmapped = false; if(page_size > 0) { @@ -624,8 +646,9 @@ gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, const int flush_rc = flush_range_chunked(path, c.addr, c.len, c.off, flush_mode::read_triggered, c.seeded_bytes); - LOG(DEBUG, "{}() flush result path '{}' addr {} len {} off {} rc {}", - __func__, path, c.addr, c.len, c.off, flush_rc); + LOG(DEBUG, + "{}() flush result path '{}' addr {} len {} off {} map_epoch {} rc {}", + __func__, path, c.addr, c.len, c.off, c.map_epoch, flush_rc); if(flush_rc == k_flush_unavailable) { std::lock_guard lock(get_mmap_set_mutex()); auto& mmap_set = get_mmap_set(); -- GitLab From 0afb21c4506085d8a9d651f853af9d42fb0ac37c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 11:05:35 +0100 Subject: [PATCH 64/68] simplify mmap --- include/client/gkfs_functions.hpp | 42 -- src/client/gkfs_data.cpp | 66 --- src/client/gkfs_functions.cpp | 953 +++++------------------------- src/client/gkfs_libc.cpp | 79 +-- src/client/gkfs_metadata.cpp | 177 +----- src/client/hooks.cpp | 18 +- 6 files changed, 158 insertions(+), 1177 deletions(-) diff --git a/include/client/gkfs_functions.hpp b/include/client/gkfs_functions.hpp index 733970582..cdfe357c9 100644 --- a/include/client/gkfs_functions.hpp +++ b/include/client/gkfs_functions.hpp @@ -145,39 +145,6 @@ gkfs_read_ws(const gkfs::filemap::OpenFile& file, char* buf, size_t count, ssize_t gkfs_pread(int fd, void* buf, size_t count, off64_t offset); -int -gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, - size_t length); - -int -gkfs_mmap_flush_all_for_path(const std::string& path); - -bool -gkfs_mmap_has_active_path(const std::string& path); - -bool -gkfs_mmap_has_active_write_path(const std::string& path); - -void -gkfs_mmap_invalidate_writeback_for_path(const std::string& path); - -void -gkfs_mmap_advance_writeback_epoch_for_path(const std::string& path); - -void -gkfs_register_deferred_trunc(const std::string& path); - -void -gkfs_unregister_deferred_trunc(const std::string& path); - -bool -gkfs_has_deferred_trunc(const std::string& path); - -int -gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file); - -int -gkfs_publish_deferred_trunc_for_path(const std::string& path); ssize_t gkfs_read(int fd, void* buf, size_t count); @@ -226,15 +193,6 @@ gkfs_munmap(void* addr, size_t length); int gkfs_msync(void* addr, size_t length, int flags); -/** - * Returns true for file descriptors that should publish mmap-backed writes to - * the daemon. - * - * This predicate is path-agnostic on purpose: mmap flush policy should depend - * on open intent, not on workload-specific pathname conventions. - */ -bool -gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file); } // namespace gkfs::syscall diff --git a/src/client/gkfs_data.cpp b/src/client/gkfs_data.cpp index 5b1aea829..5632dcc0e 100644 --- a/src/client/gkfs_data.cpp +++ b/src/client/gkfs_data.cpp @@ -74,37 +74,6 @@ using namespace std; namespace gkfs::syscall { -namespace { -bool -is_suspicious_zero_prefix(const void* buf, long read_ret, off64_t offset, - size_t count) { - if(offset != 0 || count < 8) { - return false; - } - if(read_ret == 0) { - return true; - } - if(read_ret < 8) { - return false; - } - const auto* cbuf = reinterpret_cast(buf); - for(size_t i = 0; i < 8; ++i) { - if(cbuf[i] != 0U) { - return false; - } - } - return true; -} - -bool -should_retry_read_after_flush(const gkfs::filemap::OpenFile& file, - long read_ret, const void* buf, off64_t offset, - size_t count) { - return gkfs_mmap_has_active_write_path(file.path()) && - is_suspicious_zero_prefix(buf, read_ret, offset, count); -} -} // namespace - /** * Actual write function for all gkfs write operations * errno may be set @@ -164,10 +133,6 @@ gkfs_do_write(gkfs::filemap::OpenFile& file, const char* buf, size_t count, } file.set_flag(gkfs::filemap::OpenFile_flags::creation_pending, false); } - if(gkfs_publish_deferred_trunc(file) < 0) { - return -1; - } - // clear inline data cache as it is stale if(!file.inline_data().empty()) file.inline_data(""); @@ -612,37 +577,6 @@ gkfs_do_read(const gkfs::filemap::OpenFile& file, char* buf, size_t count, errno = err; return -1; } - if(should_retry_read_after_flush(file, ret.second, buf, offset, count)) { - constexpr int k_read_retry_count = 2500; - constexpr auto k_read_retry_sleep = std::chrono::milliseconds(2); - for(int attempt = 1; - attempt <= k_read_retry_count && - should_retry_read_after_flush(file, ret.second, buf, offset, count); - ++attempt) { - // Trigger pending mmap-backed publication, then retry the read. - (void) gkfs_mmap_flush_for_path(file.path(), offset, count); - std::this_thread::sleep_for(k_read_retry_sleep); - auto [size_err, remote_size] = - gkfs::rpc::forward_get_metadentry_size(file.path(), 0); - if(size_err == 0 && remote_size > 0 && (attempt % 50) == 0) { - LOG(DEBUG, - "{}() read retry path '{}' attempt {} remote_size {}", - __func__, file.path(), attempt, remote_size); - } - ret = do_chunk_read(); - if(ret.first) { - errno = ret.first; - return -1; - } - if(!should_retry_read_after_flush(file, ret.second, buf, offset, - count)) { - LOG(DEBUG, - "{}() read retry recovered path '{}' after retry {} bytes {}", - __func__, file.path(), attempt, ret.second); - break; - } - } - } // XXX check that we don't try to read past end of the file return ret.second; // return read size } diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 96da04080..7a358bc06 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include #include @@ -81,790 +83,151 @@ extern "C" { using namespace std; +namespace gkfs::syscall { + namespace { -struct mmap_entry { +struct MmapEntry { void* addr; - int fd; - std::string path; size_t length; + int refcount; + std::string path; off_t offset; int prot; int flags; - bool write_intent; - size_t seeded_bytes; - uint64_t flush_epoch; - uint64_t map_epoch; - uint64_t path_epoch; - - // We use addr as the unique key for the set - bool - operator<(const mmap_entry& other) const { - return addr < other.addr; - } }; -// Tracks active GekkoFS mmap regions. -// Stores (addr, fd, path, length, offset, prot). -// path is captured at mmap() time so that munmap()/msync() can flush data -// back even after the originating fd has been closed (POSIX allows this). - -std::set& -get_mmap_set() { - static std::set mmap_set; - return mmap_set; -} - -std::mutex& -get_mmap_set_mutex() { - static std::mutex mmap_set_mutex; - return mmap_set_mutex; -} - -std::unordered_map& -get_mmap_path_epoch_map() { - static std::unordered_map epoch_map; - return epoch_map; -} - -using addr_type = uintptr_t; - -addr_type -ptr_to_addr(void* ptr) { - return reinterpret_cast(ptr); -} - -void* -addr_to_ptr(addr_type addr) { - return reinterpret_cast(addr); -} - -bool -overlaps(addr_type a_begin, addr_type a_end, addr_type b_begin, - addr_type b_end) { - return a_begin < b_end && b_begin < a_end; -} - -bool -is_shared_map_type(int flags) { - const int map_type = flags & MAP_TYPE; -#ifdef MAP_SHARED_VALIDATE - return map_type == MAP_SHARED || map_type == MAP_SHARED_VALIDATE; -#else - return map_type == MAP_SHARED; -#endif -} - -bool -should_flush_mapping(const mmap_entry& entry) { - // Only shared + writable maps with explicit write intent are eligible for - // daemon flush. This avoids publishing reader-side anonymous snapshots. - return entry.write_intent && (entry.prot & PROT_WRITE) && - is_shared_map_type(entry.flags) && entry.length != 0; -} - -size_t -tracked_segments_for_path(const std::string& path) { - size_t segments = 0; - for(const auto& entry : get_mmap_set()) { - if(entry.path == path) { - ++segments; - } - } - return segments; -} - -// Keep RPC payloads conservative for mmap publication. Very large chunks can -// trigger transport-side registration failures and stall close/munmap paths. -constexpr size_t k_mmap_flush_chunk_size = 512UL * 1024UL; -constexpr int k_flush_ok = 0; -constexpr int k_flush_unavailable = 1; -constexpr int k_flush_deferred = 2; - -std::atomic& -flush_epoch_counter() { - static std::atomic counter{0}; - return counter; -} - -std::atomic& -map_epoch_counter() { - static std::atomic counter{0}; - return counter; -} - -uint64_t -current_path_epoch_nolock(const std::string& path) { - auto& epoch_map = get_mmap_path_epoch_map(); - const auto it = epoch_map.find(path); - return (it == epoch_map.end()) ? 0 : it->second; -} - -enum class flush_mode { read_triggered, explicit_sync }; - -bool -has_non_zero_prefix(const char* buf, size_t size) { - const size_t prefix = std::min(size, 8); - for(size_t i = 0; i < prefix; ++i) { - if(static_cast(buf[i]) != 0U) { - return true; - } - } - return false; -} - -bool -remote_prefix_is_non_zero(const std::string& path) { - std::array remote_prefix{}; - std::set failed; - auto [rerr, rsize] = gkfs::rpc::forward_read( - path, remote_prefix.data(), 0, remote_prefix.size(), 0, failed); - if(rerr != 0 || rsize <= 0) { - return false; - } - return has_non_zero_prefix(remote_prefix.data(), - static_cast(rsize)); -} - -int -open_self_mem_fd() { - return static_cast(::syscall_no_intercept( - SYS_openat, AT_FDCWD, "/proc/self/mem", O_RDONLY | O_CLOEXEC, 0)); -} - -int -flush_range_chunked(const std::string& path, const void* addr, size_t len, - off_t off, flush_mode mode, size_t seeded_bytes = 0) { - const auto* base = static_cast(addr); - size_t total = 0; - std::vector bounce(k_mmap_flush_chunk_size); - const int mem_fd = open_self_mem_fd(); - if(mem_fd < 0) { - errno = EIO; - return -1; - } - - while(total < len) { - const size_t chunk = std::min(k_mmap_flush_chunk_size, len - total); - const off_t chunk_off = - static_cast(off + static_cast(total)); - const long page_size_l = ::sysconf(_SC_PAGESIZE); - const size_t page_size = - page_size_l > 0 ? static_cast(page_size_l) : 4096U; - size_t copied = 0; - while(copied < chunk) { - const size_t to_copy = std::min(page_size, chunk - copied); - const auto src_off = static_cast( - reinterpret_cast(base + total + copied)); - const auto nread = static_cast(::syscall_no_intercept( - SYS_pread64, mem_fd, bounce.data() + copied, to_copy, - src_off)); - if(nread <= 0) { - const auto saved_errno = errno; - if(saved_errno == EFAULT || saved_errno == ENOMEM) { - if(copied == 0) { - LOG(WARNING, - "{}() source unavailable before copy path '{}' off {} len {}", - __func__, path, src_off, to_copy); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return k_flush_unavailable; - } - break; - } - LOG(WARNING, - "{}() self-mem read failed path '{}' off {} len {} errno {}", - __func__, path, src_off, to_copy, saved_errno); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return -1; - } - copied += static_cast(nread); - } - - if(copied == 0) { - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return k_flush_unavailable; - } - - if(mode == flush_mode::read_triggered && chunk_off == 0 && - !has_non_zero_prefix(bounce.data(), copied)) { - // Read-triggered publication must not synthesize a brand-new - // all-zero header at file offset 0 from an mmap view that was never - // seeded from existing file bytes. That can race with concurrent - // writers and make readers observe a permanent invalid magic. - if(seeded_bytes < 8) { - LOG(DEBUG, - "{}() deferring unseeded zero-prefix publication path '{}' off {} len {} seeded {}", - __func__, path, chunk_off, copied, seeded_bytes); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return k_flush_deferred; - } - - // Also avoid clobbering an already-visible non-zero header with a - // stale all-zero snapshot. - if(remote_prefix_is_non_zero(path)) { - LOG(DEBUG, - "{}() skipping stale zero-prefix publication path '{}' off {} len {}", - __func__, path, chunk_off, copied); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return k_flush_deferred; - } - } - - auto [werr, wsize] = gkfs::rpc::forward_write(path, bounce.data(), - chunk_off, copied, 0); - if(werr) { - errno = werr; - LOG(WARNING, - "{}() forward_write failed path '{}' off {} len {} err {}", - __func__, path, chunk_off, copied, werr); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return -1; - } - if(wsize <= 0) { - errno = EIO; - LOG(WARNING, - "{}() forward_write invalid size {} path '{}' off {} len {}", - __func__, wsize, path, chunk_off, copied); - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return -1; - } - - const size_t wrote = static_cast(wsize); - gkfs::utils::update_file_size(path, wrote, chunk_off, false, false); - total += wrote; - if(wrote < copied) { - break; - } - } - - (void) ::syscall_no_intercept(SYS_close, mem_fd); - return (total == len) ? k_flush_ok : static_cast(total); -} +std::mutex mmap_mtx; +std::unordered_map mmap_registry; +std::unordered_map addr_to_key; } // namespace -namespace gkfs::syscall { - -bool -gkfs_has_explicit_mmap_write_intent(gkfs::filemap::OpenFile& file) { - // Keep this predicate as the single policy source for mmap-backed flush - // decisions. - // Path-specific behavior is intentionally avoided to keep policy consistent - // across syscall/libc interception modes and applications. - return file.get_flag(gkfs::filemap::OpenFile_flags::wronly) || - file.get_flag(gkfs::filemap::OpenFile_flags::append) || - file.get_flag(gkfs::filemap::OpenFile_flags::trunc) || - file.get_flag(gkfs::filemap::OpenFile_flags::creat) || - file.get_flag(gkfs::filemap::OpenFile_flags::creation_pending) || - file.get_flag(gkfs::filemap::OpenFile_flags::rdwr); -} - -bool -gkfs_mmap_has_active_write_path(const std::string& path) { - std::lock_guard lock(get_mmap_set_mutex()); - const auto& mmap_set = get_mmap_set(); - return std::any_of( - mmap_set.begin(), mmap_set.end(), [&path](const auto& entry) { - return entry.path == path && should_flush_mapping(entry); - }); -} - -void -gkfs_mmap_invalidate_writeback_for_path(const std::string& path) { - std::lock_guard lock(get_mmap_set_mutex()); - auto& mmap_set = get_mmap_set(); - size_t disabled = 0; - for(auto it = mmap_set.begin(); it != mmap_set.end();) { - if(it->path != path || !it->write_intent) { - ++it; - continue; - } - auto updated = *it; - it = mmap_set.erase(it); - // A deferred O_TRUNC starts a new logical file generation for this - // path. Existing mappings must not publish stale bytes during - // read-triggered flush. - updated.write_intent = false; - updated.seeded_bytes = 0; - updated.flush_epoch = 0; - mmap_set.insert(std::move(updated)); - ++disabled; - } - if(disabled != 0) { - LOG(DEBUG, "{}() path '{}' disabled stale writeback segments {}", - __func__, path, disabled); - } -} - -void -gkfs_mmap_advance_writeback_epoch_for_path(const std::string& path) { - std::lock_guard lock(get_mmap_set_mutex()); - auto& epoch_map = get_mmap_path_epoch_map(); - const auto next_epoch = ++epoch_map[path]; - - auto& mmap_set = get_mmap_set(); - size_t disabled = 0; - for(auto it = mmap_set.begin(); it != mmap_set.end();) { - if(it->path != path || !it->write_intent) { - ++it; - continue; - } - auto updated = *it; - it = mmap_set.erase(it); - // Old write-intent mappings must not publish into the next path epoch. - updated.write_intent = false; - updated.seeded_bytes = 0; - updated.flush_epoch = 0; - mmap_set.insert(std::move(updated)); - ++disabled; - } - - LOG(DEBUG, - "{}() path '{}' advanced writeback epoch {} disabled segments {}", - __func__, path, next_epoch, disabled); -} - -/** - * Flush any pending write-mmap for `path` directly to the GekkoFS daemon. - * - * This is called at the start of every gkfs_pread so that a reader always - * sees data written through an mmap mapping that hasn't been munmap'd yet. - * Without this, the daemon has 0 bytes and reads return EOF. - */ -int -gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, - size_t length) { - if(offset < 0) { - errno = EINVAL; - return -1; - } +void* +gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, + off_t offset) { if(length == 0) { - return 0; - } - - const auto read_begin = static_cast(offset); - const auto read_end = read_begin + static_cast(length); - if(read_end < read_begin) { errno = EINVAL; - return -1; - } - - struct flush_candidate { - void* addr; - size_t len; - off_t off; - size_t seeded_bytes; - uint64_t flush_epoch; - uint64_t map_epoch; - void* map_base; - size_t map_len; - off_t map_off; - }; - - const auto prefer_candidate = [](const flush_candidate& newer, - const flush_candidate& older) { - // Read-triggered publication is best-effort. If multiple tracked mmaps - // target the same file range, prefer the newest mapping snapshot to - // avoid stale generations clobbering current data. - if(newer.map_epoch != older.map_epoch) { - return newer.map_epoch > older.map_epoch; - } - if(newer.seeded_bytes != older.seeded_bytes) { - return newer.seeded_bytes > older.seeded_bytes; - } - return newer.flush_epoch > older.flush_epoch; - }; - - // Publish deferred truncation before selecting mmap candidates. If we - // snapshot first, stale pre-truncate mappings can still be flushed after - // truncate publication. - if(gkfs_has_deferred_trunc(path) && - gkfs_publish_deferred_trunc_for_path(path) < 0) { - return -1; - } - - uint64_t active_path_epoch = 0; - std::vector candidates; - uint64_t newest_map_epoch = 0; - { - std::lock_guard lock(get_mmap_set_mutex()); - active_path_epoch = current_path_epoch_nolock(path); - LOG(DEBUG, "{}() path '{}' tracked segments {} read-range [{}..{})", - __func__, path, tracked_segments_for_path(path), read_begin, - read_end); - - for(const auto& entry : get_mmap_set()) { - if(entry.path != path || !should_flush_mapping(entry)) { - continue; - } - if(entry.path_epoch != active_path_epoch) { - continue; - } - - const auto entry_begin = static_cast(entry.offset); - const auto entry_end = - entry_begin + static_cast(entry.length); - if(entry_end <= read_begin || read_end <= entry_begin) { - continue; - } - - const auto flush_begin = std::max(entry_begin, read_begin); - const auto flush_end = std::min(entry_end, read_end); - const auto rel_begin = - static_cast(flush_begin - entry_begin); - const size_t len = static_cast(flush_end - flush_begin); - auto candidate = flush_candidate{ - addr_to_ptr(ptr_to_addr(entry.addr) + rel_begin), - len, - static_cast(flush_begin), - entry.seeded_bytes, - entry.flush_epoch, - entry.map_epoch, - entry.addr, - entry.length, - entry.offset}; - - bool merged = false; - for(auto& selected : candidates) { - if(selected.off == candidate.off && - selected.len == candidate.len) { - if(prefer_candidate(candidate, selected)) { - LOG(DEBUG, - "{}() replacing candidate path '{}' off {} len {} map_epoch {} -> {}", - __func__, path, candidate.off, candidate.len, - selected.map_epoch, candidate.map_epoch); - selected = candidate; - } - merged = true; - break; - } - } - if(!merged) { - newest_map_epoch = std::max(newest_map_epoch, - candidate.map_epoch); - candidates.push_back(std::move(candidate)); - } - } - } - - if(candidates.empty()) { - LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, 0); - return 0; - } - - if(newest_map_epoch != 0) { - const size_t before = candidates.size(); - candidates.erase(std::remove_if(candidates.begin(), candidates.end(), - [newest_map_epoch](const auto& c) { - return c.map_epoch != - newest_map_epoch; - }), - candidates.end()); - if(candidates.size() != before) { - // Read-triggered publication is advisory. When multiple mmap - // generations overlap the same path, only publish the newest one - // to avoid stale generations clobbering file offset 0. - LOG(DEBUG, - "{}() path '{}' dropping {} stale overlap candidates (keeping map_epoch {})", - __func__, path, before - candidates.size(), newest_map_epoch); - } + return MAP_FAILED; } - const long page_size = ::sysconf(_SC_PAGESIZE); - size_t flushed_segments = 0; - for(const auto& c : candidates) { - { - std::lock_guard lock(get_mmap_set_mutex()); - const auto& mmap_set = get_mmap_set(); - auto it = std::find_if( - mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { - return entry.path == path && entry.addr == c.map_base && - entry.length == c.map_len && - entry.offset == c.map_off; - }); - if(it == mmap_set.end() || !should_flush_mapping(*it)) { - LOG(DEBUG, - "{}() skipping stale/invalidated overlap path '{}' addr {} len {} off {}", - __func__, path, c.addr, c.len, c.off); - continue; - } - if(it->path_epoch != active_path_epoch) { - LOG(DEBUG, - "{}() skipping old-epoch overlap path '{}' addr {} len {} off {} entry-epoch {} active-epoch {}", - __func__, path, c.addr, c.len, c.off, it->path_epoch, - active_path_epoch); - continue; - } - } - - LOG(DEBUG, - "{}() flushing overlap path '{}' addr {} len {} file-off {} map_epoch {}", - __func__, path, c.addr, c.len, c.off, c.map_epoch); - - bool definitely_unmapped = false; - if(page_size > 0) { - const auto page_size_u = static_cast(page_size); - const auto start = ptr_to_addr(c.addr); - const auto end = start + c.len; - const auto aligned_start = start & ~(page_size_u - 1U); - const auto aligned_end = - (end + page_size_u - 1U) & ~(page_size_u - 1U); - unsigned char mincore_vec = 0; - for(auto p = aligned_start; p < aligned_end; p += page_size_u) { - if(::mincore(addr_to_ptr(p), static_cast(page_size_u), - &mincore_vec) != 0) { - if(errno == ENOMEM) { - definitely_unmapped = true; - } else { - LOG(DEBUG, - "{}() mincore indeterminate for path '{}' addr {} errno {}, continuing with self-mem copy", - __func__, path, addr_to_ptr(p), errno); - } - break; - } - } - } - - if(definitely_unmapped) { - std::lock_guard lock(get_mmap_set_mutex()); - auto& mmap_set = get_mmap_set(); - auto it = std::find_if( - mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { - return entry.path == path && entry.addr == c.map_base && - entry.length == c.map_len && - entry.offset == c.map_off; - }); - if(it != mmap_set.end()) { - LOG(WARNING, - "{}() dropping stale mmap segment path '{}' addr {} len {}", - __func__, path, c.addr, c.len); - mmap_set.erase(it); - } - continue; - } - - const int flush_rc = - flush_range_chunked(path, c.addr, c.len, c.off, - flush_mode::read_triggered, c.seeded_bytes); - LOG(DEBUG, - "{}() flush result path '{}' addr {} len {} off {} map_epoch {} rc {}", - __func__, path, c.addr, c.len, c.off, c.map_epoch, flush_rc); - if(flush_rc == k_flush_unavailable) { - std::lock_guard lock(get_mmap_set_mutex()); - auto& mmap_set = get_mmap_set(); - auto it = std::find_if( - mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { - return entry.path == path && entry.addr == c.map_base && - entry.length == c.map_len && - entry.offset == c.map_off; - }); - if(it != mmap_set.end()) { - LOG(DEBUG, - "{}() dropping unavailable overlap path '{}' addr {} len {} off {}", - __func__, path, c.addr, c.len, c.off); - mmap_set.erase(it); - } - continue; - } - if(flush_rc == k_flush_deferred) { - continue; - } - if(flush_rc < 0) { - LOG(WARNING, - "{}() chunked flush failed for path '{}' (addr {} len {} off {})", - __func__, path, c.addr, c.len, c.off); - errno = EIO; - return -1; - } - - { - std::lock_guard lock(get_mmap_set_mutex()); - auto& mmap_set = get_mmap_set(); - auto it = std::find_if( - mmap_set.begin(), mmap_set.end(), [&](const auto& entry) { - return entry.path == path && entry.addr == c.map_base && - entry.length == c.map_len && - entry.offset == c.map_off; - }); - if(it != mmap_set.end()) { - auto updated = *it; - const auto next_epoch = flush_epoch_counter().fetch_add(1) + 1; - updated.flush_epoch = next_epoch; - mmap_set.erase(it); - mmap_set.insert(std::move(updated)); - } - } - ++flushed_segments; + auto gkfs_fd = CTX->file_map()->get(fd); + if(!gkfs_fd) { + return ::mmap(addr, length, prot, flags, fd, offset); } - LOG(DEBUG, "{}() path '{}' flushed segments {}", __func__, path, - flushed_segments); - return 0; -} + std::string path = gkfs_fd->path(); + std::string key = fmt::format("{}:{}:{}", path, offset, length); -int -gkfs_mmap_flush_all_for_path(const std::string& path) { - std::vector entries; { - std::lock_guard lock(get_mmap_set_mutex()); - for(const auto& entry : get_mmap_set()) { - if(entry.path == path && should_flush_mapping(entry)) { - entries.push_back(entry); + std::lock_guard lock(mmap_mtx); + auto it = mmap_registry.find(key); + if(it != mmap_registry.end()) { + it->second.refcount++; + // Upgrade protections if needed (e.g., initial map was RO, now we + // want RW) + if((it->second.prot & prot) != prot) { + it->second.prot |= prot; + ::mprotect(it->second.addr, length, it->second.prot); } + return it->second.addr; } } - size_t flushed_segments = 0; - - for(const auto& entry : entries) { - const int flush_rc = - flush_range_chunked(path, entry.addr, entry.length, - entry.offset, flush_mode::explicit_sync); - LOG(DEBUG, - "{}() flush-all result path '{}' addr {} len {} off {} rc {}", - __func__, path, entry.addr, entry.length, entry.offset, flush_rc); - if(flush_rc == k_flush_unavailable) { - // Mapping is no longer readable, likely already unmapped. - continue; - } - if(flush_rc < 0) { - errno = EIO; - return -1; - } - ++flushed_segments; - } - - LOG(DEBUG, "{}() path '{}' flushed all segments {}", __func__, path, - flushed_segments); - return 0; -} -bool -gkfs_mmap_has_active_path(const std::string& path) { - std::lock_guard lock(get_mmap_set_mutex()); - const auto& mmap_set = get_mmap_set(); - return std::any_of( - mmap_set.begin(), mmap_set.end(), - [&path](const auto& entry) { return entry.path == path; }); -} - -void* -gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, - off_t offset) { - if(length == 0) { - errno = EINVAL; + // Allocate new anonymous, SHARED mapping so child processes inherit it if + // needed (MAP_SHARED | MAP_ANONYMOUS preserves it across fork if possible) + int map_flags = + MAP_ANONYMOUS | (flags & (MAP_SHARED | MAP_PRIVATE | MAP_FIXED)); + void* ptr = ::mmap(addr, length, prot | PROT_READ | PROT_WRITE, map_flags, + -1, 0); + if(ptr == MAP_FAILED) { return MAP_FAILED; } - // Capture the file path *now*, while the fd is still open. - // s3d.x (and many Fortran programs) close the fd immediately after mmap; - // storing the path lets munmap/msync flush data back even when fd is gone. - auto gkfs_fd = CTX->file_map()->get(fd); - if(!gkfs_fd) { - // Descriptor tracking can race with concurrent close/reuse in - // syscall-intercepted multi-threaded runtimes (e.g., MPI internals). - // In that case, gracefully defer to the kernel mmap path. - return ::mmap(addr, length, prot, flags, fd, offset); - } - std::string path = gkfs_fd->path(); - // Persist the open-time write intent with the mapping so read-triggered - // flushes and close-time flushes follow exactly the same policy. - const bool write_intent = gkfs_has_explicit_mmap_write_intent(*gkfs_fd); + // fault memory for RDMA pinning + std::memset(ptr, 0, length); - void* ptr = ::mmap(addr, length, prot | PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if(ptr == MAP_FAILED) { + // Call pread WITHOUT the lock so Mercury network threads do not deadlock + const auto seeded = gkfs_pread(fd, ptr, length, offset); + if(seeded < 0) { + ::munmap(ptr, length); return MAP_FAILED; } - // Register mapping under lock so concurrent threads don't race on mmap_set - { - const auto map_epoch = map_epoch_counter().fetch_add(1) + 1; - std::lock_guard lock(get_mmap_set_mutex()); - const auto path_epoch = current_path_epoch_nolock(path); - get_mmap_set().insert(mmap_entry{ptr, fd, path, length, offset, prot, - flags, write_intent, 0, 0, map_epoch, - path_epoch}); + // Restrict protections if NOT requested to have write + if((prot & PROT_WRITE) == 0 && prot != (prot | PROT_READ | PROT_WRITE)) { + ::mprotect(ptr, length, prot); } - const auto seeded = gkfs::syscall::gkfs_pread(fd, ptr, length, offset); - if(seeded > 0) { - std::lock_guard lock(get_mmap_set_mutex()); - auto it = std::find_if(get_mmap_set().begin(), get_mmap_set().end(), - [ptr](const auto& e) { return e.addr == ptr; }); - if(it != get_mmap_set().end()) { - auto updated = *it; - get_mmap_set().erase(it); - updated.seeded_bytes = static_cast(seeded); - get_mmap_set().insert(std::move(updated)); + // Re-acquire lock to insert mapping + { + std::lock_guard lock(mmap_mtx); + // Ensure no other thread raced to map the same region while we were + // reading + auto it = mmap_registry.find(key); + if(it != mmap_registry.end()) { + ::munmap(ptr, length); + it->second.refcount++; + if((it->second.prot & prot) != prot) { + it->second.prot |= prot; + ::mprotect(it->second.addr, length, it->second.prot); + } + return it->second.addr; } - } - if(!(prot & PROT_WRITE) && prot != (prot | PROT_READ | PROT_WRITE)) { - ::mprotect(ptr, length, prot); + mmap_registry[key] = {ptr, length, 1, path, offset, prot, flags}; + addr_to_key[ptr] = key; } return ptr; } int -// cppcheck-suppress constParameterPointer gkfs_msync(void* addr, size_t length, int flags) { - (void) flags; if(length == 0) { - return -1; - } - - struct flush_range { - void* addr; - size_t length; - off_t offset; - int fd; - std::string path; - }; - - const auto sync_begin = ptr_to_addr(addr); - const auto sync_end = sync_begin + length; - if(sync_end < sync_begin) { errno = EINVAL; return -1; } - std::vector flush_ranges; - std::unique_lock lock(get_mmap_set_mutex()); - for(const auto& entry : get_mmap_set()) { - if(!should_flush_mapping(entry)) { - continue; - } - const auto entry_begin = ptr_to_addr(entry.addr); - const auto entry_end = entry_begin + entry.length; - if(!overlaps(entry_begin, entry_end, sync_begin, sync_end)) { - continue; - } + std::string write_path; + void* write_addr = nullptr; + size_t write_len = 0; + off_t write_off = 0; + bool do_writeback = false; - const auto flush_begin = std::max(entry_begin, sync_begin); - const auto flush_end = std::min(entry_end, sync_end); - const auto relative = flush_begin - entry_begin; - flush_ranges.push_back( - flush_range{addr_to_ptr(flush_begin), - static_cast(flush_end - flush_begin), - static_cast(entry.offset + relative), - entry.fd, entry.path}); - } - if(flush_ranges.empty()) { - return -1; + { + std::lock_guard lock(mmap_mtx); + auto it_key = addr_to_key.find(addr); + if(it_key != addr_to_key.end()) { + std::string key = it_key->second; + auto& entry = mmap_registry[key]; + + if((entry.prot & PROT_WRITE) && (entry.flags & MAP_SHARED)) { + do_writeback = true; + write_path = entry.path; + write_addr = entry.addr; + write_len = entry.length; + write_off = entry.offset; + } + } else { + return 0; // Not tracked + } } - lock.unlock(); - for(const auto& range : flush_ranges) { - (void) range.fd; - if(flush_range_chunked(range.path, range.addr, range.length, - range.offset, flush_mode::explicit_sync) < 0) { - return -1; + // Perform writeback without holding registry lock + if(do_writeback) { + int fd = gkfs_open(write_path, 0, O_WRONLY); + if(fd >= 0) { + gkfs_pwrite(fd, write_addr, write_len, write_off); + gkfs_close(fd); + } else { + LOG(ERROR, "{}() failed to open file for msync writeback: {}", + __func__, write_path); } } - return 0; -} + return 1; +} int gkfs_munmap(void* addr, size_t length) { @@ -873,104 +236,56 @@ gkfs_munmap(void* addr, size_t length) { return -1; } - struct flush_range { - void* addr; - size_t length; - off_t offset; - int fd; - std::string path; - }; + std::string write_path; + void* write_addr = nullptr; + size_t write_len = 0; + off_t write_off = 0; + bool do_writeback = false; - const auto unmap_begin = ptr_to_addr(addr); - const auto unmap_end = unmap_begin + length; - if(unmap_end < unmap_begin) { - errno = EINVAL; - return -1; - } - - std::vector flush_ranges; - std::vector new_entries; - - std::unique_lock lock(get_mmap_set_mutex()); - bool handled = false; - for(const auto& entry : get_mmap_set()) { - const auto entry_begin = ptr_to_addr(entry.addr); - const auto entry_end = entry_begin + entry.length; - - if(!overlaps(entry_begin, entry_end, unmap_begin, unmap_end)) { - new_entries.push_back(entry); - continue; - } - - handled = true; - const auto cut_begin = std::max(entry_begin, unmap_begin); - const auto cut_end = std::min(entry_end, unmap_end); - - LOG(DEBUG, "{}() path '{}' unmap overlap entry [{}..{}) cut [{}..{})", - __func__, entry.path, static_cast(entry_begin), - static_cast(entry_end), - static_cast(cut_begin), - static_cast(cut_end)); - - if(should_flush_mapping(entry) && cut_end > cut_begin) { - const auto rel = cut_begin - entry_begin; - flush_ranges.push_back( - flush_range{addr_to_ptr(cut_begin), - static_cast(cut_end - cut_begin), - static_cast(entry.offset + rel), - entry.fd, entry.path}); - } - - if(entry_begin < cut_begin) { - auto left = entry; - left.addr = addr_to_ptr(entry_begin); - left.length = static_cast(cut_begin - entry_begin); - left.offset = entry.offset; - new_entries.push_back(std::move(left)); - } + { + std::lock_guard lock(mmap_mtx); + auto it_key = addr_to_key.find(addr); + if(it_key != addr_to_key.end()) { + std::string key = it_key->second; + auto& entry = mmap_registry.at(key); + + entry.refcount--; + if(entry.refcount == 0) { + if((entry.prot & PROT_WRITE) && (entry.flags & MAP_SHARED)) { + do_writeback = true; + write_path = entry.path; + write_addr = entry.addr; + write_len = entry.length; + write_off = entry.offset; + } - if(cut_end < entry_end) { - auto right = entry; - right.addr = addr_to_ptr(cut_end); - right.length = static_cast(entry_end - cut_end); - right.offset = - static_cast(entry.offset + (cut_end - entry_begin)); - new_entries.push_back(std::move(right)); + mmap_registry.erase(key); + addr_to_key.erase(it_key); + } else { + return 1; // Handled successfully, refcount simply decremented + } + } else { + return 0; // Not tracked } } - if(!handled) { - return -1; - } - - // Publish post-unmap interval state before performing expensive I/O/RPC. - // This avoids holding the mmap tracking lock while lower layers may trigger - // their own mmap/munmap activity. - get_mmap_set().clear(); - for(const auto& entry : new_entries) { - get_mmap_set().insert(entry); - } - lock.unlock(); - - for(const auto& range : flush_ranges) { - (void) range.fd; - if(flush_range_chunked(range.path, range.addr, range.length, - range.offset, flush_mode::explicit_sync) < 0) { - return -1; + // Perform writeback and ::munmap exactly ONCE when refcount hits 0 without + // holding the lock + if(do_writeback) { + int fd = gkfs_open(write_path, 0, O_WRONLY); + if(fd >= 0) { + gkfs_pwrite(fd, write_addr, write_len, write_off); + gkfs_close(fd); + } else { + LOG(ERROR, "{}() failed to open file for munmap writeback: {}", + __func__, write_path); } } - if(::munmap(addr, length) != 0) { - return -1; - } - - LOG(DEBUG, "{}() unmap [{}..{}) updated tracked segments {}", __func__, - static_cast(unmap_begin), - static_cast(unmap_end), new_entries.size()); - return 0; + ::munmap(addr, length); + return 1; } - int gkfs_utimensat(const std::string& path, const struct timespec times[2]) { // Check if file exists diff --git a/src/client/gkfs_libc.cpp b/src/client/gkfs_libc.cpp index 05d7444c0..ad1c443bd 100644 --- a/src/client/gkfs_libc.cpp +++ b/src/client/gkfs_libc.cpp @@ -80,12 +80,7 @@ #include #include -// Forward declaration: defined in gkfs_functions.cpp -namespace gkfs::syscall { -int -gkfs_mmap_flush_for_path(const std::string& path, off64_t offset, - size_t length); -} // namespace gkfs::syscall +// Forward declaration removed //========================= Global Atomics and Variables @@ -954,32 +949,6 @@ ssize_t read(int fd, void* buf, size_t nbyte) { gkfs_init_routine_placeholder(); DEBUG_INFO("read(fd={}, nbyte={})", fd, nbyte); - if(CTX->interception_enabled() && is_gkfs_fd(fd)) { - auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) { - constexpr int k_flush_retry_count = 8; - constexpr auto k_flush_retry_sleep = std::chrono::milliseconds(2); - int flush_rc = -1; - for(int attempt = 0; attempt <= k_flush_retry_count; ++attempt) { - flush_rc = gkfs::syscall::gkfs_mmap_flush_for_path( - gkfs_fd->path(), static_cast(gkfs_fd->pos()), - nbyte); - if(flush_rc == 0 || errno != EAGAIN) { - break; - } - std::this_thread::sleep_for(k_flush_retry_sleep); - } - if(flush_rc < 0) { - if(errno == EAGAIN) { - // Avoid surfacing transient mmap flush races to userspace - // reads (Python may translate this to None reads). - errno = 0; - } else { - return -1; - } - } - } - } GKFS_OPERATION(read, fd, buf, nbyte); GKFS_FALLBACK(read, fd, buf, nbyte); } @@ -995,29 +964,6 @@ ssize_t pread(int fd, void* buf, size_t count, off_t offset) { gkfs_init_routine_placeholder(); DEBUG_INFO("pread(fd={}, count={}, offset={})", fd, count, offset); - if(CTX->interception_enabled() && is_gkfs_fd(fd)) { - auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) { - constexpr int k_flush_retry_count = 8; - constexpr auto k_flush_retry_sleep = std::chrono::milliseconds(2); - int flush_rc = -1; - for(int attempt = 0; attempt <= k_flush_retry_count; ++attempt) { - flush_rc = gkfs::syscall::gkfs_mmap_flush_for_path( - gkfs_fd->path(), static_cast(offset), count); - if(flush_rc == 0 || errno != EAGAIN) { - break; - } - std::this_thread::sleep_for(k_flush_retry_sleep); - } - if(flush_rc < 0) { - if(errno == EAGAIN) { - errno = 0; - } else { - return -1; - } - } - } - } GKFS_OPERATION(pread, fd, buf, count, offset); GKFS_FALLBACK(pread, fd, buf, count, offset); } @@ -1033,29 +979,6 @@ ssize_t pread64(int fd, void* buf, size_t count, off64_t offset) { gkfs_init_routine_placeholder(); DEBUG_INFO("pread64(fd={}, count={}, offset={})", fd, count, offset); - if(CTX->interception_enabled() && is_gkfs_fd(fd)) { - auto gkfs_fd = CTX->file_map()->get(fd); - if(gkfs_fd) { - constexpr int k_flush_retry_count = 8; - constexpr auto k_flush_retry_sleep = std::chrono::milliseconds(2); - int flush_rc = -1; - for(int attempt = 0; attempt <= k_flush_retry_count; ++attempt) { - flush_rc = gkfs::syscall::gkfs_mmap_flush_for_path( - gkfs_fd->path(), offset, count); - if(flush_rc == 0 || errno != EAGAIN) { - break; - } - std::this_thread::sleep_for(k_flush_retry_sleep); - } - if(flush_rc < 0) { - if(errno == EAGAIN) { - errno = 0; - } else { - return -1; - } - } - } - } GKFS_OPERATION(pread, fd, buf, count, offset); // GekkoFS pread likely handles large offsets GKFS_FALLBACK(pread64, fd, buf, count, offset); diff --git a/src/client/gkfs_metadata.cpp b/src/client/gkfs_metadata.cpp index fb89d317a..4530885aa 100644 --- a/src/client/gkfs_metadata.cpp +++ b/src/client/gkfs_metadata.cpp @@ -123,106 +123,10 @@ check_parent_dir(const std::string& path) { return 0; } -std::mutex& -deferred_trunc_mutex() { - static std::mutex mtx; - return mtx; -} - -std::unordered_map& -deferred_trunc_paths() { - static std::unordered_map paths; - return paths; -} - -bool -should_defer_trunc_for_consistency(const std::string& path, int flags) { - if(!(flags & O_TRUNC) || !((flags & O_WRONLY) || (flags & O_RDWR))) { - return false; - } - // Keep truncate publication ordered with mmap-backed publication to avoid - // exposing mixed old/new views to concurrent readers. - return gkfs::syscall::gkfs_mmap_has_active_path(path) || - gkfs::syscall::gkfs_has_deferred_trunc(path); -} - } // namespace namespace gkfs::syscall { -void -gkfs_register_deferred_trunc(const std::string& path) { - std::lock_guard lock(deferred_trunc_mutex()); - auto& deferred = deferred_trunc_paths(); - deferred[path]++; - // A deferred truncate starts a new publication phase for this pathname. - // Prevent pre-truncate mappings from clobbering offset 0 while the new - // writer is still publishing. - gkfs_mmap_advance_writeback_epoch_for_path(path); -} - -void -gkfs_unregister_deferred_trunc(const std::string& path) { - std::lock_guard lock(deferred_trunc_mutex()); - auto& deferred = deferred_trunc_paths(); - auto it = deferred.find(path); - if(it == deferred.end()) { - return; - } - if(it->second <= 1) { - deferred.erase(it); - } else { - --(it->second); - } -} - -bool -gkfs_has_deferred_trunc(const std::string& path) { - std::lock_guard lock(deferred_trunc_mutex()); - const auto& deferred = deferred_trunc_paths(); - return deferred.find(path) != deferred.end(); -} - -int -gkfs_publish_deferred_trunc(gkfs::filemap::OpenFile& file) { - if(!file.get_flag(gkfs::filemap::OpenFile_flags::trunc_pending)) { - return 0; - } - const auto path = file.path(); - if(!gkfs_has_deferred_trunc(path)) { - // Truncation was already published via another publication path - // (e.g., mmap read-triggered flush). Avoid truncating again. - file.set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, false); - return 0; - } - auto md = gkfs::utils::get_metadata(path); - if(md && gkfs_truncate(path, md->size(), 0)) { - return -1; - } - // Mappings created while truncate was deferred can still carry pre-truncate - // bytes. Prevent them from publishing stale data after truncate is visible. - gkfs_mmap_advance_writeback_epoch_for_path(path); - file.set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, false); - gkfs_unregister_deferred_trunc(path); - return 0; -} - -int -gkfs_publish_deferred_trunc_for_path(const std::string& path) { - if(!gkfs_has_deferred_trunc(path)) { - return 0; - } - auto md = gkfs::utils::get_metadata(path); - if(md && gkfs_truncate(path, md->size(), 0)) { - return -1; - } - // Keep read-triggered mmap publication from re-introducing pre-truncate - // contents right after truncate publication. - gkfs_mmap_advance_writeback_epoch_for_path(path); - gkfs_unregister_deferred_trunc(path); - return 0; -} - /** * @brief generate_lock_file @@ -318,19 +222,9 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { file->mode(mode); file->set_flag(gkfs::filemap::OpenFile_flags::creation_pending, true); - if(should_defer_trunc_for_consistency(path, flags)) { - file->set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, - true); - file->inline_data(""); - file->inline_data_size(0); - gkfs_register_deferred_trunc(path); - } if(CTX->protect_files_generator()) { generate_lock_file(path, true); } - // Re-creating a path starts a fresh publication phase. Drop stale - // mmap writeback candidates from a previous file incarnation. - gkfs_mmap_advance_writeback_epoch_for_path(path); return fd; } @@ -371,9 +265,6 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if(CTX->protect_files_generator()) { generate_lock_file(path, true); } - // Successful create on an existing pathname (after delete/reuse) - // must not allow stale mmap segments from an older incarnation. - gkfs_mmap_advance_writeback_epoch_for_path(path); // file was successfully created. Add to filemap return fd; } @@ -432,31 +323,16 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(should_defer_trunc_for_consistency(new_path, flags)) { - LOG(DEBUG, - "{}() deferring O_TRUNC for consistency path '{}'", - __func__, new_path); - } else { - if(gkfs_truncate(new_path, md.size(), 0)) { - LOG(ERROR, "Error truncating file"); - return -1; - } - md.size(0); - md.inline_data(""); + if(gkfs_truncate(new_path, md.size(), 0)) { + LOG(ERROR, "Error truncating file"); + return -1; } + md.size(0); + md.inline_data(""); } // RENAMED OR SYMLINK NOT PROTECTED auto file = std::make_shared(new_path, flags); - if((flags & O_TRUNC) && - ((flags & O_RDWR) || (flags & O_WRONLY)) && - should_defer_trunc_for_consistency(new_path, flags)) { - file->set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, - true); - file->inline_data(""); - file->inline_data_size(0); - gkfs_register_deferred_trunc(new_path); - } auto fd = CTX->file_map()->add(file); return fd; } @@ -470,17 +346,12 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { assert(S_ISREG(md.mode())); if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { - if(should_defer_trunc_for_consistency(path, flags)) { - LOG(DEBUG, "{}() deferring O_TRUNC for consistency path '{}'", - __func__, path); - } else { - if(gkfs_truncate(path, md.size(), 0)) { - LOG(ERROR, "Error truncating file"); - return -1; - } - md.size(0); - md.inline_data(""); + if(gkfs_truncate(path, md.size(), 0)) { + LOG(ERROR, "Error truncating file"); + return -1; } + md.size(0); + md.inline_data(""); } auto file = std::make_shared(path, flags); if(gkfs::config::metadata::read_inline_prefetch and @@ -488,13 +359,6 @@ gkfs_open(const std::string& path, mode_t mode, int flags) { file->inline_data(md.inline_data()); file->inline_data_size(md.size()); // Store the actual file size } - if((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY)) && - should_defer_trunc_for_consistency(path, flags)) { - file->set_flag(gkfs::filemap::OpenFile_flags::trunc_pending, true); - file->inline_data(""); - file->inline_data_size(0); - gkfs_register_deferred_trunc(path); - } auto fd = CTX->file_map()->add(file); @@ -655,10 +519,7 @@ gkfs_remove(const std::string& path) { errno = err; return -1; } - // A removed pathname may be re-created quickly; prevent stale mappings of - // the previous file incarnation from publishing data into the new one. - gkfs_mmap_advance_writeback_epoch_for_path(path); - gkfs_unregister_deferred_trunc(path); + return 0; } @@ -1913,22 +1774,6 @@ gkfs_close(unsigned int fd) { auto file = CTX->file_map()->get(fd); if(file) { const auto path = file->path(); - if(gkfs_publish_deferred_trunc(*file) < 0) { - return -1; - } - if(file->type() == gkfs::filemap::FileType::regular) { - // Reuse the same write-intent predicate as mmap registration to - // keep close-time and read-triggered flush behavior consistent. - const bool should_close_flush = - gkfs_has_explicit_mmap_write_intent(*file); - if(should_close_flush) { - auto flush_err = gkfs_mmap_flush_all_for_path(path); - if(flush_err < 0) { - LOG(WARNING, "{}() mmap flush failed for path '{}'", - __func__, path); - } - } - } if(file->get_flag(gkfs::filemap::OpenFile_flags::creation_pending)) { gkfs_create(path, file->mode()); diff --git a/src/client/hooks.cpp b/src/client/hooks.cpp index fa44e60a1..b6f37e798 100644 --- a/src/client/hooks.cpp +++ b/src/client/hooks.cpp @@ -1518,9 +1518,12 @@ hook_munmap(void* addr, size_t length) { LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__, fmt::ptr(addr), length); - auto res = gkfs::syscall::gkfs_munmap(addr, length); - if(res == 0) - return res; + int res = gkfs::syscall::gkfs_munmap(addr, length); + if(res == 1) { + return 0; + } else if(res == -1) { + return -1; + } return syscall_no_intercept_wrapper(SYS_munmap, addr, length); } @@ -1529,9 +1532,12 @@ hook_msync(void* addr, size_t length, int flags) { LOG(DEBUG, "{}() called with addr '{}' length '{}' flags '{}'", __func__, fmt::ptr(addr), length, flags); - auto res = gkfs::syscall::gkfs_msync(addr, length, flags); - if(res == 0) - return res; + int res = gkfs::syscall::gkfs_msync(addr, length, flags); + if(res == 1) { + return 0; + } else if(res == -1) { + return -1; + } return syscall_no_intercept_wrapper(SYS_msync, addr, length, flags); } -- GitLab From 22faef6152d3982b71bcb830f2f8cde86688520d Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 11:27:22 +0100 Subject: [PATCH 65/68] check functions --- src/client/gkfs_functions.cpp | 77 ++++++++--------------------------- 1 file changed, 18 insertions(+), 59 deletions(-) diff --git a/src/client/gkfs_functions.cpp b/src/client/gkfs_functions.cpp index 7a358bc06..8df851d28 100644 --- a/src/client/gkfs_functions.cpp +++ b/src/client/gkfs_functions.cpp @@ -90,7 +90,6 @@ namespace { struct MmapEntry { void* addr; size_t length; - int refcount; std::string path; off_t offset; int prot; @@ -98,8 +97,7 @@ struct MmapEntry { }; std::mutex mmap_mtx; -std::unordered_map mmap_registry; -std::unordered_map addr_to_key; +std::unordered_map mmap_registry; } // namespace @@ -117,22 +115,6 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, } std::string path = gkfs_fd->path(); - std::string key = fmt::format("{}:{}:{}", path, offset, length); - - { - std::lock_guard lock(mmap_mtx); - auto it = mmap_registry.find(key); - if(it != mmap_registry.end()) { - it->second.refcount++; - // Upgrade protections if needed (e.g., initial map was RO, now we - // want RW) - if((it->second.prot & prot) != prot) { - it->second.prot |= prot; - ::mprotect(it->second.addr, length, it->second.prot); - } - return it->second.addr; - } - } // Allocate new anonymous, SHARED mapping so child processes inherit it if // needed (MAP_SHARED | MAP_ANONYMOUS preserves it across fork if possible) @@ -162,21 +144,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd, // Re-acquire lock to insert mapping { std::lock_guard lock(mmap_mtx); - // Ensure no other thread raced to map the same region while we were - // reading - auto it = mmap_registry.find(key); - if(it != mmap_registry.end()) { - ::munmap(ptr, length); - it->second.refcount++; - if((it->second.prot & prot) != prot) { - it->second.prot |= prot; - ::mprotect(it->second.addr, length, it->second.prot); - } - return it->second.addr; - } - - mmap_registry[key] = {ptr, length, 1, path, offset, prot, flags}; - addr_to_key[ptr] = key; + mmap_registry[ptr] = {ptr, length, path, offset, prot, flags}; } return ptr; @@ -197,10 +165,9 @@ gkfs_msync(void* addr, size_t length, int flags) { { std::lock_guard lock(mmap_mtx); - auto it_key = addr_to_key.find(addr); - if(it_key != addr_to_key.end()) { - std::string key = it_key->second; - auto& entry = mmap_registry[key]; + auto it = mmap_registry.find(addr); + if(it != mmap_registry.end()) { + auto& entry = it->second; if((entry.prot & PROT_WRITE) && (entry.flags & MAP_SHARED)) { do_writeback = true; @@ -244,33 +211,25 @@ gkfs_munmap(void* addr, size_t length) { { std::lock_guard lock(mmap_mtx); - auto it_key = addr_to_key.find(addr); - if(it_key != addr_to_key.end()) { - std::string key = it_key->second; - auto& entry = mmap_registry.at(key); - - entry.refcount--; - if(entry.refcount == 0) { - if((entry.prot & PROT_WRITE) && (entry.flags & MAP_SHARED)) { - do_writeback = true; - write_path = entry.path; - write_addr = entry.addr; - write_len = entry.length; - write_off = entry.offset; - } - - mmap_registry.erase(key); - addr_to_key.erase(it_key); - } else { - return 1; // Handled successfully, refcount simply decremented + auto it = mmap_registry.find(addr); + if(it != mmap_registry.end()) { + auto& entry = it->second; + + if((entry.prot & PROT_WRITE) && (entry.flags & MAP_SHARED)) { + do_writeback = true; + write_path = entry.path; + write_addr = entry.addr; + write_len = entry.length; + write_off = entry.offset; } + + mmap_registry.erase(it); } else { return 0; // Not tracked } } - // Perform writeback and ::munmap exactly ONCE when refcount hits 0 without - // holding the lock + // Perform writeback exactly ONCE without holding the lock if(do_writeback) { int fd = gkfs_open(write_path, 0, O_WRONLY); if(fd >= 0) { -- GitLab From aa3bff318d9ace027069ec0f6daf350f664bb763 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 13:07:13 +0100 Subject: [PATCH 66/68] integrate flex test --- tests/apps/CMakeLists.txt | 2 + tests/apps/flex-mmap-local.sh | 73 +++++++++ tests/apps/flex-mmap-repro.py | 280 ++++++++++++++++++++++++++++++++++ 3 files changed, 355 insertions(+) create mode 100755 tests/apps/flex-mmap-local.sh create mode 100755 tests/apps/flex-mmap-repro.py diff --git a/tests/apps/CMakeLists.txt b/tests/apps/CMakeLists.txt index 83fefaeaf..6d73d88b2 100644 --- a/tests/apps/CMakeLists.txt +++ b/tests/apps/CMakeLists.txt @@ -62,6 +62,8 @@ gekko_add_test(wacomm wacomm.sh) gekko_add_test(lockfile lockfile.sh) +gekko_add_test(flex_mmap_local flex-mmap-local.sh) + # --- Installation of Test Scripts --- if(GKFS_INSTALL_TESTS) diff --git a/tests/apps/flex-mmap-local.sh b/tests/apps/flex-mmap-local.sh new file mode 100755 index 000000000..e398550bf --- /dev/null +++ b/tests/apps/flex-mmap-local.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +set -euo pipefail + +export IO=${IO:-/builds/gitlab/hpc/gekkofs/gkfs/install} +export GKFS=${GKFS:-$IO/lib/libgkfs_intercept.so} +export GKFS_LIBC=${GKFS_LIBC:-$IO/lib/libgkfs_libc_intercept.so} +export DAEMON=${DAEMON:-$IO/bin/gkfs_daemon} + +GKROOT="${GKROOT:-$IO/tests/app/flex-mmap-repro/root}" +MNT="${MNT:-$IO/tests/app/flex-mmap-repro/mnt}" +LOG_DIR="${LOG_DIR:-$IO/tests/app/flex-mmap-repro/logs}" + +GKFS_PRELOAD="${GKFS_PRELOAD:-$GKFS_LIBC}" +CLIENT_LOG_LEVEL="${CLIENT_LOG_LEVEL:-debug}" +LOOPS="${LOOPS:-200}" +THREADS="${THREADS:-2}" +MB="${MB:-32}" +PYTHON_BIN="${PYTHON_BIN:-python3}" +COPY_METHOD="${COPY_METHOD:-shutil}" +PATH_MODE="${PATH_MODE:-shared}" +HOLD_MS="${HOLD_MS:-1}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +mkdir -p "$GKROOT" "$MNT" "$LOG_DIR" +rm -rf "$LOG_DIR"/* +rm -f gkfs_hosts.txt + +cleanup() { + pkill -9 gkfs_daemon >/dev/null 2>&1 || true +} +trap cleanup EXIT + +cleanup +sleep 1 +rm -rf "$GKROOT"/* "$MNT"/* + +$DAEMON -r "$GKROOT" -m "$MNT" \ + >"$LOG_DIR/daemon.out" 2>"$LOG_DIR/daemon.err" & +sleep 4 + +if ! pgrep -x gkfs_daemon >/dev/null 2>&1; then + echo "gkfs_daemon failed to start. See $LOG_DIR/daemon.err" + exit 1 +fi + +set +e +LD_PRELOAD="$GKFS_PRELOAD" \ +LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-$IO/lib}" \ +LIBGKFS_LOG="$CLIENT_LOG_LEVEL" \ +LIBGKFS_LOG_OUTPUT="$LOG_DIR/client.log" \ +"$PYTHON_BIN" -u "${SCRIPT_DIR}/flex-mmap-repro.py" \ + --mount "$MNT" \ + --loops "$LOOPS" \ + --threads "$THREADS" \ + --mb "$MB" \ + --copy-method "$COPY_METHOD" \ + --path-mode "$PATH_MODE" \ + --hold-ms "$HOLD_MS" \ + >"$LOG_DIR/repro.out" 2>"$LOG_DIR/repro.err" +RC=$? +set -e + +echo "LOG_DIR=$LOG_DIR" +echo "REPRO_RC=$RC" +if [[ $RC -eq 0 ]]; then + echo "Flex mmap repro PASS" +else + echo "Flex mmap repro FAIL" +fi + +exit $RC diff --git a/tests/apps/flex-mmap-repro.py b/tests/apps/flex-mmap-repro.py new file mode 100755 index 000000000..12374ca24 --- /dev/null +++ b/tests/apps/flex-mmap-repro.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +"""CPU-only repro for FlexLLM-style NumPy memmap header corruption on GKFS. + +This intentionally exercises: +1) copy a valid .npy into GKFS offload path, +2) map file with np.lib.format.open_memmap(), +3) trigger read() from offset 0 while mmap is active, +4) reopen memmap and verify NumPy magic stays valid. +""" + +from __future__ import annotations + +import argparse +import os +import queue +import random +import shutil +import sys +import tempfile +import threading +import time +from pathlib import Path + +import numpy as np + + +NUMPY_MAGIC = b"\x93NUMPY" + + +def build_source_file(src_file: Path, mb: int) -> None: + elems = (mb * 1024 * 1024) // 4 + arr = np.arange(elems, dtype=np.float32) + np.save(src_file, arr) + + +def check_magic(path: Path) -> tuple[bool, bytes]: + with open(path, "rb") as f: + head = f.read(8) + return head.startswith(NUMPY_MAGIC), head + + +def copy_via_rw_loop(src: Path, dst: Path) -> None: + """Deterministic copy path: avoid platform fast-copy syscalls.""" + # 4MB chunks + chk = 4 * 1024 * 1024 + with open(src, "rb") as fin, open(dst, "wb") as fout: + while data := fin.read(chk): + fout.write(data) + fout.flush() + + +def copy_via_mmap_loop(src: Path, dst: Path) -> None: + src_arr = np.lib.format.open_memmap(src, mode="c") + dst_arr = np.lib.format.open_memmap(dst, mode="w+", shape=src_arr.shape, dtype=src_arr.dtype) + dst_arr[:] = src_arr[:] + dst_arr.flush() + del dst_arr + del src_arr + + +def exercise_mapping(tid: int, iter_idx: int, dst: Path, + stop_event: threading.Event, errs: list[str], + lock: threading.Lock, hold_ms: int) -> bool: + try: + mm = np.lib.format.open_memmap(dst, mode="r+") + except Exception as exc: # noqa: BLE001 + with lock: + errs.append(f"thread {tid} iter {iter_idx} open_memmap(r+) failed: {exc}") + stop_event.set() + return False + + # read-triggered flush stimulus while mapping is alive + fd = os.open(dst, os.O_RDONLY) + try: + os.lseek(fd, 0, os.SEEK_SET) + _ = os.read(fd, 512 * 1024) + finally: + os.close(fd) + + _ = float(mm[0]) + if hold_ms > 0: + time.sleep((hold_ms / 1000.0) * (1.0 + random.random())) + del mm + + ok, head = check_magic(dst) + if not ok: + with lock: + errs.append( + f"thread {tid} iter {iter_idx} bad magic raw={head!r} file={dst}" + ) + stop_event.set() + return False + + try: + mm2 = np.lib.format.open_memmap(dst, mode="r") + _ = float(mm2[0]) + del mm2 + except Exception as exc: # noqa: BLE001 + with lock: + errs.append(f"thread {tid} iter {iter_idx} open_memmap(r) failed: {exc}") + stop_event.set() + return False + + return True + + +def contention_worker(tid: int, mount_offload: Path, src_file: Path, loops: int, + stop_event: threading.Event, errs: list[str], + lock: threading.Lock, copy_method: str, path_mode: str, + hold_ms: int) -> None: + if path_mode == "shared": + # Match FlexLLM offload naming/lifecycle: workers contend on the same + # rotating temp files. + local_paths = [mount_offload / "t_0", mount_offload / "t_1"] + else: + local_paths = [mount_offload / f"t_{tid}_0.npy", + mount_offload / f"t_{tid}_1.npy"] + + for i in range(loops): + if stop_event.is_set(): + return + # Keep workers intentionally unsynchronized over shared paths to expose + # mmap+truncate+read publication races. + dst = local_paths[(i + tid) % 2] + if copy_method == "shutil": + shutil.copyfile(src_file, dst) + else: + copy_via_rw_loop(src_file, dst) + if not exercise_mapping(tid, i, dst, stop_event, errs, lock, hold_ms): + return + + +def pipeline_producer(src_file: Path, loops: int, paths: list[Path], + copy_method: str, stop_event: threading.Event, + errs: list[str], lock: threading.Lock, + work_q: queue.Queue[tuple[int, Path] | None], + path_slots: dict[Path, threading.Semaphore], + consumers: int) -> None: + for i in range(loops): + if stop_event.is_set(): + break + dst = paths[i % len(paths)] + slot = path_slots[dst] + acquired = False + try: + slot.acquire() + acquired = True + if copy_method == "shutil": + shutil.copyfile(src_file, dst) + elif copy_method == "mmap": + copy_via_mmap_loop(src_file, dst) + else: + copy_via_rw_loop(src_file, dst) + work_q.put((i, dst)) + acquired = False + except Exception as exc: # noqa: BLE001 + with lock: + errs.append(f"producer iter {i} copy failed for {dst}: {exc}") + stop_event.set() + finally: + if acquired: + slot.release() + + for _ in range(consumers): + work_q.put(None) + + +def pipeline_consumer(tid: int, stop_event: threading.Event, errs: list[str], + lock: threading.Lock, + work_q: queue.Queue[tuple[int, Path] | None], + path_slots: dict[Path, threading.Semaphore], + hold_ms: int) -> None: + while True: + item = work_q.get() + if item is None: + work_q.task_done() + return + iter_idx, dst = item + try: + if stop_event.is_set(): + return + exercise_mapping(tid, iter_idx, dst, stop_event, errs, lock, hold_ms) + finally: + path_slots[dst].release() + work_q.task_done() + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--mount", required=True, help="GKFS mount directory") + ap.add_argument("--loops", type=int, default=300) + ap.add_argument("--threads", type=int, default=2) + ap.add_argument("--mb", type=int, default=32, help="source tensor size in MB") + ap.add_argument("--copy-method", choices=["rw", "shutil", "mmap"], default="shutil") + ap.add_argument("--path-mode", choices=["shared", "per-thread"], + default="shared") + ap.add_argument("--hold-ms", type=int, default=1, + help="extra mapping hold time to widen races") + ap.add_argument("--workflow", choices=["pipeline", "contention"], + default="pipeline") + args = ap.parse_args() + + mount = Path(args.mount) + offload = mount / "offload" + offload.mkdir(parents=True, exist_ok=True) + + tmpdir = Path(tempfile.mkdtemp(prefix="gkfs-flex-repro-")) + src = tmpdir / "src.npy" + build_source_file(src, args.mb) + + errs: list[str] = [] + lock = threading.Lock() + stop_event = threading.Event() + t0 = time.time() + if args.workflow == "contention": + threads = [ + threading.Thread( + target=contention_worker, + args=(t, offload, src, args.loops, stop_event, errs, lock, + args.copy_method, args.path_mode, args.hold_ms), + daemon=True, + ) + for t in range(args.threads) + ] + for t in threads: + t.start() + for t in threads: + t.join() + else: + if args.path_mode == "shared": + paths = [offload / "t_0", offload / "t_1"] + else: + paths = [offload / f"t_{i}_0.npy" for i in range(args.threads)] + paths.extend(offload / f"t_{i}_1.npy" for i in range(args.threads)) + + path_slots = {p: threading.Semaphore(1) for p in paths} + work_q: queue.Queue[tuple[int, Path] | None] = queue.Queue( + maxsize=max(8, args.threads * 4) + ) + producer = threading.Thread( + target=pipeline_producer, + args=(src, args.loops, paths, args.copy_method, stop_event, errs, + lock, work_q, path_slots, args.threads), + daemon=True, + ) + consumers = [ + threading.Thread( + target=pipeline_consumer, + args=(t, stop_event, errs, lock, work_q, path_slots, + args.hold_ms), + daemon=True, + ) + for t in range(args.threads) + ] + producer.start() + for t in consumers: + t.start() + producer.join() + work_q.join() + for t in consumers: + t.join() + + dt = time.time() - t0 + + if errs: + for e in errs: + print(e, file=sys.stderr) + print(f"FAILED after {dt:.2f}s", file=sys.stderr) + return 1 + + print( + f"PASS threads={args.threads} loops={args.loops} mb={args.mb} " + f"path_mode={args.path_mode} copy={args.copy_method} " + f"workflow={args.workflow} time={dt:.2f}s" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) -- GitLab From ccfe2e23a16085ccb33a0ddd5b33e95f74eeee4c Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 13:34:24 +0100 Subject: [PATCH 67/68] numpy --- CHANGELOG.md | 1 + tests/apps/flex-mmap-local.sh | 72 ++++++++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfa921829..ec561aad5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - SYS_lstat does not exists on some architectures, change to newfstatat ([!269](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/269)) - We cannot use lstat directly as may cause a recursion call on libc interception. - Un/Packing order of directory entries in compressed format was incorrect ([!281](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/281)) + - Fix pytorch mmap ([!291](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/291)) ## [0.9.5] - 2025-08 diff --git a/tests/apps/flex-mmap-local.sh b/tests/apps/flex-mmap-local.sh index e398550bf..d3f79db0b 100755 --- a/tests/apps/flex-mmap-local.sh +++ b/tests/apps/flex-mmap-local.sh @@ -1,31 +1,61 @@ #!/usr/bin/env bash +# flex-mmap-local.sh — CI test for GekkoFS mmap interception compatibility +# with NumPy memmap workloads (FlexLLM-style). +# +# Expected environment variables (all have CI-compatible defaults): +# IO : GekkoFS install prefix +# DAEMON : path to gkfs_daemon binary +# GKFS : path to libgkfs_intercept.so +# GKFS_LIBC : path to libgkfs_libc_intercept.so set -euo pipefail -export IO=${IO:-/builds/gitlab/hpc/gekkofs/gkfs/install} -export GKFS=${GKFS:-$IO/lib/libgkfs_intercept.so} -export GKFS_LIBC=${GKFS_LIBC:-$IO/lib/libgkfs_libc_intercept.so} -export DAEMON=${DAEMON:-$IO/bin/gkfs_daemon} +export IO="${IO:-/builds/gitlab/hpc/gekkofs/gkfs/install}" +export GKFS="${GKFS:-$IO/lib/libgkfs_intercept.so}" +export GKFS_LIBC="${GKFS_LIBC:-$IO/lib/libgkfs_libc_intercept.so}" +export DAEMON="${DAEMON:-$IO/bin/gkfs_daemon}" -GKROOT="${GKROOT:-$IO/tests/app/flex-mmap-repro/root}" -MNT="${MNT:-$IO/tests/app/flex-mmap-repro/mnt}" -LOG_DIR="${LOG_DIR:-$IO/tests/app/flex-mmap-repro/logs}" +GKROOT="${GKROOT:-/tmp/flex-mmap-ci/root}" +MNT="${MNT:-/tmp/flex-mmap-ci/mnt}" +LOG_DIR="${LOG_DIR:-/tmp/flex-mmap-ci/logs}" GKFS_PRELOAD="${GKFS_PRELOAD:-$GKFS_LIBC}" -CLIENT_LOG_LEVEL="${CLIENT_LOG_LEVEL:-debug}" -LOOPS="${LOOPS:-200}" +CLIENT_LOG_LEVEL="${CLIENT_LOG_LEVEL:-error}" +LOOPS="${LOOPS:-50}" THREADS="${THREADS:-2}" -MB="${MB:-32}" -PYTHON_BIN="${PYTHON_BIN:-python3}" +MB="${MB:-8}" COPY_METHOD="${COPY_METHOD:-shutil}" PATH_MODE="${PATH_MODE:-shared}" HOLD_MS="${HOLD_MS:-1}" +# Resolve the directory containing this script (source tree, not build tree) SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# ------------------------------------------------------------------- +# Set up a Python venv with numpy if the system python3 lacks it. +# This is necessary for CI images that ship only a bare python3. +# ------------------------------------------------------------------- +VENV_DIR="/tmp/flex-mmap-ci/venv" +if ! python3 -c "import numpy" 2>/dev/null; then + echo "numpy not found — creating venv and installing numpy..." + python3 -m venv "$VENV_DIR" + # shellcheck source=/dev/null + source "$VENV_DIR/bin/activate" + pip install --quiet numpy +else + # Activate an existing venv if present, else use system python3 + if [[ -f "$VENV_DIR/bin/activate" ]]; then + # shellcheck source=/dev/null + source "$VENV_DIR/bin/activate" + fi +fi +PYTHON_BIN="${PYTHON_BIN:-$(command -v python3)}" + +# ------------------------------------------------------------------- +# Prepare directories +# ------------------------------------------------------------------- mkdir -p "$GKROOT" "$MNT" "$LOG_DIR" -rm -rf "$LOG_DIR"/* -rm -f gkfs_hosts.txt +rm -rf "${LOG_DIR:?}"/* cleanup() { pkill -9 gkfs_daemon >/dev/null 2>&1 || true @@ -34,17 +64,24 @@ trap cleanup EXIT cleanup sleep 1 -rm -rf "$GKROOT"/* "$MNT"/* +rm -rf "${GKROOT:?}"/* "${MNT:?}"/* 2>/dev/null || true -$DAEMON -r "$GKROOT" -m "$MNT" \ +# ------------------------------------------------------------------- +# Start daemon +# ------------------------------------------------------------------- +"$DAEMON" -r "$GKROOT" -m "$MNT" \ >"$LOG_DIR/daemon.out" 2>"$LOG_DIR/daemon.err" & sleep 4 if ! pgrep -x gkfs_daemon >/dev/null 2>&1; then - echo "gkfs_daemon failed to start. See $LOG_DIR/daemon.err" + echo "ERROR: gkfs_daemon failed to start. See $LOG_DIR/daemon.err" >&2 + cat "$LOG_DIR/daemon.err" >&2 exit 1 fi +# ------------------------------------------------------------------- +# Run repro script +# ------------------------------------------------------------------- set +e LD_PRELOAD="$GKFS_PRELOAD" \ LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-$IO/lib}" \ @@ -67,7 +104,8 @@ echo "REPRO_RC=$RC" if [[ $RC -eq 0 ]]; then echo "Flex mmap repro PASS" else - echo "Flex mmap repro FAIL" + echo "Flex mmap repro FAIL (RC=$RC)" + tail -20 "$LOG_DIR/repro.err" >&2 fi exit $RC -- GitLab From ac2b56ba14f2750bf27e663e024a8da504fd66ba Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Tue, 3 Mar 2026 13:58:35 +0100 Subject: [PATCH 68/68] fix --- tests/apps/flex-mmap-local.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/apps/flex-mmap-local.sh b/tests/apps/flex-mmap-local.sh index d3f79db0b..84b39c8f4 100755 --- a/tests/apps/flex-mmap-local.sh +++ b/tests/apps/flex-mmap-local.sh @@ -14,6 +14,7 @@ export IO="${IO:-/builds/gitlab/hpc/gekkofs/gkfs/install}" export GKFS="${GKFS:-$IO/lib/libgkfs_intercept.so}" export GKFS_LIBC="${GKFS_LIBC:-$IO/lib/libgkfs_libc_intercept.so}" export DAEMON="${DAEMON:-$IO/bin/gkfs_daemon}" +export LD_LIBRARY_PATH=/root/wacommplusplus/build/external/lib:$IO/lib/:$LD_LIBRARY_PATH GKROOT="${GKROOT:-/tmp/flex-mmap-ci/root}" MNT="${MNT:-/tmp/flex-mmap-ci/mnt}" @@ -84,7 +85,6 @@ fi # ------------------------------------------------------------------- set +e LD_PRELOAD="$GKFS_PRELOAD" \ -LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-$IO/lib}" \ LIBGKFS_LOG="$CLIENT_LOG_LEVEL" \ LIBGKFS_LOG_OUTPUT="$LOG_DIR/client.log" \ "$PYTHON_BIN" -u "${SCRIPT_DIR}/flex-mmap-repro.py" \ -- GitLab