Commit 5e74ba10 authored by Ramon Nou's avatar Ramon Nou
Browse files

Fix CUDA issues with syscall intercept

parent 76d8b749
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -45,6 +45,8 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
    - We cannot use lstat directly as may cause a recursion call on libc interception.
  - Un/Packing order of directory entries in compressed format was incorrect ([!281](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/281))
  - Fix pytorch mmap ([!291](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/291))
  - Fix CUDA failures under syscall interception ([!292](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/292))
    - mmap and dangling fd issues
    

## [0.9.5] - 2025-08
+6 −0
Original line number Diff line number Diff line
@@ -193,6 +193,12 @@ gkfs_munmap(void* addr, size_t length);
int
gkfs_msync(void* addr, size_t length, int flags);

// Returns true if addr is currently registered as a GekkoFS mmap mapping.
// Cheap shared-lock read; used by hooks to skip gkfs_munmap/gkfs_msync
// for non-GekkoFS addresses (e.g. CUDA GPU memory mappings).
//
// addr is compared for equality against the keys of the mmap registry, so
// only an address that is itself a registry key is reported as tracked; an
// address that merely lies inside a tracked mapping is not.
bool
gkfs_mmap_is_tracked(void* addr);


} // namespace gkfs::syscall

+5 −0
Original line number Diff line number Diff line
@@ -46,6 +46,8 @@
#include <atomic>
#include <array>
#include <string>
#include <vector>


namespace gkfs::filemap {

@@ -189,6 +191,9 @@ public:

    int
    get_fd_idx();

    // NOTE(review): presumably returns the tracked file descriptors lying
    // between `first` and `last`; whether the bounds are inclusive is not
    // visible from this declaration -- confirm against the definition.
    std::vector<int>
    get_range(unsigned int first, unsigned int last);
};

} // namespace gkfs::filemap
+15 −4
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@
#include <string>
#include <string_view>
#include <mutex>
#include <shared_mutex>
#include <set>
#include <tuple>
#include <thread>
@@ -96,11 +97,21 @@ struct MmapEntry {
    int flags;
};

std::mutex mmap_mtx;
// Use shared_mutex so that gkfs_mmap_is_tracked() can take a shared (reader)
// lock: concurrent lookups do not block each other and only wait while a
// writer (mmap/msync/munmap registry update) holds the exclusive lock.
std::shared_mutex mmap_mtx;
std::unordered_map<void*, MmapEntry> mmap_registry;

} // namespace

// Reports whether `addr` is a key of mmap_registry.
//
// The lookup is an exact match on the key: an address that lies inside a
// registered mapping but is not itself a key yields false.
//
// mmap_mtx is taken in shared mode, so several lookups may run at the same
// time; a lookup and a holder of the exclusive lock still exclude each other.
bool
gkfs_mmap_is_tracked(void* addr) {
    const std::shared_lock<std::shared_mutex> read_guard(mmap_mtx);
    return mmap_registry.count(addr) != 0;
}

void*
gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd,
          off_t offset) {
@@ -143,7 +154,7 @@ gkfs_mmap(void* addr, size_t length, int prot, int flags, int fd,

    // Re-acquire lock to insert mapping
    {
        std::lock_guard<std::mutex> lock(mmap_mtx);
        std::unique_lock<std::shared_mutex> lock(mmap_mtx);
        mmap_registry[ptr] = {ptr, length, path, offset, prot, flags};
    }

@@ -164,7 +175,7 @@ gkfs_msync(void* addr, size_t length, int flags) {
    bool do_writeback = false;

    {
        std::lock_guard<std::mutex> lock(mmap_mtx);
        std::unique_lock<std::shared_mutex> lock(mmap_mtx);
        auto it = mmap_registry.find(addr);
        if(it != mmap_registry.end()) {
            auto& entry = it->second;
@@ -210,7 +221,7 @@ gkfs_munmap(void* addr, size_t length) {
    bool do_writeback = false;

    {
        std::lock_guard<std::mutex> lock(mmap_mtx);
        std::unique_lock<std::shared_mutex> lock(mmap_mtx);
        auto it = mmap_registry.find(addr);
        if(it != mmap_registry.end()) {
            auto& entry = it->second;
+71 −15
Original line number Diff line number Diff line
@@ -48,6 +48,7 @@
#include <common/path_util.hpp>

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

@@ -71,6 +72,27 @@ with_errno(T ret) {

constexpr size_t k_fd_copy_chunk_size = 1UL * 1024UL * 1024UL;

// Returns true when the symlink /proc/self/fd/<fd> resolves to exactly
// "/dev/null".
//
// The link is read with SYS_readlinkat issued through
// syscall_no_intercept_wrapper. Any failure -- the proc path not fitting the
// local buffer, or a negative result from the readlinkat call -- yields false.
bool
kernel_fd_targets_dev_null(int fd) {
    char link_path[64];
    const int written = std::snprintf(link_path, sizeof(link_path),
                                      "/proc/self/fd/%d", fd);
    const bool formatted =
            written > 0 && written < static_cast<int>(sizeof(link_path));
    if(!formatted) {
        return false;
    }

    char link_target[256];
    // One byte is held back so the result can be NUL-terminated below.
    const auto target_len = syscall_no_intercept_wrapper(
            SYS_readlinkat, AT_FDCWD, link_path, link_target,
            sizeof(link_target) - 1);
    if(target_len < 0) {
        return false;
    }

    // The buffer is terminated manually at the returned length before it is
    // treated as a C string.
    link_target[target_len] = '\0';
    const std::string resolved(link_target);
    return resolved == "/dev/null";
}

ssize_t
copy_between_fds(int out_fd, int in_fd, off64_t* in_off, off64_t* out_off,
                 size_t count) {
@@ -588,42 +610,45 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) {
    LOG(DEBUG, "{}() called with oldname: \"{}\", newfd: {}, newname: \"{}\"",
        __func__, oldname, newdfd, newname);

    bool internal1 = false;
    std::string oldname_resolved;
    if(CTX->relativize_path(oldname, oldname_resolved)) {
        if(!gkfs::config::metadata::symlink_support) {
            LOG(WARNING, "{}() operation not supported", __func__);
            return -ENOTSUP;
        }
        internal1 = true;
    }

    // First determine where the destination (newname) lives.
    // We MUST do this before touching oldname, because oldname is a literal
    // symlink content string — NOT a filesystem path to resolve. Calling
    // relativize_path() on it would corrupt relative paths like "../nvidia0"
    // by anchoring them to the process CWD, producing wrong paths.
    std::string newname_resolved;
    auto rstatus =
            CTX->relativize_fd_path(newdfd, newname, newname_resolved, false);

    switch(rstatus) {
        case gkfs::preload::RelativizeStatus::fd_unknown:
            // Destination fd is not known to us — forward verbatim.
            return gsl::narrow_cast<int>(syscall_no_intercept_wrapper(
                    SYS_symlinkat, oldname, newdfd, newname));

        case gkfs::preload::RelativizeStatus::external:
            // Destination is outside the GekkoFS mountpoint — kernel handles.
            // Pass oldname as-is (it's a literal symlink content, not a path).
            return gsl::narrow_cast<int>(syscall_no_intercept_wrapper(
                    SYS_symlinkat, oldname, newdfd, newname));

        case gkfs::preload::RelativizeStatus::fd_not_a_dir:
            return -ENOTDIR;

        case gkfs::preload::RelativizeStatus::internal:
        case gkfs::preload::RelativizeStatus::internal: {
            // Destination is in GekkoFS. Now we also need to inspect oldname.
            if(!gkfs::config::metadata::symlink_support) {
                LOG(WARNING, "{}() operation not supported", __func__);
                return -ENOTSUP;
            }
            if(internal1) { // Parameters are inverted
            std::string oldname_resolved;
            if(CTX->relativize_path(oldname, oldname_resolved)) {
                // Source is also in GekkoFS — create symlink inside GekkoFS.
                return with_errno(gkfs::syscall::gkfs_mk_symlink(
                        newname_resolved, oldname_resolved));
            }
            LOG(WARNING, "{}() operation not supported", __func__);
            LOG(WARNING, "{}() cross-mount symlink not supported", __func__);
            return -ENOTSUP;
        }

        default:
            LOG(ERROR, "{}() relativize status unknown", __func__);
@@ -631,6 +656,7 @@ hook_symlinkat(const char* oldname, int newdfd, const char* newname) {
    }
}


int
hook_flock(unsigned long fd, int flags) {
    LOG(ERROR, "{}() called flock (Not Supported) with fd '{}' flags '{}'",
@@ -1504,20 +1530,45 @@ hook_mmap(void* addr, size_t length, int prot, int flags, int fd,
        "{}() called with addr '{}' length '{}' prot '{}' flags '{}' fd '{}' offset '{}'",
        __func__, fmt::ptr(addr), length, prot, flags, fd, offset);

    // MAP_ANONYMOUS mappings are not backed by any file. The kernel ignores
    // the fd argument, but CUDA drivers commonly pass fd=0 (stdin). Checking
    // file_map()->get(0) can incorrectly route CUDA's GPU memory maps through
    // gkfs_mmap, causing cudaErrorDevicesUnavailable with syscall interception.
    if(flags & MAP_ANONYMOUS) {
        return reinterpret_cast<void*>(syscall_no_intercept_wrapper(
                SYS_mmap, addr, length, prot, flags, fd, offset));
    }

    if(auto file = CTX->file_map()->get(fd)) {
        LOG(DEBUG, "{}() fd {} handled by GKFS path '{}'", __func__, fd,
            file->path());
        // In non-range-fd mode, GekkoFS keeps a /dev/null kernel fd per tracked
        // file. If that no longer holds, this entry is stale (likely fd reuse),
        // so forward to the kernel.
        if(!CTX->range_fd() && !CTX->protect_fds() &&
           !kernel_fd_targets_dev_null(fd)) {
            return reinterpret_cast<void*>(syscall_no_intercept_wrapper(
                    SYS_mmap, addr, length, prot, flags, fd, offset));
        }

        return gkfs::syscall::gkfs_mmap(addr, length, prot, flags, fd, offset);
    }
    return reinterpret_cast<void*>(syscall_no_intercept_wrapper(
            SYS_mmap, addr, length, prot, flags, fd, offset));
}


int
hook_munmap(void* addr, size_t length) {
    LOG(DEBUG, "{}() called with addr '{}' length '{}'", __func__,
        fmt::ptr(addr), length);

    // Only invoke gkfs_munmap for addresses we actually track.
    // Bypassing for all other addresses avoids stalling unrelated munmap
    // callers (e.g. CUDA's multi-threaded init) on mmap_mtx under
    // syscall-level interception.
    if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) {
        return syscall_no_intercept_wrapper(SYS_munmap, addr, length);
    }

    int res = gkfs::syscall::gkfs_munmap(addr, length);
    if(res == 1) {
        return 0;
@@ -1532,6 +1583,11 @@ hook_msync(void* addr, size_t length, int flags) {
    LOG(DEBUG, "{}() called with addr '{}' length '{}' flags '{}'", __func__,
        fmt::ptr(addr), length, flags);

    // Same fast-path: skip gkfs entirely for non-GekkoFS mappings.
    if(!gkfs::syscall::gkfs_mmap_is_tracked(addr)) {
        return syscall_no_intercept_wrapper(SYS_msync, addr, length, flags);
    }

    int res = gkfs::syscall::gkfs_msync(addr, length, flags);
    if(res == 1) {
        return 0;
Loading