/* Copyright 2018-2020, Barcelona Supercomputing Center (BSC), Spain Copyright 2015-2020, Johannes Gutenberg Universitaet Mainz, Germany This software was partially supported by the EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu). This software was partially supported by the ADA-FS project under the SPPEXA project funded by the DFG. SPDX-License-Identifier: MIT */ #include #include #include #include #include #include #include extern "C" { #include #include #include #include #include } namespace { thread_local bool reentrance_guard_flag; thread_local gkfs::syscall::info saved_syscall_info; constexpr void save_current_syscall_info(gkfs::syscall::info info) { saved_syscall_info = info; } constexpr void reset_current_syscall_info() { saved_syscall_info = gkfs::syscall::no_info; } inline gkfs::syscall::info get_current_syscall_info() { return saved_syscall_info; } /* * hook_internal -- interception hook for internal syscalls * * This hook is basically used to keep track of file descriptors created * internally by the library itself. This is important because some * applications (e.g. ssh) may attempt to close all open file descriptors * which would leave the library internals in an inconsistent state. * We forward syscalls to the kernel but we keep track of any syscalls that may * create or destroy a file descriptor so that we can mark them as 'internal'. */ inline int hook_internal(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long* result) { #if defined(GKFS_ENABLE_LOGGING) && defined(GKFS_DEBUG_BUILD) const long args[gkfs::syscall::MAX_ARGS] = { arg0, arg1, arg2, arg3, arg4, arg5 }; #endif LOG(SYSCALL, gkfs::syscall::from_internal_code | gkfs::syscall::to_hook | gkfs::syscall::not_executed, syscall_number, args); switch (syscall_number) { case SYS_open: *result = syscall_no_intercept(syscall_number, reinterpret_cast(arg0), static_cast(arg1), static_cast(arg2)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_creat: *result = syscall_no_intercept(syscall_number, reinterpret_cast(arg0), O_WRONLY | O_CREAT | O_TRUNC, static_cast(arg1)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_openat: *result = syscall_no_intercept(syscall_number, static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), static_cast(arg3)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_epoll_create: *result = syscall_no_intercept(syscall_number, static_cast(arg0)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_epoll_create1: *result = syscall_no_intercept(syscall_number, static_cast(arg0)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_dup: *result = syscall_no_intercept(syscall_number, static_cast(arg0)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_dup2: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_dup3: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1), static_cast(arg2)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_inotify_init: *result = syscall_no_intercept(syscall_number); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_inotify_init1: *result = syscall_no_intercept(syscall_number, static_cast(arg0)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_perf_event_open: *result = syscall_no_intercept(syscall_number, reinterpret_cast(arg0), static_cast(arg1), static_cast(arg2), static_cast(arg3), static_cast(arg4)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_signalfd: *result = syscall_no_intercept(syscall_number, static_cast(arg0), reinterpret_cast(arg1)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_signalfd4: *result = syscall_no_intercept(syscall_number, static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_timerfd_create: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_socket: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1), static_cast(arg2)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_socketpair: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1), static_cast(arg2), reinterpret_cast(arg3)); if (*result >= 0) { reinterpret_cast(arg3)[0] = CTX->register_internal_fd(reinterpret_cast(arg3)[0]); reinterpret_cast(arg3)[1] = CTX->register_internal_fd(reinterpret_cast(arg3)[1]); } break; case SYS_pipe: *result = syscall_no_intercept(syscall_number, reinterpret_cast(arg0)); if (*result >= 0) { reinterpret_cast(arg0)[0] = CTX->register_internal_fd(reinterpret_cast(arg0)[0]); reinterpret_cast(arg0)[1] = CTX->register_internal_fd(reinterpret_cast(arg0)[1]); } break; case SYS_pipe2: *result = syscall_no_intercept(syscall_number, reinterpret_cast(arg0), static_cast(arg1)); if (*result >= 0) { reinterpret_cast(arg0)[0] = CTX->register_internal_fd(reinterpret_cast(arg0)[0]); reinterpret_cast(arg0)[1] = CTX->register_internal_fd(reinterpret_cast(arg0)[1]); } break; case SYS_eventfd: *result = syscall_no_intercept(syscall_number, static_cast(arg0)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_eventfd2: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_recvmsg: { *result = syscall_no_intercept(syscall_number, static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); // The recvmsg() syscall can receive file descriptors from another // process that the kernel automatically adds to the client's fds // as if dup2 had been called. Whenever that happens, we need to // make sure that we register these additional fds as internal, or // we could inadvertently overwrite them if (*result >= 0) { auto* hdr = reinterpret_cast(arg1); struct cmsghdr* cmsg = CMSG_FIRSTHDR(hdr); for (; cmsg != NULL; cmsg = CMSG_NXTHDR(hdr, cmsg)) { if (cmsg->cmsg_type == SCM_RIGHTS) { size_t nfd = cmsg->cmsg_len > CMSG_LEN(0) ? (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int) : 0; int* fds = reinterpret_cast(CMSG_DATA(cmsg)); for (size_t i = 0; i < nfd; ++i) { LOG(DEBUG, "recvmsg() provided extra fd {}", fds[i]); // ensure we update the fds in cmsg // if they have been relocated fds[i] = CTX->register_internal_fd(fds[i]); } } } } break; } case SYS_accept: *result = syscall_no_intercept(syscall_number, static_cast(arg0), reinterpret_cast(arg1), reinterpret_cast(arg2)); if (*result >= 0) { *result = CTX->register_internal_fd(*result); } break; case SYS_fcntl: *result = syscall_no_intercept(syscall_number, static_cast(arg0), static_cast(arg1), arg2); if (*result >= 0 && (static_cast(arg1) == F_DUPFD || static_cast(arg1) == F_DUPFD_CLOEXEC)) { *result = CTX->register_internal_fd(*result); } break; case SYS_close: *result = syscall_no_intercept(syscall_number, static_cast(arg0)); if (*result == 0) { CTX->unregister_internal_fd(arg0); } break; default: // ignore any other syscalls, i.e.: pass them on to the kernel // (syscalls forwarded to the kernel that return are logged in // hook_forwarded_syscall()) ::save_current_syscall_info( gkfs::syscall::from_internal_code | gkfs::syscall::to_kernel | gkfs::syscall::not_executed); return gkfs::syscall::forward_to_kernel; } LOG(SYSCALL, gkfs::syscall::from_internal_code | gkfs::syscall::to_hook | gkfs::syscall::executed, syscall_number, args, *result); return gkfs::syscall::hooked; } /* * hook -- interception hook for application syscalls * * This hook is used to implement any application filesystem-related syscalls. */ inline int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long* result) { #if defined(GKFS_ENABLE_LOGGING) && defined(GKFS_DEBUG_BUILD) const long args[gkfs::syscall::MAX_ARGS] = { arg0, arg1, arg2, arg3, arg4, arg5 }; #endif LOG(SYSCALL, gkfs::syscall::from_external_code | gkfs::syscall::to_hook | gkfs::syscall::not_executed, syscall_number, args); switch (syscall_number) { case SYS_execve: *result = syscall_no_intercept(syscall_number, reinterpret_cast(arg0), reinterpret_cast(arg1), reinterpret_cast(arg2)); break; #ifdef SYS_execveat case SYS_execveat: *result = syscall_no_intercept(syscall_number, arg0, reinterpret_cast(arg1), reinterpret_cast(arg2), reinterpret_cast(arg3), arg4); break; #endif case SYS_open: *result = gkfs::hook::hook_openat(AT_FDCWD, reinterpret_cast(arg0), static_cast(arg1), static_cast(arg2)); break; case SYS_creat: *result = gkfs::hook::hook_openat(AT_FDCWD, reinterpret_cast(arg0), O_WRONLY | O_CREAT | O_TRUNC, static_cast(arg1)); break; case SYS_openat: *result = gkfs::hook::hook_openat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), static_cast(arg3)); break; case SYS_close: *result = gkfs::hook::hook_close(static_cast(arg0)); break; case SYS_stat: *result = gkfs::hook::hook_stat(reinterpret_cast(arg0), reinterpret_cast(arg1)); break; case SYS_lstat: *result = gkfs::hook::hook_lstat(reinterpret_cast(arg0), reinterpret_cast(arg1)); break; case SYS_fstat: *result = gkfs::hook::hook_fstat(static_cast(arg0), reinterpret_cast(arg1)); break; case SYS_newfstatat: *result = gkfs::hook::hook_fstatat(static_cast(arg0), reinterpret_cast(arg1), reinterpret_cast(arg2), static_cast(arg3)); break; case SYS_read: *result = gkfs::hook::hook_read(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_pread64: *result = gkfs::hook::hook_pread(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), static_cast(arg3)); break; case SYS_pwrite64: *result = gkfs::hook::hook_pwrite(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), static_cast(arg3)); break; case SYS_write: *result = gkfs::hook::hook_write(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_writev: *result = gkfs::hook::hook_writev(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_pwritev: *result = gkfs::hook::hook_pwritev(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), static_cast(arg3), static_cast(arg4)); break; case SYS_unlink: *result = gkfs::hook::hook_unlinkat(AT_FDCWD, reinterpret_cast(arg0), 0); break; case SYS_unlinkat: *result = gkfs::hook::hook_unlinkat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_rmdir: *result = gkfs::hook::hook_unlinkat(AT_FDCWD, reinterpret_cast(arg0), AT_REMOVEDIR); break; case SYS_symlink: *result = gkfs::hook::hook_symlinkat(reinterpret_cast(arg0), AT_FDCWD, reinterpret_cast(arg1)); break; case SYS_symlinkat: *result = gkfs::hook::hook_symlinkat(reinterpret_cast(arg0), static_cast(arg1), reinterpret_cast(arg2)); break; case SYS_access: *result = gkfs::hook::hook_access(reinterpret_cast(arg0), static_cast(arg1)); break; case SYS_faccessat: *result = gkfs::hook::hook_faccessat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_lseek: *result = gkfs::hook::hook_lseek(static_cast(arg0), static_cast(arg1), static_cast(arg2)); break; case SYS_truncate: *result = gkfs::hook::hook_truncate(reinterpret_cast(arg0), static_cast(arg1)); break; case SYS_ftruncate: *result = gkfs::hook::hook_ftruncate(static_cast(arg0), static_cast(arg1)); break; case SYS_dup: *result = gkfs::hook::hook_dup(static_cast(arg0)); break; case SYS_dup2: *result = gkfs::hook::hook_dup2(static_cast(arg0), static_cast(arg1)); break; case SYS_dup3: *result = gkfs::hook::hook_dup3(static_cast(arg0), static_cast(arg1), static_cast(arg2)); break; case SYS_getdents: *result = gkfs::hook::hook_getdents(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_getdents64: *result = gkfs::hook::hook_getdents64(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_mkdirat: *result = gkfs::hook::hook_mkdirat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_mkdir: *result = gkfs::hook::hook_mkdirat(AT_FDCWD, reinterpret_cast(arg0), static_cast(arg1)); break; case SYS_chmod: *result = gkfs::hook::hook_fchmodat(AT_FDCWD, reinterpret_cast(arg0), static_cast(arg1)); break; case SYS_fchmod: *result = gkfs::hook::hook_fchmod(static_cast(arg0), static_cast(arg1)); break; case SYS_fchmodat: *result = gkfs::hook::hook_fchmodat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_chdir: *result = gkfs::hook::hook_chdir(reinterpret_cast(arg0)); break; case SYS_fchdir: *result = gkfs::hook::hook_fchdir(static_cast(arg0)); break; case SYS_getcwd: *result = gkfs::hook::hook_getcwd(reinterpret_cast(arg0), static_cast(arg1)); break; case SYS_readlink: *result = gkfs::hook::hook_readlinkat(AT_FDCWD, reinterpret_cast(arg0), reinterpret_cast(arg1), static_cast(arg2)); break; case SYS_readlinkat: *result = gkfs::hook::hook_readlinkat(static_cast(arg0), reinterpret_cast(arg1), reinterpret_cast(arg2), static_cast(arg3)); break; case SYS_fcntl: *result = gkfs::hook::hook_fcntl(static_cast(arg0), static_cast(arg1), static_cast(arg2)); break; case SYS_rename: *result = gkfs::hook::hook_renameat(AT_FDCWD, reinterpret_cast(arg0), AT_FDCWD, reinterpret_cast(arg1), 0); break; case SYS_renameat: *result = gkfs::hook::hook_renameat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), reinterpret_cast(arg3), 0); break; case SYS_renameat2: *result = gkfs::hook::hook_renameat(static_cast(arg0), reinterpret_cast(arg1), static_cast(arg2), reinterpret_cast(arg3), static_cast(arg4)); break; case SYS_fstatfs: *result = gkfs::hook::hook_fstatfs(static_cast(arg0), reinterpret_cast(arg1)); break; case SYS_statfs: *result = gkfs::hook::hook_statfs(reinterpret_cast(arg0), reinterpret_cast(arg1)); break; default: // ignore any other syscalls, i.e.: pass them on to the kernel // (syscalls forwarded to the kernel that return are logged in // hook_forwarded_syscall()) ::save_current_syscall_info( gkfs::syscall::from_external_code | gkfs::syscall::to_kernel | gkfs::syscall::not_executed); return gkfs::syscall::forward_to_kernel; } LOG(SYSCALL, gkfs::syscall::from_external_code | gkfs::syscall::to_hook | gkfs::syscall::executed, syscall_number, args, *result); return gkfs::syscall::hooked; } void hook_forwarded_syscall(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long result) { if (::get_current_syscall_info() == gkfs::syscall::no_info) { return; } #if defined(GKFS_ENABLE_LOGGING) && defined(GKFS_DEBUG_BUILD) const long args[gkfs::syscall::MAX_ARGS] = { arg0, arg1, arg2, arg3, arg4, arg5 }; #endif LOG(SYSCALL, ::get_current_syscall_info() | gkfs::syscall::executed, syscall_number, args, result); ::reset_current_syscall_info(); } void hook_clone_at_child(unsigned long flags, void* child_stack, int* ptid, int* ctid, long newtls) { #if defined(GKFS_ENABLE_LOGGING) && defined(GKFS_DEBUG_BUILD) const long args[gkfs::syscall::MAX_ARGS] = { static_cast(flags), reinterpret_cast(child_stack), reinterpret_cast(ptid), reinterpret_cast(ctid), static_cast(newtls), 0}; #endif reentrance_guard_flag = true; LOG(SYSCALL, ::get_current_syscall_info() | gkfs::syscall::executed, SYS_clone, args, 0); reentrance_guard_flag = false; } void hook_clone_at_parent(unsigned long flags, void* child_stack, int* ptid, int* ctid, long newtls, long returned_pid) { #if defined(GKFS_ENABLE_LOGGING) && defined(GKFS_DEBUG_BUILD) const long args[gkfs::syscall::MAX_ARGS] = { static_cast(flags), reinterpret_cast(child_stack), reinterpret_cast(ptid), reinterpret_cast(ctid), static_cast(newtls), 0}; #endif reentrance_guard_flag = true; LOG(SYSCALL, ::get_current_syscall_info() | gkfs::syscall::executed, SYS_clone, args, returned_pid); reentrance_guard_flag = false; } } // namespace namespace gkfs { namespace preload { int internal_hook_guard_wrapper(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long* syscall_return_value) { assert(CTX->interception_enabled()); if (reentrance_guard_flag) { ::save_current_syscall_info( gkfs::syscall::from_internal_code | gkfs::syscall::to_kernel | gkfs::syscall::not_executed); return gkfs::syscall::forward_to_kernel; } int was_hooked = 0; reentrance_guard_flag = true; int oerrno = errno; was_hooked = hook_internal(syscall_number, arg0, arg1, arg2, arg3, arg4, arg5, syscall_return_value); errno = oerrno; reentrance_guard_flag = false; return was_hooked; } /* * hook_guard_wrapper -- a wrapper which can notice reentrance. * * The reentrance_guard_flag flag allows the library to distinguish the hooking * of its own syscalls. E.g. while handling an open() syscall, * libgkfs_intercept might call fopen(), which in turn uses an open() * syscall internally. This internally used open() syscall is once again * forwarded to libgkfs_intercept, but using this flag we can notice this * case of reentering itself. * * XXX This approach still contains a very significant bug, as libgkfs_intercept * being called inside a signal handler might easily forward a mock fd to the * kernel. */ int hook_guard_wrapper(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long* syscall_return_value) { assert(CTX->interception_enabled()); int was_hooked = 0; if (reentrance_guard_flag) { int oerrno = errno; was_hooked = hook_internal(syscall_number, arg0, arg1, arg2, arg3, arg4, arg5, syscall_return_value); errno = oerrno; return was_hooked; } reentrance_guard_flag = true; int oerrno = errno; was_hooked = ::hook(syscall_number, arg0, arg1, arg2, arg3, arg4, arg5, syscall_return_value); errno = oerrno; reentrance_guard_flag = false; return was_hooked; } void start_self_interception() { LOG(DEBUG, "Enabling syscall interception for self"); intercept_hook_point = internal_hook_guard_wrapper; intercept_hook_point_post_kernel = hook_forwarded_syscall; intercept_hook_point_clone_child = hook_clone_at_child; intercept_hook_point_clone_parent = hook_clone_at_parent; } void start_interception() { assert(CTX->interception_enabled()); LOG(DEBUG, "Enabling syscall interception for client process"); // Set up the callback function pointer intercept_hook_point = hook_guard_wrapper; intercept_hook_point_post_kernel = hook_forwarded_syscall; intercept_hook_point_clone_child = hook_clone_at_child; intercept_hook_point_clone_parent = hook_clone_at_parent; } void stop_interception() { assert(CTX->interception_enabled()); LOG(DEBUG, "Disabling syscall interception for client process"); // Reset callback function pointer intercept_hook_point = nullptr; intercept_hook_point_post_kernel = nullptr; intercept_hook_point_clone_child = nullptr; intercept_hook_point_clone_parent = nullptr; } } // namespace preload } // namespace gkfs