From 88ee54cf51b676da6f9dea1a689b8bb858d588db Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 15 Jun 2026 09:22:02 +0200 Subject: [PATCH 1/4] delete inline corrected --- src/daemon/backend/metadata/rocksdb_backend.cpp | 12 ++++++++++++ src/proxy/rpc/forward_metadata.cpp | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/daemon/backend/metadata/rocksdb_backend.cpp b/src/daemon/backend/metadata/rocksdb_backend.cpp index bf0ece253..434163d5a 100644 --- a/src/daemon/backend/metadata/rocksdb_backend.cpp +++ b/src/daemon/backend/metadata/rocksdb_backend.cpp @@ -138,6 +138,18 @@ RocksDBBackend::RocksDBBackend(const std::string& path) { options_.OptimizeLevelStyleCompaction(); options_.create_if_missing = true; options_.merge_operator.reset(new MetadataMergeOperator); + options_.write_buffer_size = 128 * 1024 * 1024; + options_.max_write_buffer_number = 4; + options_.compression = rocksdb::kNoCompression; + options_.max_background_jobs = 4; + rocksdb::BlockBasedTableOptions table_options; + table_options.block_cache = + rocksdb::NewLRUCache(512 * 1024 * 1024); // 512MB Cache + table_options.filter_policy.reset( + rocksdb::NewBloomFilterPolicy(10)); // 10 bits per key bloom filter + options_.table_factory.reset( + rocksdb::NewBlockBasedTableFactory(table_options)); + optimize_database_impl(); // Enable WAL if requested via environment diff --git a/src/proxy/rpc/forward_metadata.cpp b/src/proxy/rpc/forward_metadata.cpp index c3c61ab2a..a4f296742 100644 --- a/src/proxy/rpc/forward_metadata.cpp +++ b/src/proxy/rpc/forward_metadata.cpp @@ -152,6 +152,17 @@ forward_remove(const std::string& path, bool rm_dir) { // be removed, thus, we exit if(!S_ISREG(mode) || size == 0) return 0; + + // If inline data is enabled and the file size falls within the inline + // threshold, the data is stored within the metadata entry (RocksDB) and + // deleted automatically, so we can bypass sending remove_data RPCs to all + // storage hosts. + bool is_inline = gkfs::config::metadata::use_inline_data && + static_cast(size) <= + gkfs::config::metadata::inline_data_size; + if(is_inline) + return 0; + return remove_data(path); } -- GitLab From ff5ab3c3ac400b9a73ce26563d5c11959b0ede06 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 15 Jun 2026 10:15:08 +0200 Subject: [PATCH 2/4] count only optimization --- src/client/rpc/forward_metadata.cpp | 23 +++++++++++++------ .../backend/metadata/rocksdb_backend.cpp | 21 ++++------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index 2208139ad..80a67c682 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -866,13 +866,22 @@ forward_get_dirents_filtered(const std::string& path, int server, // Retry loop in case of buffer being too small while(true) { - std::unique_ptr large_buffer = - std::unique_ptr(new char[buffer_size]); - // expose the buffer - std::vector> segments; - segments.emplace_back(large_buffer.get(), buffer_size); - tl::bulk exposed_buffer = - CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only); + std::unique_ptr large_buffer; + tl::bulk exposed_buffer; + + if(!count_only) { + large_buffer = std::unique_ptr(new char[buffer_size]); + // expose the buffer + std::vector> segments; + segments.emplace_back(large_buffer.get(), buffer_size); + try { + exposed_buffer = CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only); + } catch(const std::exception& e) { + LOG(ERROR, "Failed to expose buffer: {}", e.what()); + err = EBUSY; + break; + } + } in.bulk_handle = exposed_buffer; diff --git a/src/daemon/backend/metadata/rocksdb_backend.cpp b/src/daemon/backend/metadata/rocksdb_backend.cpp index 434163d5a..471145fe0 100644 --- a/src/daemon/backend/metadata/rocksdb_backend.cpp +++ b/src/daemon/backend/metadata/rocksdb_backend.cpp @@ -682,6 +682,7 @@ RocksDBBackend::get_dirents_filtered_impl( std::string last_scanned_key; bool eof = true; size_t scanned_count = 0; + size_t matched_count = 0; const size_t scan_limit = 5000000; // Limit scanned entries per PRC to avoid timeout @@ -776,16 +777,9 @@ RocksDBBackend::get_dirents_filtered_impl( } if(matched) { - unsigned char type = 0; - if(count_only) { - // If counting only, we don't store the metadata - // We just record a match implicitly (matched is true here) - // However, we still need to respect max_entries for pagination - // The actual count of matched items is what matters. - entries.emplace_back(std::forward_as_tuple( - "", type, 0, - 0)); // Dummy entry to keep track of size/pagination - } else { + matched_count++; + if(!count_only) { + unsigned char type = 0; if(S_ISDIR(mode)) { type = 1; } else if(S_ISLNK(mode)) { @@ -794,7 +788,7 @@ RocksDBBackend::get_dirents_filtered_impl( entries.emplace_back(std::forward_as_tuple( std::move(relative_name), type, size, ctime)); } - if(max_entries > 0 && entries.size() >= max_entries) { + if(max_entries > 0 && matched_count >= max_entries) { eof = false; break; } @@ -805,11 +799,6 @@ RocksDBBackend::get_dirents_filtered_impl( last_scanned_key = ""; } - size_t matched_count = entries.size(); - if(count_only) { - entries.clear(); - } - // assert(it->status().ok()); // only if eof check? return {entries, matched_count, scanned_count, last_scanned_key}; } -- GitLab From 34b6c083dc093a0654a9c6810af9be0196dd0503 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 15 Jun 2026 10:36:41 +0200 Subject: [PATCH 3/4] add changelog --- CHANGELOG.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 021f95cc7..97c74618b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,6 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ### New - - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) - - Added client-side asynchronous write caching for data writes to improve IO500 performance. - - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. - - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) - - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. - - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. - directory optimization with compression and reattemp ([!270](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/270)) - Refactor sfind so it can use SLURM_ environment variables to ask to different servers. - Create a sample bash script to gather all the info (map->reduce) @@ -44,11 +38,20 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Use LIBGKFS_ENABLE_FORK=1 to enable fork support in the client library. - This is used for example in DLIO. - Shrink capabilities added ([!304](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/304)) + - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) + - Added client-side asynchronous write caching for data writes to improve IO500 performance. + - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. + - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) + - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. + - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. ### Changed + - Optimized count-only filtered directory listing (`sfind` with `-C`) ([!307](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/307)) + - Avoid client-side buffer allocation and memory exposure for count-only queries. + - Avoid daemon-side dummy entries vector allocation and resizing during RocksDB iteration. - Disabled at_parent/at_fork/at_child as it seems unneded now - Moved some CMAKE options to config.hpp and env variables ([!285](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/285)) - LIBGKFS/ GKFS _SYMLINK_SUPPORT, _RENAME_SUPPORT and _CREATE_CHECK_PARENTS. -- GitLab From 748d22435e82edadd499bd4c512299231eae9212 Mon Sep 17 00:00:00 2001 From: Ramon Nou Date: Mon, 15 Jun 2026 11:39:16 +0200 Subject: [PATCH 4/4] lint --- src/client/rpc/forward_metadata.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index 80a67c682..d7a925f6c 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -875,7 +875,8 @@ forward_get_dirents_filtered(const std::string& path, int server, std::vector> segments; segments.emplace_back(large_buffer.get(), buffer_size); try { - exposed_buffer = CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only); + exposed_buffer = CTX->rpc_engine()->expose( + segments, tl::bulk_mode::write_only); } catch(const std::exception& e) { LOG(ERROR, "Failed to expose buffer: {}", e.what()); err = EBUSY; -- GitLab