Loading CHANGELOG.md +9 −6 Original line number Diff line number Diff line Loading @@ -7,12 +7,6 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ### New - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) - Added client-side asynchronous write caching for data writes to improve IO500 performance. - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. - directory optimization with compression and reattempt ([!270](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/270)) - Refactor sfind so it can use SLURM_ environment variables to ask to different servers. - Create a sample bash script to gather all the info (map->reduce) Loading Loading @@ -44,11 +38,20 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Use LIBGKFS_ENABLE_FORK=1 to enable fork support in the client library. - This is used for example in DLIO. - Shrink capabilities added ([!304](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/304)) - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) - Added client-side asynchronous write caching for data writes to improve IO500 performance. - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. 
### Changed - Optimized count-only filtered directory listing (`sfind` with `-C`) ([!307](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/307)) - Avoid client-side buffer allocation and memory exposure for count-only queries. - Avoid daemon-side dummy entries vector allocation and resizing during RocksDB iteration. - Disabled at_parent/at_fork/at_child as it seems unneded now - Moved some CMAKE options to config.hpp and env variables ([!285](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/285)) - LIBGKFS/ GKFS _SYMLINK_SUPPORT, _RENAME_SUPPORT and _CREATE_CHECK_PARENTS. Loading src/client/rpc/forward_metadata.cpp +17 −7 Original line number Diff line number Diff line Loading @@ -866,13 +866,23 @@ forward_get_dirents_filtered(const std::string& path, int server, // Retry loop in case of buffer being too small while(true) { std::unique_ptr<char[]> large_buffer = std::unique_ptr<char[]>(new char[buffer_size]); std::unique_ptr<char[]> large_buffer; tl::bulk exposed_buffer; if(!count_only) { large_buffer = std::unique_ptr<char[]>(new char[buffer_size]); // expose the buffer std::vector<std::pair<void*, std::size_t>> segments; segments.emplace_back(large_buffer.get(), buffer_size); tl::bulk exposed_buffer = CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only); try { exposed_buffer = CTX->rpc_engine()->expose( segments, tl::bulk_mode::write_only); } catch(const std::exception& e) { LOG(ERROR, "Failed to expose buffer: {}", e.what()); err = EBUSY; break; } } in.bulk_handle = exposed_buffer; Loading src/daemon/backend/metadata/rocksdb_backend.cpp +17 −16 Original line number Diff line number Diff line Loading @@ -138,6 +138,18 @@ RocksDBBackend::RocksDBBackend(const std::string& path) { options_.OptimizeLevelStyleCompaction(); options_.create_if_missing = true; options_.merge_operator.reset(new MetadataMergeOperator); options_.write_buffer_size = 128 * 1024 * 1024; options_.max_write_buffer_number = 4; options_.compression = 
rocksdb::kNoCompression; options_.max_background_jobs = 4; rocksdb::BlockBasedTableOptions table_options; table_options.block_cache = rocksdb::NewLRUCache(512 * 1024 * 1024); // 512MB Cache table_options.filter_policy.reset( rocksdb::NewBloomFilterPolicy(10)); // 10 bits per key bloom filter options_.table_factory.reset( rocksdb::NewBlockBasedTableFactory(table_options)); optimize_database_impl(); // Enable WAL if requested via environment Loading Loading @@ -670,6 +682,7 @@ RocksDBBackend::get_dirents_filtered_impl( std::string last_scanned_key; bool eof = true; size_t scanned_count = 0; size_t matched_count = 0; const size_t scan_limit = 5000000; // Limit scanned entries per PRC to avoid timeout Loading Loading @@ -764,16 +777,9 @@ RocksDBBackend::get_dirents_filtered_impl( } if(matched) { matched_count++; if(!count_only) { unsigned char type = 0; if(count_only) { // If counting only, we don't store the metadata // We just record a match implicitly (matched is true here) // However, we still need to respect max_entries for pagination // The actual count of matched items is what matters. entries.emplace_back(std::forward_as_tuple( "", type, 0, 0)); // Dummy entry to keep track of size/pagination } else { if(S_ISDIR(mode)) { type = 1; } else if(S_ISLNK(mode)) { Loading @@ -782,7 +788,7 @@ RocksDBBackend::get_dirents_filtered_impl( entries.emplace_back(std::forward_as_tuple( std::move(relative_name), type, size, ctime)); } if(max_entries > 0 && entries.size() >= max_entries) { if(max_entries > 0 && matched_count >= max_entries) { eof = false; break; } Loading @@ -793,11 +799,6 @@ RocksDBBackend::get_dirents_filtered_impl( last_scanned_key = ""; } size_t matched_count = entries.size(); if(count_only) { entries.clear(); } // assert(it->status().ok()); // only if eof check? 
return {entries, matched_count, scanned_count, last_scanned_key}; } Loading src/proxy/rpc/forward_metadata.cpp +11 −0 Original line number Diff line number Diff line Loading @@ -152,6 +152,17 @@ forward_remove(const std::string& path, bool rm_dir) { // be removed, thus, we exit if(!S_ISREG(mode) || size == 0) return 0; // If inline data is enabled and the file size falls within the inline // threshold, the data is stored within the metadata entry (RocksDB) and // deleted automatically, so we can bypass sending remove_data RPCs to all // storage hosts. bool is_inline = gkfs::config::metadata::use_inline_data && static_cast<size_t>(size) <= gkfs::config::metadata::inline_data_size; if(is_inline) return 0; return remove_data(path); } Loading Loading
CHANGELOG.md +9 −6 Original line number Diff line number Diff line Loading @@ -7,12 +7,6 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ### New - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) - Added client-side asynchronous write caching for data writes to improve IO500 performance. - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. - directory optimization with compression and reattempt ([!270](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/270)) - Refactor sfind so it can use SLURM_ environment variables to ask to different servers. - Create a sample bash script to gather all the info (map->reduce) Loading Loading @@ -44,11 +38,20 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Use LIBGKFS_ENABLE_FORK=1 to enable fork support in the client library. - This is used for example in DLIO. - Shrink capabilities added ([!304](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/304)) - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) - Added client-side asynchronous write caching for data writes to improve IO500 performance. - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. 
### Changed - Optimized count-only filtered directory listing (`sfind` with `-C`) ([!307](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/307)) - Avoid client-side buffer allocation and memory exposure for count-only queries. - Avoid daemon-side dummy entries vector allocation and resizing during RocksDB iteration. - Disabled at_parent/at_fork/at_child as it seems unneeded now - Moved some CMAKE options to config.hpp and env variables ([!285](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/285)) - LIBGKFS/ GKFS _SYMLINK_SUPPORT, _RENAME_SUPPORT and _CREATE_CHECK_PARENTS. Loading
src/client/rpc/forward_metadata.cpp +17 −7 Original line number Diff line number Diff line Loading @@ -866,13 +866,23 @@ forward_get_dirents_filtered(const std::string& path, int server, // Retry loop in case of buffer being too small while(true) { std::unique_ptr<char[]> large_buffer = std::unique_ptr<char[]>(new char[buffer_size]); std::unique_ptr<char[]> large_buffer; tl::bulk exposed_buffer; if(!count_only) { large_buffer = std::unique_ptr<char[]>(new char[buffer_size]); // expose the buffer std::vector<std::pair<void*, std::size_t>> segments; segments.emplace_back(large_buffer.get(), buffer_size); tl::bulk exposed_buffer = CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only); try { exposed_buffer = CTX->rpc_engine()->expose( segments, tl::bulk_mode::write_only); } catch(const std::exception& e) { LOG(ERROR, "Failed to expose buffer: {}", e.what()); err = EBUSY; break; } } in.bulk_handle = exposed_buffer; Loading
src/daemon/backend/metadata/rocksdb_backend.cpp +17 −16 Original line number Diff line number Diff line Loading @@ -138,6 +138,18 @@ RocksDBBackend::RocksDBBackend(const std::string& path) { options_.OptimizeLevelStyleCompaction(); options_.create_if_missing = true; options_.merge_operator.reset(new MetadataMergeOperator); options_.write_buffer_size = 128 * 1024 * 1024; options_.max_write_buffer_number = 4; options_.compression = rocksdb::kNoCompression; options_.max_background_jobs = 4; rocksdb::BlockBasedTableOptions table_options; table_options.block_cache = rocksdb::NewLRUCache(512 * 1024 * 1024); // 512MB Cache table_options.filter_policy.reset( rocksdb::NewBloomFilterPolicy(10)); // 10 bits per key bloom filter options_.table_factory.reset( rocksdb::NewBlockBasedTableFactory(table_options)); optimize_database_impl(); // Enable WAL if requested via environment Loading Loading @@ -670,6 +682,7 @@ RocksDBBackend::get_dirents_filtered_impl( std::string last_scanned_key; bool eof = true; size_t scanned_count = 0; size_t matched_count = 0; const size_t scan_limit = 5000000; // Limit scanned entries per PRC to avoid timeout Loading Loading @@ -764,16 +777,9 @@ RocksDBBackend::get_dirents_filtered_impl( } if(matched) { matched_count++; if(!count_only) { unsigned char type = 0; if(count_only) { // If counting only, we don't store the metadata // We just record a match implicitly (matched is true here) // However, we still need to respect max_entries for pagination // The actual count of matched items is what matters. 
entries.emplace_back(std::forward_as_tuple( "", type, 0, 0)); // Dummy entry to keep track of size/pagination } else { if(S_ISDIR(mode)) { type = 1; } else if(S_ISLNK(mode)) { Loading @@ -782,7 +788,7 @@ RocksDBBackend::get_dirents_filtered_impl( entries.emplace_back(std::forward_as_tuple( std::move(relative_name), type, size, ctime)); } if(max_entries > 0 && entries.size() >= max_entries) { if(max_entries > 0 && matched_count >= max_entries) { eof = false; break; } Loading @@ -793,11 +799,6 @@ RocksDBBackend::get_dirents_filtered_impl( last_scanned_key = ""; } size_t matched_count = entries.size(); if(count_only) { entries.clear(); } // assert(it->status().ok()); // only if eof check? return {entries, matched_count, scanned_count, last_scanned_key}; } Loading
src/proxy/rpc/forward_metadata.cpp +11 −0 Original line number Diff line number Diff line Loading @@ -152,6 +152,17 @@ forward_remove(const std::string& path, bool rm_dir) { // be removed, thus, we exit if(!S_ISREG(mode) || size == 0) return 0; // If inline data is enabled and the file size falls within the inline // threshold, the data is stored within the metadata entry (RocksDB) and // deleted automatically, so we can bypass sending remove_data RPCs to all // storage hosts. bool is_inline = gkfs::config::metadata::use_inline_data && static_cast<size_t>(size) <= gkfs::config::metadata::inline_data_size; if(is_inline) return 0; return remove_data(path); } Loading