diff --git a/CHANGELOG.md b/CHANGELOG.md index 021f95cc7dbb9e249d4dd5acfe5752b05ced198f..97c74618bef59cc4d8ffa59791c30117be633491 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,6 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] ### New - - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) - - Added client-side asynchronous write caching for data writes to improve IO500 performance. - - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. - - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) - - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. - - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. - directory optimization with compression and reattemp ([!270](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/270)) - Refactor sfind so it can use SLURM_ environment variables to ask to different servers. - Create a sample bash script to gather all the info (map->reduce) @@ -44,11 +38,20 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - Use LIBGKFS_ENABLE_FORK=1 to enable fork support in the client library. - This is used for example in DLIO. - Shrink capabilities added ([!304](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/304)) + - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306)) + - Added client-side asynchronous write caching for data writes to improve IO500 performance. + - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`. + - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305)) + - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks. + - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`. ### Changed + - Optimized count-only filtered directory listing (`sfind` with `-C`) ([!307](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/307)) + - Avoid client-side buffer allocation and memory exposure for count-only queries. + - Avoid daemon-side dummy entries vector allocation and resizing during RocksDB iteration. - Disabled at_parent/at_fork/at_child as it seems unneded now - Moved some CMAKE options to config.hpp and env variables ([!285](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/285)) - LIBGKFS/ GKFS _SYMLINK_SUPPORT, _RENAME_SUPPORT and _CREATE_CHECK_PARENTS. diff --git a/src/client/rpc/forward_metadata.cpp b/src/client/rpc/forward_metadata.cpp index 2208139ad5cf5d3d9fddf5e827c1366e9cfff7d8..d7a925f6c315be78368536f7d5a1a4134cb8cce0 100644 --- a/src/client/rpc/forward_metadata.cpp +++ b/src/client/rpc/forward_metadata.cpp @@ -866,13 +866,23 @@ forward_get_dirents_filtered(const std::string& path, int server, // Retry loop in case of buffer being too small while(true) { - std::unique_ptr large_buffer = - std::unique_ptr(new char[buffer_size]); - // expose the buffer - std::vector> segments; - segments.emplace_back(large_buffer.get(), buffer_size); - tl::bulk exposed_buffer = - CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only); + std::unique_ptr large_buffer; + tl::bulk exposed_buffer; + + if(!count_only) { + large_buffer = std::unique_ptr(new char[buffer_size]); + // expose the buffer + std::vector> segments; + segments.emplace_back(large_buffer.get(), buffer_size); + try { + exposed_buffer = CTX->rpc_engine()->expose( + segments, tl::bulk_mode::write_only); + } catch(const std::exception& e) { + LOG(ERROR, "Failed to expose buffer: {}", e.what()); + err = EBUSY; + break; + } + } in.bulk_handle = exposed_buffer; diff --git a/src/daemon/backend/metadata/rocksdb_backend.cpp b/src/daemon/backend/metadata/rocksdb_backend.cpp index bf0ece2539eb6bd373855db4bbaadc50e90995fc..471145fe025a787903a61133938308114e0c819b 100644 --- a/src/daemon/backend/metadata/rocksdb_backend.cpp +++ b/src/daemon/backend/metadata/rocksdb_backend.cpp @@ -138,6 +138,18 @@ RocksDBBackend::RocksDBBackend(const std::string& path) { options_.OptimizeLevelStyleCompaction(); options_.create_if_missing = true; options_.merge_operator.reset(new MetadataMergeOperator); + options_.write_buffer_size = 128 * 1024 * 1024; + options_.max_write_buffer_number = 4; + options_.compression = rocksdb::kNoCompression; + options_.max_background_jobs = 4; + rocksdb::BlockBasedTableOptions table_options; + table_options.block_cache = + rocksdb::NewLRUCache(512 * 1024 * 1024); // 512MB Cache + table_options.filter_policy.reset( + rocksdb::NewBloomFilterPolicy(10)); // 10 bits per key bloom filter + options_.table_factory.reset( + rocksdb::NewBlockBasedTableFactory(table_options)); + optimize_database_impl(); // Enable WAL if requested via environment @@ -670,6 +682,7 @@ RocksDBBackend::get_dirents_filtered_impl( std::string last_scanned_key; bool eof = true; size_t scanned_count = 0; + size_t matched_count = 0; const size_t scan_limit = 5000000; // Limit scanned entries per PRC to avoid timeout @@ -764,16 +777,9 @@ RocksDBBackend::get_dirents_filtered_impl( } if(matched) { - unsigned char type = 0; - if(count_only) { - // If counting only, we don't store the metadata - // We just record a match implicitly (matched is true here) - // However, we still need to respect max_entries for pagination - // The actual count of matched items is what matters. - entries.emplace_back(std::forward_as_tuple( - "", type, 0, - 0)); // Dummy entry to keep track of size/pagination - } else { + matched_count++; + if(!count_only) { + unsigned char type = 0; if(S_ISDIR(mode)) { type = 1; } else if(S_ISLNK(mode)) { @@ -782,7 +788,7 @@ RocksDBBackend::get_dirents_filtered_impl( entries.emplace_back(std::forward_as_tuple( std::move(relative_name), type, size, ctime)); } - if(max_entries > 0 && entries.size() >= max_entries) { + if(max_entries > 0 && matched_count >= max_entries) { eof = false; break; } @@ -793,11 +799,6 @@ RocksDBBackend::get_dirents_filtered_impl( last_scanned_key = ""; } - size_t matched_count = entries.size(); - if(count_only) { - entries.clear(); - } - // assert(it->status().ok()); // only if eof check? return {entries, matched_count, scanned_count, last_scanned_key}; } diff --git a/src/proxy/rpc/forward_metadata.cpp b/src/proxy/rpc/forward_metadata.cpp index c3c61ab2a2f974b18e9eeadc1ea40ba3fd2f0483..a4f296742afff6f5a6501436c5ee16061a5a0710 100644 --- a/src/proxy/rpc/forward_metadata.cpp +++ b/src/proxy/rpc/forward_metadata.cpp @@ -152,6 +152,17 @@ forward_remove(const std::string& path, bool rm_dir) { // be removed, thus, we exit if(!S_ISREG(mode) || size == 0) return 0; + + // If inline data is enabled and the file size falls within the inline + // threshold, the data is stored within the metadata entry (RocksDB) and + // deleted automatically, so we can bypass sending remove_data RPCs to all + // storage hosts. + bool is_inline = gkfs::config::metadata::use_inline_data && + static_cast(size) <= + gkfs::config::metadata::inline_data_size; + if(is_inline) + return 0; + return remove_data(path); }