Commit b0d22cf7 authored by Ramon Nou's avatar Ramon Nou
Browse files

Merge branch 'rnou/inlinedelete' into 'master'

delete inline corrected


See merge request !307
parents e1e8f854 0107e91a
Loading
Loading
Loading
Loading
Loading
+9 −6
Original line number Diff line number Diff line
@@ -7,12 +7,6 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### New
  - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306))
    - Added client-side asynchronous write caching for data writes to improve IO500 performance.
    - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`.
  - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305))
    - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks.
    - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`.
  - Directory optimization with compression and reattempt ([!270](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/270))
    - Refactor sfind so it can use SLURM_ environment variables to query different servers.
    - Create a sample bash script to gather all the info (map->reduce)
@@ -44,11 +38,20 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
    - Use LIBGKFS_ENABLE_FORK=1 to enable fork support in the client library.
    - This is used for example in DLIO.
  - Shrink capabilities added ([!304](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/304))
  - Client-side asynchronous write cache with async flushing ([!306](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/306))
    - Added client-side asynchronous write caching for data writes to improve IO500 performance.
    - Introduced new environment variable: `LIBGKFS_ASYNC_WRITE`.
  - Metadata batching ([!305](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/305))
    - Added client-side metadata batching for file/node creation to reduce metadata RPC bottlenecks.
    - Introduced new environment variables: `LIBGKFS_METADATA_BATCH` and `LIBGKFS_METADATA_BATCH_THRESHOLD`.
  
 


### Changed 
  - Optimized count-only filtered directory listing (`sfind` with `-C`) ([!307](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/307))
    - Avoid client-side buffer allocation and memory exposure for count-only queries.
    - Avoid daemon-side dummy entries vector allocation and resizing during RocksDB iteration.
  - Disabled at_parent/at_fork/at_child as they seem unneeded now
  - Moved some CMAKE options to config.hpp and env variables ([!285](https://storage.bsc.es/gitlab/hpc/gekkofs/-/merge_requests/285))
    - LIBGKFS/ GKFS _SYMLINK_SUPPORT, _RENAME_SUPPORT and _CREATE_CHECK_PARENTS.
+17 −7
Original line number Diff line number Diff line
@@ -866,13 +866,23 @@ forward_get_dirents_filtered(const std::string& path, int server,

    // Retry loop in case of buffer being too small
    while(true) {
        std::unique_ptr<char[]> large_buffer =
                std::unique_ptr<char[]>(new char[buffer_size]);
        std::unique_ptr<char[]> large_buffer;
        tl::bulk exposed_buffer;

        if(!count_only) {
            large_buffer = std::unique_ptr<char[]>(new char[buffer_size]);
            // expose the buffer
            std::vector<std::pair<void*, std::size_t>> segments;
            segments.emplace_back(large_buffer.get(), buffer_size);
        tl::bulk exposed_buffer =
                CTX->rpc_engine()->expose(segments, tl::bulk_mode::write_only);
            try {
                exposed_buffer = CTX->rpc_engine()->expose(
                        segments, tl::bulk_mode::write_only);
            } catch(const std::exception& e) {
                LOG(ERROR, "Failed to expose buffer: {}", e.what());
                err = EBUSY;
                break;
            }
        }

        in.bulk_handle = exposed_buffer;

+17 −16
Original line number Diff line number Diff line
@@ -138,6 +138,18 @@ RocksDBBackend::RocksDBBackend(const std::string& path) {
    options_.OptimizeLevelStyleCompaction();
    options_.create_if_missing = true;
    options_.merge_operator.reset(new MetadataMergeOperator);
    options_.write_buffer_size = 128 * 1024 * 1024;
    options_.max_write_buffer_number = 4;
    options_.compression = rocksdb::kNoCompression;
    options_.max_background_jobs = 4;
    rocksdb::BlockBasedTableOptions table_options;
    table_options.block_cache =
            rocksdb::NewLRUCache(512 * 1024 * 1024); // 512MB Cache
    table_options.filter_policy.reset(
            rocksdb::NewBloomFilterPolicy(10)); // 10 bits per key bloom filter
    options_.table_factory.reset(
            rocksdb::NewBlockBasedTableFactory(table_options));

    optimize_database_impl();

    // Enable WAL if requested via environment
@@ -670,6 +682,7 @@ RocksDBBackend::get_dirents_filtered_impl(
    std::string last_scanned_key;
    bool eof = true;
    size_t scanned_count = 0;
    size_t matched_count = 0;
    const size_t scan_limit =
            5000000; // Limit scanned entries per PRC to avoid timeout

@@ -764,16 +777,9 @@ RocksDBBackend::get_dirents_filtered_impl(
        }

        if(matched) {
            matched_count++;
            if(!count_only) {
                unsigned char type = 0;
            if(count_only) {
                // If counting only, we don't store the metadata
                // We just record a match implicitly (matched is true here)
                // However, we still need to respect max_entries for pagination
                // The actual count of matched items is what matters.
                entries.emplace_back(std::forward_as_tuple(
                        "", type, 0,
                        0)); // Dummy entry to keep track of size/pagination
            } else {
                if(S_ISDIR(mode)) {
                    type = 1;
                } else if(S_ISLNK(mode)) {
@@ -782,7 +788,7 @@ RocksDBBackend::get_dirents_filtered_impl(
                entries.emplace_back(std::forward_as_tuple(
                        std::move(relative_name), type, size, ctime));
            }
            if(max_entries > 0 && entries.size() >= max_entries) {
            if(max_entries > 0 && matched_count >= max_entries) {
                eof = false;
                break;
            }
@@ -793,11 +799,6 @@ RocksDBBackend::get_dirents_filtered_impl(
        last_scanned_key = "";
    }

    size_t matched_count = entries.size();
    if(count_only) {
        entries.clear();
    }

    // assert(it->status().ok()); // only if eof check?
    return {entries, matched_count, scanned_count, last_scanned_key};
}
+11 −0
Original line number Diff line number Diff line
@@ -152,6 +152,17 @@ forward_remove(const std::string& path, bool rm_dir) {
    // be removed, thus, we exit
    if(!S_ISREG(mode) || size == 0)
        return 0;

    // If inline data is enabled and the file size falls within the inline
    // threshold, the data is stored within the metadata entry (RocksDB) and
    // deleted automatically, so we can bypass sending remove_data RPCs to all
    // storage hosts.
    bool is_inline = gkfs::config::metadata::use_inline_data &&
                     static_cast<size_t>(size) <=
                             gkfs::config::metadata::inline_data_size;
    if(is_inline)
        return 0;

    return remove_data(path);
}