Commit cd02cdfb authored by Frederic Schimmelpfennig's avatar Frederic Schimmelpfennig
Browse files

working version Mogon2 GPU Tensorflow benchmarks

parent 50bb37b0
Loading
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ DEP_CONFIG=""
VALID_DEP_OPTIONS="mogongpu mogon2 mogon1 ngio direct all ci"

MOGONGPU_DEPS=(
    "zstd" "lz4" "snappy" "capstone" "mercury" "argobots" "json-c" "margo" "rocksdb"
    "zstd" "lz4" "snappy" "capstone" "ofi" "mercury" "argobots" "json-c" "margo" "rocksdb"
    "syscall_intercept" "date" "verbs"
)

+1 −1
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ VERBOSE=false
VALID_DEP_OPTIONS="mogongpu mogon2 mogon1 ngio direct all ci"

MOGONGPU_DEPS=(
    "bzip2" "zstd" "lz4" "snappy" "capstone" "mercury-experimental" "argobots" "json-c" "margo" "rocksdb-experimental"
    "bzip2" "zstd" "lz4" "snappy" "capstone" "ofi-experimental" "mercury-experimental" "argobots" "json-c" "margo" "rocksdb-experimental"
    "syscall_intercept" "date"
)

+4 −1
Original line number Diff line number Diff line
@@ -111,7 +111,10 @@ extract_protocol(const string& uri) {
        protocol = gkfs::rpc::protocol::ofi_psm2;
    } else if(uri.find(gkfs::rpc::protocol::ofi_verbs) != string::npos) {
        protocol = gkfs::rpc::protocol::ofi_verbs;
    } else if(uri.find(gkfs::rpc::protocol::ofi_tcp) != string::npos) {
        protocol = gkfs::rpc::protocol::ofi_tcp;
    }

    // check for shared memory protocol. Can be plain shared memory or real ofi
    // protocol + auto_sm
    if(uri.find(gkfs::rpc::protocol::na_sm) != string::npos) {
+48 −36
Original line number Diff line number Diff line
@@ -334,6 +334,11 @@ forward_read(const string& path, void* buf, const off64_t offset,

        auto endp = CTX->hosts().at(target);

        auto retry_max = 5;

        auto retry_cnt = 0;
        do {
            retry_cnt++;
            try {

                LOG(DEBUG, "Sending RPC ...");
@@ -363,15 +368,22 @@ forward_read(const string& path, void* buf, const off64_t offset,

                LOG(DEBUG, "host: {}, path: {}, chunks: {}, size: {}, offset: {}",
                    target, path, in.chunk_n(), total_chunk_size, in.offset());

                break;
            } catch(const std::exception& ex) {
                LOG(ERROR,
                    "Unable to send non-blocking rpc for path \"{}\" "
                "[peer: {}]",
                path, target);
                    "[peer: {}]. Try: '{}' (max '{}' tries",
                    path, target, retry_cnt, retry_max);
                if (retry_cnt == retry_max) {
                    LOG(ERROR,
                        "Exceeded all retries... fail");
                    return make_pair(EBUSY, 0);
                }
            }
        } while(retry_cnt != retry_max);

    }


    // Wait for RPC responses and then get response and add it to out_size
    // which is the read size. All potential outputs are served to free
+4 −0
Original line number Diff line number Diff line
@@ -216,6 +216,10 @@ ChunkStorage::read_chunk(const string& file_path, gkfs::rpc::chnk_id_t chunk_id,

    FileHandle fh(open(chunk_path.c_str(), O_RDONLY), chunk_path);
    if(!fh.valid()) {
        if (errno == ENOENT && chunk_id > 0) {
            //log_->warn("XXX {}(): Weird corner case. Return 0 read no error? path '{}' chunkid '{}' size '{}' offset '{}' ", __func__, file_path, chunk_id, size, offset);
            return 0;
        }
        auto err_str = fmt::format(
                "{}() Failed to open chunk file for read. File: '{}', Error: '{}'",
                __func__, chunk_path, ::strerror(errno));
Loading