Line data Source code
1 : /*
2 : Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain
3 : Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany
4 :
5 : This software was partially supported by the
6 : EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu).
7 :
8 : This software was partially supported by the
9 : ADA-FS project under the SPPEXA project funded by the DFG.
10 :
11 : This file is part of GekkoFS.
12 :
13 : GekkoFS is free software: you can redistribute it and/or modify
14 : it under the terms of the GNU General Public License as published by
15 : the Free Software Foundation, either version 3 of the License, or
16 : (at your option) any later version.
17 :
18 : GekkoFS is distributed in the hope that it will be useful,
19 : but WITHOUT ANY WARRANTY; without even the implied warranty of
20 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 : GNU General Public License for more details.
22 :
23 : You should have received a copy of the GNU General Public License
24 : along with GekkoFS. If not, see <https://www.gnu.org/licenses/>.
25 :
26 : SPDX-License-Identifier: GPL-3.0-or-later
27 : */
28 :
#include <daemon/backend/metadata/db.hpp>
#include <daemon/backend/metadata/merge.hpp>
#include <daemon/backend/exceptions.hpp>
#include <daemon/backend/metadata/metadata_module.hpp>

#include <common/metadata.hpp>
#include <common/path_util.hpp>
#include <iostream>
#include <memory>
#include <daemon/backend/metadata/rocksdb_backend.hpp>
extern "C" {
#include <sys/stat.h>
}
41 :
42 : namespace gkfs::metadata {
43 :
44 : /**
45 : * Called when the daemon is started: Connects to the KV store
46 : * @param path where KV store data is stored
47 : */
48 33 : RocksDBBackend::RocksDBBackend(const std::string& path) {
49 :
50 : // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
51 33 : options_.IncreaseParallelism();
52 33 : options_.OptimizeLevelStyleCompaction();
53 : // create the DB if it's not already present
54 33 : options_.create_if_missing = true;
55 33 : options_.merge_operator.reset(new MetadataMergeOperator);
56 33 : optimize_database_impl();
57 33 : write_opts_.disableWAL = !(gkfs::config::rocksdb::use_write_ahead_log);
58 33 : rdb::DB* rdb_ptr = nullptr;
59 33 : auto s = rocksdb::DB::Open(options_, path, &rdb_ptr);
60 33 : if(!s.ok()) {
61 0 : throw std::runtime_error("Failed to open RocksDB: " + s.ToString());
62 : }
63 33 : this->db_.reset(rdb_ptr);
64 33 : }
65 :
66 :
67 66 : RocksDBBackend::~RocksDBBackend() {
68 66 : this->db_.reset();
69 66 : }
70 :
71 : /**
72 : * Exception wrapper on Status object. Throws NotFoundException if
73 : * s.IsNotFound(), general DBException otherwise
74 : * @param RocksDB status
75 : * @throws DBException
76 : */
77 : void
78 24 : RocksDBBackend::throw_status_excpt(const rdb::Status& s) {
79 24 : assert(!s.ok());
80 :
81 24 : if(s.IsNotFound()) {
82 48 : throw NotFoundException(s.ToString());
83 : } else {
84 0 : throw DBException(s.ToString());
85 : }
86 : }
87 :
88 :
89 : /**
90 : * Gets a KV store value for a key
91 : * @param key
92 : * @return value
93 : * @throws DBException on failure, NotFoundException if entry doesn't exist
94 : */
95 : std::string
96 1406 : RocksDBBackend::get_impl(const std::string& key) const {
97 1406 : std::string val;
98 :
99 1430 : auto s = db_->Get(rdb::ReadOptions(), key, &val);
100 1406 : if(!s.ok()) {
101 24 : throw_status_excpt(s);
102 : }
103 :
104 1382 : return val;
105 : }
106 :
107 : /**
108 : * Puts an entry into the KV store
109 : * @param key
110 : * @param val
111 : * @throws DBException on failure
112 : */
113 : void
114 1097 : RocksDBBackend::put_impl(const std::string& key, const std::string& val) {
115 :
116 1097 : auto cop = CreateOperand(val);
117 2194 : auto s = db_->Merge(write_opts_, key, cop.serialize());
118 1097 : if(!s.ok()) {
119 0 : throw_status_excpt(s);
120 : }
121 1097 : }
122 :
123 : /**
124 : * Puts an entry into the KV store if it doesn't exist. This function does not
125 : * use a mutex.
126 : * @param key
127 : * @param val
128 : * @throws DBException on failure, ExistException if entry already exists
129 : */
130 : void
131 1100 : RocksDBBackend::put_no_exist_impl(const std::string& key,
132 : const std::string& val) {
133 :
134 1100 : if(exists(key))
135 3 : throw ExistsException(key);
136 1097 : put(key, val);
137 1097 : }
138 :
139 : /**
140 : * Removes an entry from the KV store
141 : * @param key
142 : * @throws DBException on failure, NotFoundException if entry doesn't exist
143 : */
144 : void
145 8 : RocksDBBackend::remove_impl(const std::string& key) {
146 :
147 8 : auto s = db_->Delete(write_opts_, key);
148 8 : if(!s.ok()) {
149 0 : throw_status_excpt(s);
150 : }
151 8 : }
152 :
153 : /**
154 : * checks for existence of an entry
155 : * @param key
156 : * @return true if exists
157 : * @throws DBException on failure
158 : */
159 : bool
160 1100 : RocksDBBackend::exists_impl(const std::string& key) {
161 :
162 2200 : std::string val;
163 :
164 2200 : auto s = db_->Get(rdb::ReadOptions(), key, &val);
165 1100 : if(!s.ok()) {
166 1097 : if(s.IsNotFound()) {
167 : return false;
168 : } else {
169 0 : throw_status_excpt(s);
170 : }
171 : }
172 : return true;
173 : }
174 :
175 : /**
176 : * Updates a metadentry atomically and also allows to change keys
177 : * @param old_key
178 : * @param new_key
179 : * @param val
180 : * @throws DBException on failure, NotFoundException if entry doesn't exist
181 : */
182 : void
183 31 : RocksDBBackend::update_impl(const std::string& old_key,
184 : const std::string& new_key,
185 : const std::string& val) {
186 :
187 : // TODO use rdb::Put() method
188 31 : rdb::WriteBatch batch;
189 31 : batch.Delete(old_key);
190 31 : batch.Put(new_key, val);
191 62 : auto s = db_->Write(write_opts_, &batch);
192 31 : if(!s.ok()) {
193 0 : throw_status_excpt(s);
194 : }
195 31 : }
196 :
197 : /**
198 : * Updates the size on the metadata
199 : * Operation. E.g., called before a write() call
200 : *
201 : * A special case represents the append operation. Since multiple processes
202 : * could want to append a file in parallel, the corresponding offsets where the
203 : * write operation starts, needs to be reserved. This is an expensive operation
204 : * as we need to force a RocksDB Merge operation to receive the starting offset
205 : * for this write request.
206 : *
207 : * @param key
208 : * @param io_size
209 : * @param offset
210 : * @param append
211 : * @return offset where the write operation should start. This is only used when
212 : * append is set
213 : */
214 : off_t
215 41 : RocksDBBackend::increase_size_impl(const std::string& key, size_t io_size,
216 : off_t offset, bool append) {
217 41 : off_t out_offset = -1;
218 41 : if(append) {
219 3 : auto merge_id = gkfs::metadata::gen_unique_id(key);
220 : // no offset needed because new size is current file size + io_size
221 3 : auto uop = IncreaseSizeOperand(io_size, merge_id, append);
222 6 : auto s = db_->Merge(write_opts_, key, uop.serialize());
223 3 : if(!s.ok()) {
224 0 : throw_status_excpt(s);
225 : } else {
226 : // force merge operation to run
227 3 : get_impl(key);
228 3 : try {
229 : // the offset was added during FullMergeV2() call
230 3 : out_offset =
231 3 : GKFS_METADATA_MOD->append_offset_reserve_get_and_erase(
232 : merge_id);
233 0 : } catch(std::out_of_range& e) {
234 0 : GKFS_METADATA_MOD->log()->warn(
235 : "{}() - out_of_range exception: {} when attempting to get offset for key {}",
236 0 : __func__, e.what(), key);
237 : }
238 : }
239 : } else {
240 : // In the standard case we simply add the I/O request size to the
241 : // offset.
242 38 : auto uop = IncreaseSizeOperand(offset + io_size);
243 76 : auto s = db_->Merge(write_opts_, key, uop.serialize());
244 38 : if(!s.ok()) {
245 0 : throw_status_excpt(s);
246 : }
247 : }
248 41 : return out_offset;
249 : }
250 :
251 : /**
252 : * Decreases the size on the metadata
253 : * Operation E.g., called before a truncate() call
254 : * @param key
255 : * @param size
256 : * @throws DBException on failure
257 : */
258 : void
259 3 : RocksDBBackend::decrease_size_impl(const std::string& key, size_t size) {
260 :
261 3 : auto uop = DecreaseSizeOperand(size);
262 3 : auto s = db_->Merge(write_opts_, key, uop.serialize());
263 3 : if(!s.ok()) {
264 0 : throw_status_excpt(s);
265 : }
266 3 : }
267 :
268 : /**
269 : * Return all the first-level entries of the directory @dir
270 : *
271 : * @return vector of pair <std::string name, bool is_dir>,
272 : * where name is the name of the entries and is_dir
273 : * is true in the case the entry is a directory.
274 : */
275 : std::vector<std::pair<std::string, bool>>
276 25 : RocksDBBackend::get_dirents_impl(const std::string& dir) const {
277 25 : auto root_path = dir;
278 50 : rocksdb::ReadOptions ropts;
279 25 : auto it = db_->NewIterator(ropts);
280 :
281 25 : std::vector<std::pair<std::string, bool>> entries;
282 1072 : for(it->Seek(root_path); it->Valid() && it->key().starts_with(root_path);
283 1047 : it->Next()) {
284 :
285 1047 : if(it->key().size() == root_path.size()) {
286 : // we skip this path cause it is exactly the root_path
287 6 : continue;
288 : }
289 :
290 : /***** Get File name *****/
291 2070 : auto name = it->key().ToString();
292 1041 : if(name.find_first_of('/', root_path.size()) != std::string::npos) {
293 : // skip stuff deeper then one level depth
294 29 : continue;
295 : }
296 : // remove prefix
297 1030 : name = name.substr(root_path.size());
298 :
299 : // relative path of directory entries must not be empty
300 1030 : assert(!name.empty());
301 :
302 2059 : Metadata md(it->value().ToString());
303 : #ifdef HAS_RENAME
304 : // Remove entries with negative blocks (rename)
305 1030 : if(md.blocks() == -1) {
306 13 : continue;
307 : }
308 : #endif // HAS_RENAME
309 1029 : auto is_dir = S_ISDIR(md.mode());
310 :
311 1029 : entries.emplace_back(std::move(name), is_dir);
312 : }
313 25 : assert(it->status().ok());
314 50 : return entries;
315 : }
316 :
317 : /**
318 : * Return all the first-level entries of the directory @dir
319 : *
320 : * @return vector of pair <std::string name, bool is_dir - size - ctime>,
321 : * where name is the name of the entries and is_dir
322 : * is true in the case the entry is a directory.
323 : */
324 : std::vector<std::tuple<std::string, bool, size_t, time_t>>
325 4 : RocksDBBackend::get_dirents_extended_impl(const std::string& dir) const {
326 4 : auto root_path = dir;
327 8 : rocksdb::ReadOptions ropts;
328 4 : auto it = db_->NewIterator(ropts);
329 :
330 4 : std::vector<std::tuple<std::string, bool, size_t, time_t>> entries;
331 :
332 9 : for(it->Seek(root_path); it->Valid() && it->key().starts_with(root_path);
333 5 : it->Next()) {
334 :
335 5 : if(it->key().size() == root_path.size()) {
336 : // we skip this path cause it is exactly the root_path
337 0 : continue;
338 : }
339 :
340 : /***** Get File name *****/
341 9 : auto name = it->key().ToString();
342 5 : if(name.find_first_of('/', root_path.size()) != std::string::npos) {
343 : // skip stuff deeper then one level depth
344 2 : continue;
345 : }
346 : // remove prefix
347 4 : name = name.substr(root_path.size());
348 :
349 : // relative path of directory entries must not be empty
350 4 : assert(!name.empty());
351 :
352 8 : Metadata md(it->value().ToString());
353 : #ifdef HAS_RENAME
354 : // Remove entries with negative blocks (rename)
355 4 : if(md.blocks() == -1) {
356 1 : continue;
357 : }
358 : #endif // HAS_RENAME
359 4 : auto is_dir = S_ISDIR(md.mode());
360 :
361 4 : entries.emplace_back(std::forward_as_tuple(std::move(name), is_dir,
362 4 : md.size(), md.ctime()));
363 : }
364 4 : assert(it->status().ok());
365 8 : return entries;
366 : }
367 :
368 :
369 : /**
370 : * Code example for iterating all entries in KV store. This is for debug only as
371 : * it is too expensive
372 : */
373 : void
374 0 : RocksDBBackend::iterate_all_impl() const {
375 0 : std::string key;
376 0 : std::string val;
377 : // Do RangeScan on parent inode
378 0 : auto iter = db_->NewIterator(rdb::ReadOptions());
379 0 : for(iter->SeekToFirst(); iter->Valid(); iter->Next()) {
380 0 : key = iter->key().ToString();
381 0 : val = iter->value().ToString();
382 0 : std::cout << key << std::endl;
383 : }
384 0 : }
385 :
386 : /**
387 : * Used for setting KV store settings
388 : */
389 : void
390 33 : RocksDBBackend::optimize_database_impl() {
391 33 : options_.max_successive_merges = 128;
392 33 : }
393 :
394 :
395 : } // namespace gkfs::metadata
|