Line data Source code
1 : /*
2 : Copyright 2018-2024, Barcelona Supercomputing Center (BSC), Spain
3 : Copyright 2015-2024, Johannes Gutenberg Universitaet Mainz, Germany
4 :
5 : This software was partially supported by the
6 : EC H2020 funded project NEXTGenIO (Project ID: 671951, www.nextgenio.eu).
7 :
8 : This software was partially supported by the
9 : ADA-FS project under the SPPEXA project funded by the DFG.
10 :
11 : This file is part of GekkoFS.
12 :
13 : GekkoFS is free software: you can redistribute it and/or modify
14 : it under the terms of the GNU General Public License as published by
15 : the Free Software Foundation, either version 3 of the License, or
16 : (at your option) any later version.
17 :
18 : GekkoFS is distributed in the hope that it will be useful,
19 : but WITHOUT ANY WARRANTY; without even the implied warranty of
20 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 : GNU General Public License for more details.
22 :
23 : You should have received a copy of the GNU General Public License
24 : along with GekkoFS. If not, see <https://www.gnu.org/licenses/>.
25 :
26 : SPDX-License-Identifier: GPL-3.0-or-later
27 : */
28 :
29 : /* Based on pfind from ior500 */
30 : /* https://github.com/VI4IO/pfind/ */
31 :
32 : #include <cmath>
33 : #include <cstring>
34 : #include <getopt.h>
35 : #include <iostream>
36 : #include <queue>
37 : #include <regex.h>
38 : #include <stdio.h>
39 : #include <string>
40 : #include <sys/stat.h>
41 : #include <sys/types.h>
42 : #include <unistd.h>
43 : #include <limits>
44 : #include <cstdint>
45 :
46 : using namespace std;
47 :
/*
 * Minimal directory-entry record needed for the io500 find.
 * (The filtering could also be done on the server instead.)
 *
 * dirProcess() walks a buffer filled with these records, stepping from one
 * record to the next by d_reclen bytes. The member order and types are left
 * untouched because the buffer is filled by gkfs_getsingleserverdir().
 */
struct dirent_extended {
    size_t size;             // compared against the -size filter
    time_t ctime;            // compared against the -newer timestamp
    unsigned short d_reclen; // byte distance from this record to the next
    unsigned char d_type;    // dirProcess() treats the value 1 as "directory"
    char d_name[1];          // first byte of the NUL-terminated entry name
};
57 :
/*
 * Function exported from the GekkoFS LD_PRELOAD library; this code needs to
 * be compiled with -fPIC.
 *
 * The symbol is declared weak. As used by dirProcess():
 *   path   - directory to list
 *   dirp   - caller-owned buffer that receives dirent_extended records
 *   count  - size of that buffer in bytes
 *   server - index of the server to query (0 .. num_servers - 1)
 * The return value is compared against the running sum of d_reclen, so it is
 * presumably the number of bytes written to dirp -- TODO confirm against the
 * GekkoFS client implementation.
 */
extern "C" {
int gkfs_getsingleserverdir(const char *path, struct dirent_extended *dirp,
                            unsigned int count, int server)
        __attribute__((weak));
}
64 :
/*
 * pfind options, extended with the GekkoFS mount directory and the number of
 * GekkoFS servers. Filled in by pfind_parse_args().
 */
typedef struct {
    char *workdir;        // first positional argument
    int just_count;       // set by -C
    int print_by_process; // set by -P
    char *results_dir;    // argument of -r
    int stonewall_timer;  // argument of -s
    int print_rates;      // set by "-D rates"

    char *timestamp_file; // argument of -newer; nullptr when not given
    char *name_pattern;   // regex source built from -name, or taken from -regex
    regex_t name_regex;   // name_pattern compiled with regcomp()
    uint64_t size;        // argument of -size; UINT64_MAX means "no size filter"

    int num_servers;      // argument of -S
    char *mountdir;       // argument of -M; nullptr when not given

    // optimizing parameters NOT USED
    int queue_length;               // argument of -q (default 100000)
    int max_entries_per_iter;       // default 1000
    int steal_from_next;            // if true, then steal from the next process
    int parallel_single_dir_access; // if 1, use hashing to parallelize single
                                    // directory access, if 2 sequential
                                    // increment

    int verbosity;        // incremented once per -v
} pfind_options_t;
91 :
/*
 * State derived at run time rather than taken from the command line.
 * process() zeroes the whole structure before use.
 */
typedef struct {
    uint64_t ctime_min;       // st_ctime of the -newer file; older entries are
                              // filtered out by dirProcess()
    double stonewall_endtime; // not referenced by the code in this file
    FILE *logfile;            // not referenced by the code in this file
    int needs_stat;           // not referenced by the code in this file
} pfind_runtime_options_t;
98 :
99 : static pfind_runtime_options_t runtime;
100 :
101 : int pfind_rank;
102 :
103 : static pfind_options_t *opt;
104 :
/*
 * Write str to stdout exactly as given (no newline is appended) and terminate
 * the process with exit status 1.
 */
void pfind_abort(const std::string str) {
    fputs(str.c_str(), stdout);
    exit(1);
}
109 :
110 0 : static void pfind_print_help(pfind_options_t *res) {
111 0 : printf("pfind \nSynopsis:\n"
112 : "pfind <workdir> [-newer <timestamp file>] [-size <size>c] [-name "
113 : "<substr>] [-regex <regex>] [-S <numserver>] [-M <mountdir>]\n"
114 : "\tworkdir = \"%s\"\n"
115 : "\t-newer = \"%s\"\n"
116 : "\t-name|-regex = \"%s\"\n"
117 : "\t-S: num servers = \"%d\"\n"
118 : "\t-M: mountdir = \"%s\"\n"
119 : "Optional flags\n"
120 : "\t-h: prints the help\n"
121 : "\t--help: prints the help without initializing MPI\n",res->workdir,
122 : res->timestamp_file, res->name_pattern, res->num_servers,
123 : res->mountdir );
124 0 : }
125 : int pfind_size;
126 1 : pfind_options_t *pfind_parse_args(int argc, char **argv, int force_print_help){
127 :
128 1 : pfind_rank = 0;
129 1 : pfind_size = 1;
130 :
131 1 : pfind_options_t *res = (pfind_options_t *)malloc(sizeof(pfind_options_t));
132 : // Init Values
133 1 : res->just_count = 0;
134 1 : res->print_by_process = 0;
135 1 : res->stonewall_timer = 0;
136 1 : res->print_rates = 0;
137 1 : res->name_regex = {};
138 1 : res->num_servers = 0;
139 1 : res->mountdir = nullptr;
140 1 : res->queue_length = 0;
141 1 : res->max_entries_per_iter = 0;
142 1 : res->steal_from_next = 0;
143 1 : res->parallel_single_dir_access = 0;
144 :
145 1 : auto print_help = force_print_help;
146 1 : res->workdir = nullptr;
147 1 : res->results_dir = nullptr;
148 1 : res->verbosity = 0;
149 1 : res->timestamp_file = nullptr;
150 1 : res->name_pattern = nullptr;
151 :
152 1 : res->size = std::numeric_limits<uint64_t>::max();
153 1 : res->queue_length = 100000;
154 1 : res->max_entries_per_iter = 1000;
155 1 : char *firstarg = nullptr;
156 :
157 : // when we find special args, we process them
158 : // but we need to replace them with 0 so that getopt will ignore them
159 : // and getopt will continue to process beyond them
160 7 : for (auto i = 1; i < argc - 1; i++) {
161 6 : if (strcmp(argv[i], "-newer") == 0) {
162 0 : res->timestamp_file = strdup(argv[i + 1]);
163 0 : argv[i][0] = 0;
164 0 : argv[++i][0] = 0;
165 6 : } else if (strcmp(argv[i], "-size") == 0) {
166 0 : char *str = argv[i + 1];
167 0 : char extension = str[strlen(str) - 1];
168 0 : str[strlen(str) - 1] = 0;
169 0 : res->size = atoll(str);
170 0 : switch (extension) {
171 : case 'c':
172 : break;
173 0 : default:
174 0 : pfind_abort("Unsupported exension for -size\n");
175 : }
176 0 : argv[i][0] = 0;
177 0 : argv[++i][0] = 0;
178 6 : } else if (strcmp(argv[i], "-name") == 0) {
179 1 : res->name_pattern = (char *)malloc(strlen(argv[i + 1]) * 4 + 100);
180 : // transform a traditional name pattern to a regex:
181 1 : char *str = argv[i + 1];
182 1 : char *out = res->name_pattern;
183 1 : auto pos = 0;
184 5 : for (long unsigned int i = 0; i < strlen(str); i++) {
185 4 : if (str[i] == '*') {
186 2 : pos += sprintf(out + pos, ".*");
187 2 : } else if (str[i] == '.') {
188 0 : pos += sprintf(out + pos, "[.]");
189 2 : } else if (str[i] == '"' || str[i] == '\"') {
190 : // erase the "
191 : } else {
192 2 : out[pos] = str[i];
193 2 : pos++;
194 : }
195 : }
196 1 : out[pos] = 0;
197 :
198 1 : int ret = regcomp(&res->name_regex, res->name_pattern, 0);
199 1 : if (ret) {
200 0 : pfind_abort("Invalid regex for name given\n");
201 : }
202 1 : argv[i][0] = 0;
203 1 : argv[++i][0] = 0;
204 5 : } else if (strcmp(argv[i], "-regex") == 0) {
205 0 : res->name_pattern = strdup(argv[i + 1]);
206 0 : int ret = regcomp(&res->name_regex, res->name_pattern, 0);
207 0 : if (ret) {
208 0 : pfind_abort("Invalid regex for name given\n");
209 : }
210 0 : argv[i][0] = 0;
211 0 : argv[++i][0] = 0;
212 5 : } else if (!firstarg) {
213 1 : firstarg = strdup(argv[i]);
214 1 : argv[i][0] = 0;
215 : }
216 : }
217 1 : if (argc == 2) {
218 0 : firstarg = strdup(argv[1]);
219 : }
220 :
221 3 : int c;
222 3 : while ((c = getopt(argc, argv, "CPs:r:vhD:xq:H:NM:S:")) != -1) {
223 2 : if (c == -1) {
224 : break;
225 : }
226 :
227 2 : switch (c) {
228 0 : case 'H':
229 0 : res->parallel_single_dir_access = atoi(optarg);
230 0 : break;
231 0 : case 'N':
232 0 : res->steal_from_next = 1;
233 0 : break;
234 : case 'x':
235 : /* ignore fake arg that we added when we processed the extra args */
236 : break;
237 0 : case 'P':
238 0 : res->print_by_process = 1;
239 0 : break;
240 0 : case 'C':
241 0 : res->just_count = 1;
242 0 : break;
243 0 : case 'D':
244 0 : if (strcmp(optarg, "rates") == 0) {
245 0 : res->print_rates = 1;
246 : } else {
247 0 : pfind_abort("Unsupported debug flag\n");
248 : }
249 : break;
250 0 : case 'h':
251 0 : print_help = 1;
252 0 : break;
253 0 : case 'r':
254 0 : res->results_dir = strdup(optarg);
255 0 : break;
256 0 : case 'q':
257 0 : res->queue_length = atoi(optarg);
258 0 : break;
259 : if (res->queue_length < 10) {
260 : pfind_abort("Queue must be at least 10 elements!\n");
261 : }
262 : break;
263 0 : case 's':
264 0 : res->stonewall_timer = atol(optarg);
265 0 : break;
266 1 : case 'S':
267 1 : res->num_servers = atoi(optarg);
268 1 : break;
269 1 : case 'M':
270 1 : res->mountdir = strdup(optarg);
271 1 : break;
272 0 : case 'v':
273 0 : res->verbosity++;
274 0 : break;
275 : case 0:
276 : break;
277 : }
278 : }
279 1 : if (res->verbosity > 2 && pfind_rank == 0) {
280 0 : printf("Regex: %s\n", res->name_pattern);
281 : }
282 :
283 1 : if (print_help) {
284 0 : if (pfind_rank == 0)
285 0 : pfind_print_help(res);
286 0 : exit(0);
287 : }
288 :
289 1 : if (!firstarg) {
290 0 : pfind_abort("Error: pfind <directory>\n");
291 : }
292 1 : res->workdir = firstarg;
293 :
294 1 : return res;
295 : }
296 :
297 : /* Client Processing a path.
298 : * We increment local checked/found based on the filters
299 : * Each client sends the request to a subset of GekkoFS servers.
300 : * We use 102400 (plus space from 255 chars paths) so it is nearly 1M files per
301 : * server, which is enough for most cases
302 : *
303 : */
304 4 : void dirProcess(const string path, unsigned long long &checked,
305 : unsigned long long &found, queue<string> &dirs,
306 : unsigned int world_rank, unsigned int world_size,
307 : pfind_options_t *opt) {
308 4 : struct dirent_extended *getdir = (struct dirent_extended *)malloc(
309 : (sizeof(struct dirent_extended) + 255) * 1024 * 100);
310 4 : memset(getdir, 0, (sizeof(struct dirent_extended) + 255) * 1024 * 100);
311 : // cout << "PROCESSING " << world_rank << "/"<< world_size << " = " << path <<
312 : // endl;
313 :
314 8 : for (auto server = 0; server < opt->num_servers; server++) {
315 4 : unsigned long long total_size = 0;
316 4 : long unsigned int n = gkfs_getsingleserverdir(
317 : path.c_str(), getdir,
318 4 : (sizeof(struct dirent_extended) + 255) * 1024 * 100, server);
319 4 : struct dirent_extended *temp = getdir;
320 :
321 8 : while (total_size < n) {
322 6 : if (strlen(temp->d_name) == 0)
323 : break;
324 4 : total_size += temp->d_reclen;
325 : /* Queue directory to process */
326 4 : if (temp->d_type == 1) {
327 6 : string slash;
328 3 : if (path[path.size() - 1] != '/')
329 3 : slash = "/";
330 3 : checked++;
331 6 : dirs.push(path + slash + temp->d_name);
332 3 : temp =
333 3 : reinterpret_cast<dirent_extended *>(reinterpret_cast<char *>(temp) + temp->d_reclen);
334 3 : continue;
335 : }
336 : /* Find filtering */
337 1 : auto timeOK = true;
338 1 : if (opt->timestamp_file) {
339 0 : if ((uint64_t)temp->ctime < runtime.ctime_min)
340 : timeOK = false;
341 : }
342 1 : if (timeOK and (temp->size == opt->size or opt->size == std::numeric_limits<uint64_t>::max()))
343 2 : if (!(opt->name_pattern &&
344 1 : regexec(&opt->name_regex, temp->d_name, 0, nullptr, 0)))
345 0 : found++;
346 1 : checked++;
347 1 : temp = reinterpret_cast<dirent_extended *>(reinterpret_cast<char *>(temp) + temp->d_reclen);
348 : }
349 : }
350 4 : }
351 :
352 1 : int process(pfind_options_t *opt) {
353 : // Print off a hello world message
354 1 : unsigned long long found,checked;
355 : // INIT PFIND
356 1 : found = 0;
357 1 : checked = 0;
358 1 : memset(&runtime, 0, sizeof(pfind_runtime_options_t));
359 :
360 : /* Get timestamp file */
361 1 : if (opt->timestamp_file) {
362 0 : if (pfind_rank == 0) {
363 0 : static struct stat timer_file{};
364 0 : if (lstat(opt->timestamp_file, &timer_file) != 0) {
365 0 : printf("Could not open: \"%s\", error: %s", opt->timestamp_file,
366 0 : strerror(errno));
367 0 : pfind_abort("\n");
368 : }
369 0 : runtime.ctime_min = timer_file.st_ctime;
370 : }
371 : }
372 :
373 1 : queue<string> dirs;
374 2 : string workdir = opt->workdir;
375 1 : workdir = workdir.substr(strlen(opt->mountdir), workdir.size());
376 1 : if (workdir.size() == 0)
377 0 : workdir = "/";
378 1 : dirs.push(workdir);
379 :
380 4 : do {
381 4 : string processpath = dirs.front();
382 4 : dirs.pop();
383 :
384 8 : dirProcess(processpath, checked, found, dirs, 0, 1, opt);
385 : // cout << "NO more paths " << dirs.size() << endl;
386 4 : } while (!dirs.empty());
387 :
388 1 : cout << "MATCHED " << found << "/" << checked << endl;
389 :
390 2 : return 0;
391 : }
392 :
393 1 : int main(int argc, char **argv) {
394 :
395 9 : for (auto i = 0; i < argc; i++) {
396 8 : if (strcmp(argv[i], "--help") == 0) {
397 0 : argv[i][0] = 0;
398 0 : pfind_rank = 0;
399 0 : pfind_parse_args(argc, argv, 1);
400 0 : exit(0);
401 : }
402 : }
403 :
404 1 : opt = pfind_parse_args(argc, argv, 0);
405 :
406 1 : process(opt);
407 :
408 1 : }
|