Line data Source code
1 : /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2 :
3 : /***
4 : This file is part of systemd.
5 :
6 : Copyright 2011 Lennart Poettering
7 :
8 : systemd is free software; you can redistribute it and/or modify it
9 : under the terms of the GNU Lesser General Public License as published by
10 : the Free Software Foundation; either version 2.1 of the License, or
11 : (at your option) any later version.
12 :
13 : systemd is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : Lesser General Public License for more details.
17 :
18 : You should have received a copy of the GNU Lesser General Public License
19 : along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 : ***/
21 :
22 : #include <sys/mman.h>
23 : #include <errno.h>
24 : #include <sys/uio.h>
25 : #include <unistd.h>
26 : #include <sys/statvfs.h>
27 : #include <fcntl.h>
28 : #include <stddef.h>
29 : #include <linux/fs.h>
30 :
31 : #include "btrfs-util.h"
32 : #include "journal-def.h"
33 : #include "journal-file.h"
34 : #include "journal-authenticate.h"
35 : #include "lookup3.h"
36 : #include "compress.h"
37 : #include "random-util.h"
38 :
39 : #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 : #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41 :
42 : #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 :
44 : /* This is the minimum journal file size */
45 : #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 :
47 : /* These are the lower and upper bounds if we deduce the max_use value
48 : * from the file system size */
49 : #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 : #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 :
52 : /* This is the upper bound if we deduce max_size from max_use */
53 : #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 :
55 : /* This is the upper bound if we deduce the keep_free value from the
56 : * file system size */
57 : #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 :
59 : /* This is the keep_free value when we can't determine the system
60 : * size */
61 : #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
62 :
63 : /* n_data was the first entry we added after the initial file format design */
64 : #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 :
66 : /* How many entries to keep in the entry array chain cache at max */
67 : #define CHAIN_CACHE_MAX 20
68 :
69 : /* How much to increase the journal file size at once each time we allocate something new. */
70 : #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
71 :
72 : /* Reread fstat() of the file for detecting deletions at least this often */
73 : #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74 :
75 : /* The mmap context to use for the header; we pick one above the last defined object type */
76 : #define CONTEXT_HEADER _OBJECT_TYPE_MAX
77 :
78 32666 : static int journal_file_set_online(JournalFile *f) {
79 32666 : assert(f);
80 :
81 32666 : if (!f->writable)
82 0 : return -EPERM;
83 :
84 32666 : if (!(f->fd >= 0 && f->header))
85 0 : return -EINVAL;
86 :
87 32666 : if (mmap_cache_got_sigbus(f->mmap, f->fd))
88 0 : return -EIO;
89 :
90 32666 : switch(f->header->state) {
91 : case STATE_ONLINE:
92 32647 : return 0;
93 :
94 : case STATE_OFFLINE:
95 19 : f->header->state = STATE_ONLINE;
96 19 : fsync(f->fd);
97 19 : return 0;
98 :
99 : default:
100 0 : return -EINVAL;
101 : }
102 : }
103 :
104 1365 : int journal_file_set_offline(JournalFile *f) {
105 1365 : assert(f);
106 :
107 1365 : if (!f->writable)
108 1346 : return -EPERM;
109 :
110 19 : if (!(f->fd >= 0 && f->header))
111 0 : return -EINVAL;
112 :
113 19 : if (f->header->state != STATE_ONLINE)
114 2 : return 0;
115 :
116 17 : fsync(f->fd);
117 :
118 17 : if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 0 : return -EIO;
120 :
121 17 : f->header->state = STATE_OFFLINE;
122 :
123 17 : if (mmap_cache_got_sigbus(f->mmap, f->fd))
124 0 : return -EIO;
125 :
126 17 : fsync(f->fd);
127 :
128 17 : return 0;
129 : }
130 :
131 1365 : void journal_file_close(JournalFile *f) {
132 1365 : assert(f);
133 :
134 : #ifdef HAVE_GCRYPT
135 : /* Write the final tag */
136 1365 : if (f->seal && f->writable)
137 0 : journal_file_append_tag(f);
138 : #endif
139 :
140 1365 : journal_file_set_offline(f);
141 :
142 1365 : if (f->mmap && f->fd >= 0)
143 1365 : mmap_cache_close_fd(f->mmap, f->fd);
144 :
145 1365 : if (f->fd >= 0 && f->defrag_on_close) {
146 :
147 : /* Be friendly to btrfs: turn COW back on again now,
148 : * and defragment the file. We won't write to the file
149 : * ever again, hence remove all fragmentation, and
150 : * reenable all the good bits COW usually provides
151 : * (such as data checksumming). */
152 :
153 2 : (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
154 2 : (void) btrfs_defrag_fd(f->fd);
155 : }
156 :
157 1365 : safe_close(f->fd);
158 1365 : free(f->path);
159 :
160 1365 : if (f->mmap)
161 1365 : mmap_cache_unref(f->mmap);
162 :
163 1365 : ordered_hashmap_free_free(f->chain_cache);
164 :
165 : #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166 1365 : free(f->compress_buffer);
167 : #endif
168 :
169 : #ifdef HAVE_GCRYPT
170 1365 : if (f->fss_file)
171 0 : munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
172 1365 : else if (f->fsprg_state)
173 0 : free(f->fsprg_state);
174 :
175 1365 : free(f->fsprg_seed);
176 :
177 1365 : if (f->hmac)
178 0 : gcry_md_close(f->hmac);
179 : #endif
180 :
181 1365 : free(f);
182 1365 : }
183 :
184 18 : static int journal_file_init_header(JournalFile *f, JournalFile *template) {
185 18 : Header h = {};
186 : ssize_t k;
187 : int r;
188 :
189 18 : assert(f);
190 :
191 18 : memcpy(h.signature, HEADER_SIGNATURE, 8);
192 18 : h.header_size = htole64(ALIGN64(sizeof(h)));
193 :
194 18 : h.incompatible_flags |= htole32(
195 36 : f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196 18 : f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
197 :
198 18 : h.compatible_flags = htole32(
199 18 : f->seal * HEADER_COMPATIBLE_SEALED);
200 :
201 18 : r = sd_id128_randomize(&h.file_id);
202 18 : if (r < 0)
203 0 : return r;
204 :
205 18 : if (template) {
206 3 : h.seqnum_id = template->header->seqnum_id;
207 3 : h.tail_entry_seqnum = template->header->tail_entry_seqnum;
208 : } else
209 15 : h.seqnum_id = h.file_id;
210 :
211 18 : k = pwrite(f->fd, &h, sizeof(h), 0);
212 18 : if (k < 0)
213 0 : return -errno;
214 :
215 18 : if (k != sizeof(h))
216 0 : return -EIO;
217 :
218 18 : return 0;
219 : }
220 :
221 19 : static int journal_file_refresh_header(JournalFile *f) {
222 : sd_id128_t boot_id;
223 : int r;
224 :
225 19 : assert(f);
226 :
227 19 : r = sd_id128_get_machine(&f->header->machine_id);
228 19 : if (r < 0)
229 0 : return r;
230 :
231 19 : r = sd_id128_get_boot(&boot_id);
232 19 : if (r < 0)
233 0 : return r;
234 :
235 19 : if (sd_id128_equal(boot_id, f->header->boot_id))
236 1 : f->tail_entry_monotonic_valid = true;
237 :
238 19 : f->header->boot_id = boot_id;
239 :
240 19 : r = journal_file_set_online(f);
241 :
242 : /* Sync the online state to disk */
243 19 : fsync(f->fd);
244 :
245 19 : return r;
246 : }
247 :
248 1347 : static int journal_file_verify_header(JournalFile *f) {
249 : uint32_t flags;
250 :
251 1347 : assert(f);
252 :
253 1347 : if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
254 0 : return -EBADMSG;
255 :
256 : /* In both read and write mode we refuse to open files with
257 : * incompatible flags we don't know */
258 1347 : flags = le32toh(f->header->incompatible_flags);
259 1347 : if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260 816 : if (flags & ~HEADER_INCOMPATIBLE_ANY)
261 0 : log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262 : f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263 816 : flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264 816 : if (flags)
265 816 : log_debug("Journal file %s uses incompatible flags %"PRIx32
266 : " disabled at compilation time.", f->path, flags);
267 816 : return -EPROTONOSUPPORT;
268 : }
269 :
270 : /* When open for writing we refuse to open files with
271 : * compatible flags, too */
272 531 : flags = le32toh(f->header->compatible_flags);
273 531 : if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274 0 : if (flags & ~HEADER_COMPATIBLE_ANY)
275 0 : log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276 : f->path, flags & ~HEADER_COMPATIBLE_ANY);
277 0 : flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278 0 : if (flags)
279 0 : log_debug("Journal file %s uses compatible flags %"PRIx32
280 : " disabled at compilation time.", f->path, flags);
281 0 : return -EPROTONOSUPPORT;
282 : }
283 :
284 531 : if (f->header->state >= _STATE_MAX)
285 0 : return -EBADMSG;
286 :
287 : /* The first addition was n_data, so check that we are at least this large */
288 531 : if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
289 0 : return -EBADMSG;
290 :
291 531 : if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
292 0 : return -EBADMSG;
293 :
294 531 : if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
295 0 : return -ENODATA;
296 :
297 531 : if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
298 0 : return -ENODATA;
299 :
300 1062 : if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301 1062 : !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302 1062 : !VALID64(le64toh(f->header->tail_object_offset)) ||
303 531 : !VALID64(le64toh(f->header->entry_array_offset)))
304 0 : return -ENODATA;
305 :
306 531 : if (f->writable) {
307 : uint8_t state;
308 : sd_id128_t machine_id;
309 : int r;
310 :
311 1 : r = sd_id128_get_machine(&machine_id);
312 1 : if (r < 0)
313 0 : return r;
314 :
315 1 : if (!sd_id128_equal(machine_id, f->header->machine_id))
316 0 : return -EHOSTDOWN;
317 :
318 1 : state = f->header->state;
319 :
320 1 : if (state == STATE_ONLINE) {
321 0 : log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322 0 : return -EBUSY;
323 1 : } else if (state == STATE_ARCHIVED)
324 0 : return -ESHUTDOWN;
325 1 : else if (state != STATE_OFFLINE) {
326 0 : log_debug("Journal file %s has unknown state %i.", f->path, state);
327 0 : return -EBUSY;
328 : }
329 : }
330 :
331 531 : f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332 531 : f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
333 :
334 531 : f->seal = JOURNAL_HEADER_SEALED(f->header);
335 :
336 531 : return 0;
337 : }
338 :
339 1402 : static int journal_file_fstat(JournalFile *f) {
340 1402 : assert(f);
341 1402 : assert(f->fd >= 0);
342 :
343 1402 : if (fstat(f->fd, &f->last_stat) < 0)
344 0 : return -errno;
345 :
346 1402 : f->last_stat_usec = now(CLOCK_MONOTONIC);
347 :
348 : /* Refuse appending to files that are already deleted */
349 1402 : if (f->last_stat.st_nlink <= 0)
350 0 : return -EIDRM;
351 :
352 1402 : return 0;
353 : }
354 :
355 32647 : static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
356 : uint64_t old_size, new_size;
357 : int r;
358 :
359 32647 : assert(f);
360 :
361 : /* We assume that this file is not sparse, and we know that
362 : * for sure, since we always call posix_fallocate()
363 : * ourselves */
364 :
365 32647 : if (mmap_cache_got_sigbus(f->mmap, f->fd))
366 0 : return -EIO;
367 :
368 32647 : old_size =
369 32647 : le64toh(f->header->header_size) +
370 32647 : le64toh(f->header->arena_size);
371 :
372 32647 : new_size = PAGE_ALIGN(offset + size);
373 32647 : if (new_size < le64toh(f->header->header_size))
374 0 : new_size = le64toh(f->header->header_size);
375 :
376 32647 : if (new_size <= old_size) {
377 :
378 : /* We already pre-allocated enough space, but before
379 : * we write to it, let's check with fstat() if the
380 : * file got deleted, in order make sure we don't throw
381 : * away the data immediately. Don't check fstat() for
382 : * all writes though, but only once ever 10s. */
383 :
384 32628 : if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
385 32628 : return 0;
386 :
387 0 : return journal_file_fstat(f);
388 : }
389 :
390 : /* Allocate more space. */
391 :
392 19 : if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
393 0 : return -E2BIG;
394 :
395 19 : if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
396 : struct statvfs svfs;
397 :
398 0 : if (fstatvfs(f->fd, &svfs) >= 0) {
399 : uint64_t available;
400 :
401 0 : available = svfs.f_bfree * svfs.f_bsize;
402 :
403 0 : if (available >= f->metrics.keep_free)
404 0 : available -= f->metrics.keep_free;
405 : else
406 0 : available = 0;
407 :
408 0 : if (new_size - old_size > available)
409 0 : return -E2BIG;
410 : }
411 : }
412 :
413 : /* Increase by larger blocks at once */
414 19 : new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415 19 : if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416 0 : new_size = f->metrics.max_size;
417 :
418 : /* Note that the glibc fallocate() fallback is very
419 : inefficient, hence we try to minimize the allocation area
420 : as we can. */
421 19 : r = posix_fallocate(f->fd, old_size, new_size - old_size);
422 19 : if (r != 0)
423 0 : return -r;
424 :
425 19 : f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
426 :
427 19 : return journal_file_fstat(f);
428 : }
429 :
430 3886752 : static unsigned type_to_context(ObjectType type) {
431 : /* One context for each type, plus one catch-all for the rest */
432 : assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
433 : assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
434 3886752 : return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
435 : }
436 :
437 3886752 : static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
438 : int r;
439 :
440 3886752 : assert(f);
441 3886752 : assert(ret);
442 :
443 3886752 : if (size <= 0)
444 0 : return -EINVAL;
445 :
446 : /* Avoid SIGBUS on invalid accesses */
447 3886752 : if (offset + size > (uint64_t) f->last_stat.st_size) {
448 : /* Hmm, out of range? Let's refresh the fstat() data
449 : * first, before we trust that check. */
450 :
451 0 : r = journal_file_fstat(f);
452 0 : if (r < 0)
453 0 : return r;
454 :
455 0 : if (offset + size > (uint64_t) f->last_stat.st_size)
456 0 : return -EADDRNOTAVAIL;
457 : }
458 :
459 3886752 : return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
460 : }
461 :
462 1927034 : static uint64_t minimum_header_size(Object *o) {
463 :
464 : static const uint64_t table[] = {
465 : [OBJECT_DATA] = sizeof(DataObject),
466 : [OBJECT_FIELD] = sizeof(FieldObject),
467 : [OBJECT_ENTRY] = sizeof(EntryObject),
468 : [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469 : [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470 : [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471 : [OBJECT_TAG] = sizeof(TagObject),
472 : };
473 :
474 1927034 : if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475 0 : return sizeof(ObjectHeader);
476 :
477 1927034 : return table[o->object.type];
478 : }
479 :
480 1927034 : int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
481 : int r;
482 : void *t;
483 : Object *o;
484 : uint64_t s;
485 :
486 1927034 : assert(f);
487 1927034 : assert(ret);
488 :
489 : /* Objects may only be located at multiple of 64 bit */
490 1927034 : if (!VALID64(offset))
491 0 : return -EFAULT;
492 :
493 1927034 : r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
494 1927034 : if (r < 0)
495 0 : return r;
496 :
497 1927034 : o = (Object*) t;
498 1927034 : s = le64toh(o->object.size);
499 :
500 1927034 : if (s < sizeof(ObjectHeader))
501 0 : return -EBADMSG;
502 :
503 1927034 : if (o->object.type <= OBJECT_UNUSED)
504 0 : return -EBADMSG;
505 :
506 1927034 : if (s < minimum_header_size(o))
507 0 : return -EBADMSG;
508 :
509 1927034 : if (type > OBJECT_UNUSED && o->object.type != type)
510 0 : return -EBADMSG;
511 :
512 1927034 : if (s > sizeof(ObjectHeader)) {
513 1927034 : r = journal_file_move_to(f, type, false, offset, s, &t);
514 1927034 : if (r < 0)
515 0 : return r;
516 :
517 1927034 : o = (Object*) t;
518 : }
519 :
520 1927034 : *ret = o;
521 1927034 : return 0;
522 : }
523 :
524 16279 : static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
525 : uint64_t r;
526 :
527 16279 : assert(f);
528 :
529 16279 : r = le64toh(f->header->tail_entry_seqnum) + 1;
530 :
531 16279 : if (seqnum) {
532 : /* If an external seqnum counter was passed, we update
533 : * both the local and the external one, and set it to
534 : * the maximum of both */
535 :
536 7 : if (*seqnum + 1 > r)
537 1 : r = *seqnum + 1;
538 :
539 7 : *seqnum = r;
540 : }
541 :
542 16279 : f->header->tail_entry_seqnum = htole64(r);
543 :
544 16279 : if (f->header->head_entry_seqnum == 0)
545 12 : f->header->head_entry_seqnum = htole64(r);
546 :
547 16279 : return r;
548 : }
549 :
550 32647 : int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
551 : int r;
552 : uint64_t p;
553 : Object *tail, *o;
554 : void *t;
555 :
556 32647 : assert(f);
557 32647 : assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
558 32647 : assert(size >= sizeof(ObjectHeader));
559 32647 : assert(offset);
560 32647 : assert(ret);
561 :
562 32647 : r = journal_file_set_online(f);
563 32647 : if (r < 0)
564 0 : return r;
565 :
566 32647 : p = le64toh(f->header->tail_object_offset);
567 32647 : if (p == 0)
568 18 : p = le64toh(f->header->header_size);
569 : else {
570 32629 : r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
571 32629 : if (r < 0)
572 0 : return r;
573 :
574 32629 : p += ALIGN64(le64toh(tail->object.size));
575 : }
576 :
577 32647 : r = journal_file_allocate(f, p, size);
578 32647 : if (r < 0)
579 0 : return r;
580 :
581 32647 : r = journal_file_move_to(f, type, false, p, size, &t);
582 32647 : if (r < 0)
583 0 : return r;
584 :
585 32647 : o = (Object*) t;
586 :
587 32647 : zero(o->object);
588 32647 : o->object.type = type;
589 32647 : o->object.size = htole64(size);
590 :
591 32647 : f->header->tail_object_offset = htole64(p);
592 32647 : f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
593 :
594 32647 : *ret = o;
595 32647 : *offset = p;
596 :
597 32647 : return 0;
598 : }
599 :
600 18 : static int journal_file_setup_data_hash_table(JournalFile *f) {
601 : uint64_t s, p;
602 : Object *o;
603 : int r;
604 :
605 18 : assert(f);
606 :
607 : /* We estimate that we need 1 hash table entry per 768 of
608 : journal file and we want to make sure we never get beyond
609 : 75% fill level. Calculate the hash table size for the
610 : maximum file size based on these metrics. */
611 :
612 18 : s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
613 18 : if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614 18 : s = DEFAULT_DATA_HASH_TABLE_SIZE;
615 :
616 18 : log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
617 :
618 18 : r = journal_file_append_object(f,
619 : OBJECT_DATA_HASH_TABLE,
620 : offsetof(Object, hash_table.items) + s,
621 : &o, &p);
622 18 : if (r < 0)
623 0 : return r;
624 :
625 18 : memzero(o->hash_table.items, s);
626 :
627 18 : f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628 18 : f->header->data_hash_table_size = htole64(s);
629 :
630 18 : return 0;
631 : }
632 :
633 18 : static int journal_file_setup_field_hash_table(JournalFile *f) {
634 : uint64_t s, p;
635 : Object *o;
636 : int r;
637 :
638 18 : assert(f);
639 :
640 : /* We use a fixed size hash table for the fields as this
641 : * number should grow very slowly only */
642 :
643 18 : s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644 18 : r = journal_file_append_object(f,
645 : OBJECT_FIELD_HASH_TABLE,
646 : offsetof(Object, hash_table.items) + s,
647 : &o, &p);
648 18 : if (r < 0)
649 0 : return r;
650 :
651 18 : memzero(o->hash_table.items, s);
652 :
653 18 : f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654 18 : f->header->field_hash_table_size = htole64(s);
655 :
656 18 : return 0;
657 : }
658 :
659 165946 : int journal_file_map_data_hash_table(JournalFile *f) {
660 : uint64_t s, p;
661 : void *t;
662 : int r;
663 :
664 165946 : assert(f);
665 :
666 165946 : if (f->data_hash_table)
667 165925 : return 0;
668 :
669 21 : p = le64toh(f->header->data_hash_table_offset);
670 21 : s = le64toh(f->header->data_hash_table_size);
671 :
672 21 : r = journal_file_move_to(f,
673 : OBJECT_DATA_HASH_TABLE,
674 : true,
675 : p, s,
676 : &t);
677 21 : if (r < 0)
678 0 : return r;
679 :
680 21 : f->data_hash_table = t;
681 21 : return 0;
682 : }
683 :
684 11306 : int journal_file_map_field_hash_table(JournalFile *f) {
685 : uint64_t s, p;
686 : void *t;
687 : int r;
688 :
689 11306 : assert(f);
690 :
691 11306 : if (f->field_hash_table)
692 11290 : return 0;
693 :
694 16 : p = le64toh(f->header->field_hash_table_offset);
695 16 : s = le64toh(f->header->field_hash_table_size);
696 :
697 16 : r = journal_file_move_to(f,
698 : OBJECT_FIELD_HASH_TABLE,
699 : true,
700 : p, s,
701 : &t);
702 16 : if (r < 0)
703 0 : return r;
704 :
705 16 : f->field_hash_table = t;
706 16 : return 0;
707 : }
708 :
709 71 : static int journal_file_link_field(
710 : JournalFile *f,
711 : Object *o,
712 : uint64_t offset,
713 : uint64_t hash) {
714 :
715 : uint64_t p, h, m;
716 : int r;
717 :
718 71 : assert(f);
719 71 : assert(o);
720 71 : assert(offset > 0);
721 :
722 71 : if (o->object.type != OBJECT_FIELD)
723 0 : return -EINVAL;
724 :
725 71 : m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
726 71 : if (m <= 0)
727 0 : return -EBADMSG;
728 :
729 : /* This might alter the window we are looking at */
730 71 : o->field.next_hash_offset = o->field.head_data_offset = 0;
731 :
732 71 : h = hash % m;
733 71 : p = le64toh(f->field_hash_table[h].tail_hash_offset);
734 71 : if (p == 0)
735 65 : f->field_hash_table[h].head_hash_offset = htole64(offset);
736 : else {
737 6 : r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
738 6 : if (r < 0)
739 0 : return r;
740 :
741 6 : o->field.next_hash_offset = htole64(offset);
742 : }
743 :
744 71 : f->field_hash_table[h].tail_hash_offset = htole64(offset);
745 :
746 71 : if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
747 71 : f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
748 :
749 71 : return 0;
750 : }
751 :
752 11303 : static int journal_file_link_data(
753 : JournalFile *f,
754 : Object *o,
755 : uint64_t offset,
756 : uint64_t hash) {
757 :
758 : uint64_t p, h, m;
759 : int r;
760 :
761 11303 : assert(f);
762 11303 : assert(o);
763 11303 : assert(offset > 0);
764 :
765 11303 : if (o->object.type != OBJECT_DATA)
766 0 : return -EINVAL;
767 :
768 11303 : m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
769 11303 : if (m <= 0)
770 0 : return -EBADMSG;
771 :
772 : /* This might alter the window we are looking at */
773 11303 : o->data.next_hash_offset = o->data.next_field_offset = 0;
774 11303 : o->data.entry_offset = o->data.entry_array_offset = 0;
775 11303 : o->data.n_entries = 0;
776 :
777 11303 : h = hash % m;
778 11303 : p = le64toh(f->data_hash_table[h].tail_hash_offset);
779 11303 : if (p == 0)
780 : /* Only entry in the hash table is easy */
781 2378 : f->data_hash_table[h].head_hash_offset = htole64(offset);
782 : else {
783 : /* Move back to the previous data object, to patch in
784 : * pointer */
785 :
786 8925 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
787 8925 : if (r < 0)
788 0 : return r;
789 :
790 8925 : o->data.next_hash_offset = htole64(offset);
791 : }
792 :
793 11303 : f->data_hash_table[h].tail_hash_offset = htole64(offset);
794 :
795 11303 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
796 11303 : f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
797 :
798 11303 : return 0;
799 : }
800 :
801 11306 : int journal_file_find_field_object_with_hash(
802 : JournalFile *f,
803 : const void *field, uint64_t size, uint64_t hash,
804 : Object **ret, uint64_t *offset) {
805 :
806 : uint64_t p, osize, h, m;
807 : int r;
808 :
809 11306 : assert(f);
810 11306 : assert(field && size > 0);
811 :
812 : /* If the field hash table is empty, we can't find anything */
813 11306 : if (le64toh(f->header->field_hash_table_size) <= 0)
814 0 : return 0;
815 :
816 : /* Map the field hash table, if it isn't mapped yet. */
817 11306 : r = journal_file_map_field_hash_table(f);
818 11306 : if (r < 0)
819 0 : return r;
820 :
821 11306 : osize = offsetof(Object, field.payload) + size;
822 :
823 11306 : m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
824 11306 : if (m <= 0)
825 0 : return -EBADMSG;
826 :
827 11306 : h = hash % m;
828 11306 : p = le64toh(f->field_hash_table[h].head_hash_offset);
829 :
830 11306 : while (p > 0) {
831 : Object *o;
832 :
833 11248 : r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
834 11248 : if (r < 0)
835 11235 : return r;
836 :
837 22483 : if (le64toh(o->field.hash) == hash &&
838 22470 : le64toh(o->object.size) == osize &&
839 11235 : memcmp(o->field.payload, field, size) == 0) {
840 :
841 11235 : if (ret)
842 11235 : *ret = o;
843 11235 : if (offset)
844 11232 : *offset = p;
845 :
846 11235 : return 1;
847 : }
848 :
849 13 : p = le64toh(o->field.next_hash_offset);
850 : }
851 :
852 71 : return 0;
853 : }
854 :
855 3 : int journal_file_find_field_object(
856 : JournalFile *f,
857 : const void *field, uint64_t size,
858 : Object **ret, uint64_t *offset) {
859 :
860 : uint64_t hash;
861 :
862 3 : assert(f);
863 3 : assert(field && size > 0);
864 :
865 3 : hash = hash64(field, size);
866 :
867 3 : return journal_file_find_field_object_with_hash(f,
868 : field, size, hash,
869 : ret, offset);
870 : }
871 :
872 159945 : int journal_file_find_data_object_with_hash(
873 : JournalFile *f,
874 : const void *data, uint64_t size, uint64_t hash,
875 : Object **ret, uint64_t *offset) {
876 :
877 : uint64_t p, osize, h, m;
878 : int r;
879 :
880 159945 : assert(f);
881 159945 : assert(data || size == 0);
882 :
883 : /* If there's no data hash table, then there's no entry. */
884 159945 : if (le64toh(f->header->data_hash_table_size) <= 0)
885 0 : return 0;
886 :
887 : /* Map the data hash table, if it isn't mapped yet. */
888 159945 : r = journal_file_map_data_hash_table(f);
889 159945 : if (r < 0)
890 0 : return r;
891 :
892 159945 : osize = offsetof(Object, data.payload) + size;
893 :
894 159945 : m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
895 159945 : if (m <= 0)
896 0 : return -EBADMSG;
897 :
898 159945 : h = hash % m;
899 159945 : p = le64toh(f->data_hash_table[h].head_hash_offset);
900 :
901 159945 : while (p > 0) {
902 : Object *o;
903 :
904 282621 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
905 282621 : if (r < 0)
906 148261 : return r;
907 :
908 282621 : if (le64toh(o->data.hash) != hash)
909 134360 : goto next;
910 :
911 148261 : if (o->object.flags & OBJECT_COMPRESSION_MASK) {
912 : #if defined(HAVE_XZ) || defined(HAVE_LZ4)
913 : uint64_t l;
914 0 : size_t rsize = 0;
915 :
916 0 : l = le64toh(o->object.size);
917 0 : if (l <= offsetof(Object, data.payload))
918 0 : return -EBADMSG;
919 :
920 0 : l -= offsetof(Object, data.payload);
921 :
922 0 : r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
923 0 : o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
924 0 : if (r < 0)
925 0 : return r;
926 :
927 0 : if (rsize == size &&
928 0 : memcmp(f->compress_buffer, data, size) == 0) {
929 :
930 0 : if (ret)
931 0 : *ret = o;
932 :
933 0 : if (offset)
934 0 : *offset = p;
935 :
936 0 : return 1;
937 : }
938 : #else
939 : return -EPROTONOSUPPORT;
940 : #endif
941 296522 : } else if (le64toh(o->object.size) == osize &&
942 148261 : memcmp(o->data.payload, data, size) == 0) {
943 :
944 148261 : if (ret)
945 148069 : *ret = o;
946 :
947 148261 : if (offset)
948 148261 : *offset = p;
949 :
950 148261 : return 1;
951 : }
952 :
953 : next:
954 134360 : p = le64toh(o->data.next_hash_offset);
955 : }
956 :
957 11684 : return 0;
958 : }
959 :
960 7 : int journal_file_find_data_object(
961 : JournalFile *f,
962 : const void *data, uint64_t size,
963 : Object **ret, uint64_t *offset) {
964 :
965 : uint64_t hash;
966 :
967 7 : assert(f);
968 7 : assert(data || size == 0);
969 :
970 7 : hash = hash64(data, size);
971 :
972 7 : return journal_file_find_data_object_with_hash(f,
973 : data, size, hash,
974 : ret, offset);
975 : }
976 :
977 11303 : static int journal_file_append_field(
978 : JournalFile *f,
979 : const void *field, uint64_t size,
980 : Object **ret, uint64_t *offset) {
981 :
982 : uint64_t hash, p;
983 : uint64_t osize;
984 : Object *o;
985 : int r;
986 :
987 11303 : assert(f);
988 11303 : assert(field && size > 0);
989 :
990 11303 : hash = hash64(field, size);
991 :
992 11303 : r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
993 11303 : if (r < 0)
994 0 : return r;
995 11303 : else if (r > 0) {
996 :
997 11232 : if (ret)
998 11232 : *ret = o;
999 :
1000 11232 : if (offset)
1001 11232 : *offset = p;
1002 :
1003 11232 : return 0;
1004 : }
1005 :
1006 71 : osize = offsetof(Object, field.payload) + size;
1007 71 : r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
1008 71 : if (r < 0)
1009 0 : return r;
1010 :
1011 71 : o->field.hash = htole64(hash);
1012 71 : memcpy(o->field.payload, field, size);
1013 :
1014 71 : r = journal_file_link_field(f, o, p, hash);
1015 71 : if (r < 0)
1016 0 : return r;
1017 :
1018 : /* The linking might have altered the window, so let's
1019 : * refresh our pointer */
1020 71 : r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1021 71 : if (r < 0)
1022 0 : return r;
1023 :
1024 : #ifdef HAVE_GCRYPT
1025 71 : r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1026 71 : if (r < 0)
1027 0 : return r;
1028 : #endif
1029 :
1030 71 : if (ret)
1031 71 : *ret = o;
1032 :
1033 71 : if (offset)
1034 71 : *offset = p;
1035 :
1036 71 : return 0;
1037 : }
1038 :
1039 159312 : static int journal_file_append_data(
1040 : JournalFile *f,
1041 : const void *data, uint64_t size,
1042 : Object **ret, uint64_t *offset) {
1043 :
1044 : uint64_t hash, p;
1045 : uint64_t osize;
1046 : Object *o;
1047 159312 : int r, compression = 0;
1048 : const void *eq;
1049 :
1050 159312 : assert(f);
1051 159312 : assert(data || size == 0);
1052 :
1053 159312 : hash = hash64(data, size);
1054 :
1055 159312 : r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1056 159312 : if (r < 0)
1057 0 : return r;
1058 159312 : else if (r > 0) {
1059 :
1060 148009 : if (ret)
1061 148009 : *ret = o;
1062 :
1063 148009 : if (offset)
1064 148009 : *offset = p;
1065 :
1066 148009 : return 0;
1067 : }
1068 :
1069 11303 : osize = offsetof(Object, data.payload) + size;
1070 11303 : r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1071 11303 : if (r < 0)
1072 0 : return r;
1073 :
1074 11303 : o->data.hash = htole64(hash);
1075 :
1076 : #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1077 11303 : if (f->compress_xz &&
1078 : size >= COMPRESSION_SIZE_THRESHOLD) {
1079 0 : size_t rsize = 0;
1080 :
1081 0 : compression = compress_blob(data, size, o->data.payload, &rsize);
1082 :
1083 0 : if (compression) {
1084 0 : o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1085 0 : o->object.flags |= compression;
1086 :
1087 0 : log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1088 : size, rsize, object_compressed_to_string(compression));
1089 : }
1090 : }
1091 : #endif
1092 :
1093 11303 : if (!compression && size > 0)
1094 11303 : memcpy(o->data.payload, data, size);
1095 :
1096 11303 : r = journal_file_link_data(f, o, p, hash);
1097 11303 : if (r < 0)
1098 0 : return r;
1099 :
1100 : /* The linking might have altered the window, so let's
1101 : * refresh our pointer */
1102 11303 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1103 11303 : if (r < 0)
1104 0 : return r;
1105 :
1106 11303 : if (!data)
1107 0 : eq = NULL;
1108 : else
1109 11303 : eq = memchr(data, '=', size);
1110 11303 : if (eq && eq > data) {
1111 11303 : Object *fo = NULL;
1112 : uint64_t fp;
1113 :
1114 : /* Create field object ... */
1115 11303 : r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1116 11303 : if (r < 0)
1117 0 : return r;
1118 :
1119 : /* ... and link it in. */
1120 11303 : o->data.next_field_offset = fo->field.head_data_offset;
1121 11303 : fo->field.head_data_offset = le64toh(p);
1122 : }
1123 :
1124 : #ifdef HAVE_GCRYPT
1125 11303 : r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1126 11303 : if (r < 0)
1127 0 : return r;
1128 : #endif
1129 :
1130 11303 : if (ret)
1131 11303 : *ret = o;
1132 :
1133 11303 : if (offset)
1134 11303 : *offset = p;
1135 :
1136 11303 : return 0;
1137 : }
1138 :
1139 50876 : uint64_t journal_file_entry_n_items(Object *o) {
1140 50876 : assert(o);
1141 :
1142 50876 : if (o->object.type != OBJECT_ENTRY)
1143 0 : return 0;
1144 :
1145 50876 : return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1146 : }
1147 :
1148 1043534 : uint64_t journal_file_entry_array_n_items(Object *o) {
1149 1043534 : assert(o);
1150 :
1151 1043534 : if (o->object.type != OBJECT_ENTRY_ARRAY)
1152 0 : return 0;
1153 :
1154 1043534 : return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1155 : }
1156 :
1157 2382 : uint64_t journal_file_hash_table_n_items(Object *o) {
1158 2382 : assert(o);
1159 :
1160 2716 : if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1161 334 : o->object.type != OBJECT_FIELD_HASH_TABLE)
1162 0 : return 0;
1163 :
1164 2382 : return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1165 : }
1166 :
1167 164288 : static int link_entry_into_array(JournalFile *f,
1168 : le64_t *first,
1169 : le64_t *idx,
1170 : uint64_t p) {
1171 : int r;
1172 164288 : uint64_t n = 0, ap = 0, q, i, a, hidx;
1173 : Object *o;
1174 :
1175 164288 : assert(f);
1176 164288 : assert(first);
1177 164288 : assert(idx);
1178 164288 : assert(p > 0);
1179 :
1180 164288 : a = le64toh(*first);
1181 164288 : i = hidx = le64toh(*idx);
1182 1129581 : while (a > 0) {
1183 :
1184 960335 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1185 960335 : if (r < 0)
1186 0 : return r;
1187 :
1188 960335 : n = journal_file_entry_array_n_items(o);
1189 960335 : if (i < n) {
1190 159330 : o->entry_array.items[i] = htole64(p);
1191 159330 : *idx = htole64(hidx + 1);
1192 159330 : return 0;
1193 : }
1194 :
1195 801005 : i -= n;
1196 801005 : ap = a;
1197 801005 : a = le64toh(o->entry_array.next_entry_array_offset);
1198 : }
1199 :
1200 4958 : if (hidx > n)
1201 907 : n = (hidx+1) * 2;
1202 : else
1203 4051 : n = n * 2;
1204 :
1205 4958 : if (n < 4)
1206 2619 : n = 4;
1207 :
1208 4958 : r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1209 : offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1210 : &o, &q);
1211 4958 : if (r < 0)
1212 0 : return r;
1213 :
1214 : #ifdef HAVE_GCRYPT
1215 4958 : r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1216 4958 : if (r < 0)
1217 0 : return r;
1218 : #endif
1219 :
1220 4958 : o->entry_array.items[i] = htole64(p);
1221 :
1222 4958 : if (ap == 0)
1223 2619 : *first = htole64(q);
1224 : else {
1225 2339 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1226 2339 : if (r < 0)
1227 0 : return r;
1228 :
1229 2339 : o->entry_array.next_entry_array_offset = htole64(q);
1230 : }
1231 :
1232 4958 : if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1233 4958 : f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1234 :
1235 4958 : *idx = htole64(hidx + 1);
1236 :
1237 4958 : return 0;
1238 : }
1239 :
1240 159312 : static int link_entry_into_array_plus_one(JournalFile *f,
1241 : le64_t *extra,
1242 : le64_t *first,
1243 : le64_t *idx,
1244 : uint64_t p) {
1245 :
1246 : int r;
1247 :
1248 159312 : assert(f);
1249 159312 : assert(extra);
1250 159312 : assert(first);
1251 159312 : assert(idx);
1252 159312 : assert(p > 0);
1253 :
1254 159312 : if (*idx == 0)
1255 11303 : *extra = htole64(p);
1256 : else {
1257 : le64_t i;
1258 :
1259 148009 : i = htole64(le64toh(*idx) - 1);
1260 148009 : r = link_entry_into_array(f, first, &i, p);
1261 148009 : if (r < 0)
1262 0 : return r;
1263 : }
1264 :
1265 159312 : *idx = htole64(le64toh(*idx) + 1);
1266 159312 : return 0;
1267 : }
1268 :
1269 159312 : static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1270 : uint64_t p;
1271 : int r;
1272 159312 : assert(f);
1273 159312 : assert(o);
1274 159312 : assert(offset > 0);
1275 :
1276 159312 : p = le64toh(o->entry.items[i].object_offset);
1277 159312 : if (p == 0)
1278 0 : return -EINVAL;
1279 :
1280 159312 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1281 159312 : if (r < 0)
1282 0 : return r;
1283 :
1284 477936 : return link_entry_into_array_plus_one(f,
1285 159312 : &o->data.entry_offset,
1286 159312 : &o->data.entry_array_offset,
1287 159312 : &o->data.n_entries,
1288 : offset);
1289 : }
1290 :
1291 16279 : static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1292 : uint64_t n, i;
1293 : int r;
1294 :
1295 16279 : assert(f);
1296 16279 : assert(o);
1297 16279 : assert(offset > 0);
1298 :
1299 16279 : if (o->object.type != OBJECT_ENTRY)
1300 0 : return -EINVAL;
1301 :
1302 16279 : __sync_synchronize();
1303 :
1304 : /* Link up the entry itself */
1305 32558 : r = link_entry_into_array(f,
1306 16279 : &f->header->entry_array_offset,
1307 16279 : &f->header->n_entries,
1308 : offset);
1309 16279 : if (r < 0)
1310 0 : return r;
1311 :
1312 : /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1313 :
1314 16279 : if (f->header->head_entry_realtime == 0)
1315 12 : f->header->head_entry_realtime = o->entry.realtime;
1316 :
1317 16279 : f->header->tail_entry_realtime = o->entry.realtime;
1318 16279 : f->header->tail_entry_monotonic = o->entry.monotonic;
1319 :
1320 16279 : f->tail_entry_monotonic_valid = true;
1321 :
1322 : /* Link up the items */
1323 16279 : n = journal_file_entry_n_items(o);
1324 175591 : for (i = 0; i < n; i++) {
1325 159312 : r = journal_file_link_entry_item(f, o, offset, i);
1326 159312 : if (r < 0)
1327 0 : return r;
1328 : }
1329 :
1330 16279 : return 0;
1331 : }
1332 :
1333 16279 : static int journal_file_append_entry_internal(
1334 : JournalFile *f,
1335 : const dual_timestamp *ts,
1336 : uint64_t xor_hash,
1337 : const EntryItem items[], unsigned n_items,
1338 : uint64_t *seqnum,
1339 : Object **ret, uint64_t *offset) {
1340 : uint64_t np;
1341 : uint64_t osize;
1342 : Object *o;
1343 : int r;
1344 :
1345 16279 : assert(f);
1346 16279 : assert(items || n_items == 0);
1347 16279 : assert(ts);
1348 :
1349 16279 : osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1350 :
1351 16279 : r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1352 16279 : if (r < 0)
1353 0 : return r;
1354 :
1355 16279 : o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1356 16279 : memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1357 16279 : o->entry.realtime = htole64(ts->realtime);
1358 16279 : o->entry.monotonic = htole64(ts->monotonic);
1359 16279 : o->entry.xor_hash = htole64(xor_hash);
1360 16279 : o->entry.boot_id = f->header->boot_id;
1361 :
1362 : #ifdef HAVE_GCRYPT
1363 16279 : r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1364 16279 : if (r < 0)
1365 0 : return r;
1366 : #endif
1367 :
1368 16279 : r = journal_file_link_entry(f, o, np);
1369 16279 : if (r < 0)
1370 0 : return r;
1371 :
1372 16279 : if (ret)
1373 0 : *ret = o;
1374 :
1375 16279 : if (offset)
1376 0 : *offset = np;
1377 :
1378 16279 : return 0;
1379 : }
1380 :
1381 6278 : void journal_file_post_change(JournalFile *f) {
1382 6278 : assert(f);
1383 :
1384 : /* inotify() does not receive IN_MODIFY events from file
1385 : * accesses done via mmap(). After each access we hence
1386 : * trigger IN_MODIFY by truncating the journal file to its
1387 : * current size which triggers IN_MODIFY. */
1388 :
1389 6278 : __sync_synchronize();
1390 :
1391 6278 : if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1392 0 : log_error_errno(errno, "Failed to truncate file to its own size: %m");
1393 6278 : }
1394 :
1395 260 : static int entry_item_cmp(const void *_a, const void *_b) {
1396 260 : const EntryItem *a = _a, *b = _b;
1397 :
1398 260 : if (le64toh(a->object_offset) < le64toh(b->object_offset))
1399 5 : return -1;
1400 255 : if (le64toh(a->object_offset) > le64toh(b->object_offset))
1401 255 : return 1;
1402 0 : return 0;
1403 : }
1404 :
1405 6278 : int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1406 : unsigned i;
1407 : EntryItem *items;
1408 : int r;
1409 6278 : uint64_t xor_hash = 0;
1410 : struct dual_timestamp _ts;
1411 :
1412 6278 : assert(f);
1413 6278 : assert(iovec || n_iovec == 0);
1414 :
1415 6278 : if (!ts) {
1416 0 : dual_timestamp_get(&_ts);
1417 0 : ts = &_ts;
1418 : }
1419 :
1420 12545 : if (f->tail_entry_monotonic_valid &&
1421 6267 : ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1422 0 : return -EINVAL;
1423 :
1424 : #ifdef HAVE_GCRYPT
1425 6278 : r = journal_file_maybe_append_tag(f, ts->realtime);
1426 6278 : if (r < 0)
1427 0 : return r;
1428 : #endif
1429 :
1430 : /* alloca() can't take 0, hence let's allocate at least one */
1431 6278 : items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1432 :
1433 12816 : for (i = 0; i < n_iovec; i++) {
1434 : uint64_t p;
1435 : Object *o;
1436 :
1437 6538 : r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1438 6538 : if (r < 0)
1439 0 : return r;
1440 :
1441 6538 : xor_hash ^= le64toh(o->data.hash);
1442 6538 : items[i].object_offset = htole64(p);
1443 6538 : items[i].hash = o->data.hash;
1444 : }
1445 :
1446 : /* Order by the position on disk, in order to improve seek
1447 : * times for rotating media. */
1448 6278 : qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1449 :
1450 6278 : r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1451 :
1452 : /* If the memory mapping triggered a SIGBUS then we return an
1453 : * IO error and ignore the error code passed down to us, since
1454 : * it is very likely just an effect of a nullified replacement
1455 : * mapping page */
1456 :
1457 6278 : if (mmap_cache_got_sigbus(f->mmap, f->fd))
1458 0 : r = -EIO;
1459 :
1460 6278 : journal_file_post_change(f);
1461 :
1462 6278 : return r;
1463 : }
1464 :
/* One cached position within a chain of entry array objects. Items
 * are stored in an ordered hashmap, with the 'first' member serving
 * as key (see chain_cache_put()), so that a later lookup in the same
 * chain can skip ahead to the array visited last. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1472 :
1473 20771 : static void chain_cache_put(
1474 : OrderedHashmap *h,
1475 : ChainCacheItem *ci,
1476 : uint64_t first,
1477 : uint64_t array,
1478 : uint64_t begin,
1479 : uint64_t total,
1480 : uint64_t last_index) {
1481 :
1482 20771 : if (!ci) {
1483 : /* If the chain item to cache for this chain is the
1484 : * first one it's not worth caching anything */
1485 157 : if (array == first)
1486 148 : return;
1487 :
1488 9 : if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1489 0 : ci = ordered_hashmap_steal_first(h);
1490 0 : assert(ci);
1491 : } else {
1492 9 : ci = new(ChainCacheItem, 1);
1493 9 : if (!ci)
1494 0 : return;
1495 : }
1496 :
1497 9 : ci->first = first;
1498 :
1499 9 : if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1500 0 : free(ci);
1501 0 : return;
1502 : }
1503 : } else
1504 20614 : assert(ci->first == first);
1505 :
1506 20623 : ci->array = array;
1507 20623 : ci->begin = begin;
1508 20623 : ci->total = total;
1509 20623 : ci->last_index = last_index;
1510 : }
1511 :
1512 10313 : static int generic_array_get(
1513 : JournalFile *f,
1514 : uint64_t first,
1515 : uint64_t i,
1516 : Object **ret, uint64_t *offset) {
1517 :
1518 : Object *o;
1519 10313 : uint64_t p = 0, a, t = 0;
1520 : int r;
1521 : ChainCacheItem *ci;
1522 :
1523 10313 : assert(f);
1524 :
1525 10313 : a = first;
1526 :
1527 : /* Try the chain cache first */
1528 10313 : ci = ordered_hashmap_get(f->chain_cache, &first);
1529 10313 : if (ci && i > ci->total) {
1530 10237 : a = ci->array;
1531 10237 : i -= ci->total;
1532 10237 : t = ci->total;
1533 : }
1534 :
1535 20652 : while (a > 0) {
1536 : uint64_t k;
1537 :
1538 10339 : r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1539 10339 : if (r < 0)
1540 0 : return r;
1541 :
1542 10339 : k = journal_file_entry_array_n_items(o);
1543 10339 : if (i < k) {
1544 10313 : p = le64toh(o->entry_array.items[i]);
1545 10313 : goto found;
1546 : }
1547 :
1548 26 : i -= k;
1549 26 : t += k;
1550 26 : a = le64toh(o->entry_array.next_entry_array_offset);
1551 : }
1552 :
1553 0 : return 0;
1554 :
1555 : found:
1556 : /* Let's cache this item for the next invocation */
1557 10313 : chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1558 :
1559 10313 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1560 10313 : if (r < 0)
1561 0 : return r;
1562 :
1563 10313 : if (ret)
1564 10310 : *ret = o;
1565 :
1566 10313 : if (offset)
1567 10312 : *offset = p;
1568 :
1569 10313 : return 1;
1570 : }
1571 :
1572 19 : static int generic_array_get_plus_one(
1573 : JournalFile *f,
1574 : uint64_t extra,
1575 : uint64_t first,
1576 : uint64_t i,
1577 : Object **ret, uint64_t *offset) {
1578 :
1579 : Object *o;
1580 :
1581 19 : assert(f);
1582 :
1583 19 : if (i == 0) {
1584 : int r;
1585 :
1586 15 : r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1587 15 : if (r < 0)
1588 0 : return r;
1589 :
1590 15 : if (ret)
1591 3 : *ret = o;
1592 :
1593 15 : if (offset)
1594 12 : *offset = extra;
1595 :
1596 15 : return 1;
1597 : }
1598 :
1599 4 : return generic_array_get(f, first, i-1, ret, offset);
1600 : }
1601 :
/* Results of the test_object callbacks used for bisection. The
 * callbacks compare a property of the entry at a given offset with a
 * needle value. */
enum {
        TEST_FOUND, /* the tested entry matches the needle */
        TEST_LEFT,  /* the tested entry is smaller than the needle */
        TEST_RIGHT  /* the tested entry is larger than the needle */
};
1607 :
/* Bisects the first 'n' items of the chain of entry arrays starting
 * at 'first' for 'needle'.
 *
 * 'test_object' is invoked with the offset of a candidate entry and
 * the needle, and returns TEST_FOUND, TEST_LEFT, TEST_RIGHT or a
 * negative error code. An exact match is treated as TEST_RIGHT when
 * searching with DIRECTION_DOWN and as TEST_LEFT otherwise, so that
 * the search converges on a boundary rather than stopping at an
 * arbitrary match. For DIRECTION_UP the item in front of that
 * boundary is the result ('subtract_one').
 *
 * The chain cache is used to skip arrays in front of the one visited
 * last time, and to probe the direct neighbours of the index looked
 * at last time before falling back to plain bisection.
 *
 * Returns 1 if an entry was found, in which case the entry object is
 * stored in *ret, its offset in *offset and its index within the
 * whole chain in *idx (all optional). Returns 0 if no entry
 * qualifies, -EBADMSG if an array slot that is looked at contains a
 * zero offset, or the negative error code of a failing helper or
 * callback. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently looked at
         * t: number of items in all arrays in front of 'a'
         * i: index within the current array
         * last_p: offset stored in the last considered slot of the previous array
         * last_index: cached index hint, (uint64_t) -1 if there is none */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only look at as many items of this array as are
                 * still left of the 'n' items to consider */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last item of this array first, to find out
                 * whether the boundary lies within this array or
                 * further down the chain */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The boundary is within this array: bisect
                         * the range [left, right] */
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        for (;;) {
                                if (left == right) {
                                        /* Range collapsed, 'left' is
                                         * the boundary. When looking
                                         * upwards the result is the
                                         * item in front of it. */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* Everything considered in this array is left of the
                 * needle. If this array covers all remaining items
                 * there's nothing further down the chain to look at. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                /* The last considered item is the result */
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last offset of this array, it is the
                 * result if the boundary turns out to be the first
                 * item of the next array while looking upwards */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Looking upwards, and the boundary is the very first item of
         * the chain: there is nothing in front of it */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        if (subtract_one && i == 0)
                /* The result is the last item of the previous array */
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
1814 :
1815 175 : static int generic_array_bisect_plus_one(
1816 : JournalFile *f,
1817 : uint64_t extra,
1818 : uint64_t first,
1819 : uint64_t n,
1820 : uint64_t needle,
1821 : int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1822 : direction_t direction,
1823 : Object **ret,
1824 : uint64_t *offset,
1825 : uint64_t *idx) {
1826 :
1827 : int r;
1828 175 : bool step_back = false;
1829 : Object *o;
1830 :
1831 175 : assert(f);
1832 175 : assert(test_object);
1833 :
1834 175 : if (n <= 0)
1835 0 : return 0;
1836 :
1837 : /* This bisects the array in object 'first', but first checks
1838 : * an extra */
1839 175 : r = test_object(f, extra, needle);
1840 175 : if (r < 0)
1841 0 : return r;
1842 :
1843 175 : if (r == TEST_FOUND)
1844 16 : r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1845 :
1846 : /* if we are looking with DIRECTION_UP then we need to first
1847 : see if in the actual array there is a matching entry, and
1848 : return the last one of that. But if there isn't any we need
1849 : to return this one. Hence remember this, and return it
1850 : below. */
1851 175 : if (r == TEST_LEFT)
1852 153 : step_back = direction == DIRECTION_UP;
1853 :
1854 175 : if (r == TEST_RIGHT) {
1855 22 : if (direction == DIRECTION_DOWN)
1856 19 : goto found;
1857 : else
1858 3 : return 0;
1859 : }
1860 :
1861 153 : r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1862 :
1863 153 : if (r == 0 && step_back)
1864 3 : goto found;
1865 :
1866 150 : if (r > 0 && idx)
1867 0 : (*idx) ++;
1868 :
1869 150 : return r;
1870 :
1871 : found:
1872 22 : r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1873 22 : if (r < 0)
1874 0 : return r;
1875 :
1876 22 : if (ret)
1877 0 : *ret = o;
1878 :
1879 22 : if (offset)
1880 22 : *offset = extra;
1881 :
1882 22 : if (idx)
1883 0 : *idx = 0;
1884 :
1885 22 : return 1;
1886 : }
1887 :
1888 52307 : _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1889 52307 : assert(f);
1890 52307 : assert(p > 0);
1891 :
1892 52307 : if (p == needle)
1893 10355 : return TEST_FOUND;
1894 41952 : else if (p < needle)
1895 21028 : return TEST_LEFT;
1896 : else
1897 20924 : return TEST_RIGHT;
1898 : }
1899 :
1900 15 : static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1901 : Object *o;
1902 : int r;
1903 :
1904 15 : assert(f);
1905 15 : assert(p > 0);
1906 :
1907 15 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1908 15 : if (r < 0)
1909 0 : return r;
1910 :
1911 15 : if (le64toh(o->entry.seqnum) == needle)
1912 7 : return TEST_FOUND;
1913 8 : else if (le64toh(o->entry.seqnum) < needle)
1914 3 : return TEST_LEFT;
1915 : else
1916 5 : return TEST_RIGHT;
1917 : }
1918 :
1919 8 : int journal_file_move_to_entry_by_seqnum(
1920 : JournalFile *f,
1921 : uint64_t seqnum,
1922 : direction_t direction,
1923 : Object **ret,
1924 : uint64_t *offset) {
1925 :
1926 16 : return generic_array_bisect(f,
1927 8 : le64toh(f->header->entry_array_offset),
1928 8 : le64toh(f->header->n_entries),
1929 : seqnum,
1930 : test_object_seqnum,
1931 : direction,
1932 : ret, offset, NULL);
1933 : }
1934 :
1935 6 : static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1936 : Object *o;
1937 : int r;
1938 :
1939 6 : assert(f);
1940 6 : assert(p > 0);
1941 :
1942 6 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1943 6 : if (r < 0)
1944 0 : return r;
1945 :
1946 6 : if (le64toh(o->entry.realtime) == needle)
1947 0 : return TEST_FOUND;
1948 6 : else if (le64toh(o->entry.realtime) < needle)
1949 2 : return TEST_LEFT;
1950 : else
1951 4 : return TEST_RIGHT;
1952 : }
1953 :
1954 4 : int journal_file_move_to_entry_by_realtime(
1955 : JournalFile *f,
1956 : uint64_t realtime,
1957 : direction_t direction,
1958 : Object **ret,
1959 : uint64_t *offset) {
1960 :
1961 8 : return generic_array_bisect(f,
1962 4 : le64toh(f->header->entry_array_offset),
1963 4 : le64toh(f->header->n_entries),
1964 : realtime,
1965 : test_object_realtime,
1966 : direction,
1967 : ret, offset, NULL);
1968 : }
1969 :
1970 0 : static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1971 : Object *o;
1972 : int r;
1973 :
1974 0 : assert(f);
1975 0 : assert(p > 0);
1976 :
1977 0 : r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1978 0 : if (r < 0)
1979 0 : return r;
1980 :
1981 0 : if (le64toh(o->entry.monotonic) == needle)
1982 0 : return TEST_FOUND;
1983 0 : else if (le64toh(o->entry.monotonic) < needle)
1984 0 : return TEST_LEFT;
1985 : else
1986 0 : return TEST_RIGHT;
1987 : }
1988 :
1989 4 : static int find_data_object_by_boot_id(
1990 : JournalFile *f,
1991 : sd_id128_t boot_id,
1992 : Object **o,
1993 : uint64_t *b) {
1994 :
1995 4 : char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1996 :
1997 4 : sd_id128_to_string(boot_id, t + 9);
1998 4 : return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1999 : }
2000 :
2001 4 : int journal_file_move_to_entry_by_monotonic(
2002 : JournalFile *f,
2003 : sd_id128_t boot_id,
2004 : uint64_t monotonic,
2005 : direction_t direction,
2006 : Object **ret,
2007 : uint64_t *offset) {
2008 :
2009 : Object *o;
2010 : int r;
2011 :
2012 4 : assert(f);
2013 :
2014 4 : r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
2015 4 : if (r < 0)
2016 0 : return r;
2017 4 : if (r == 0)
2018 4 : return -ENOENT;
2019 :
2020 0 : return generic_array_bisect_plus_one(f,
2021 0 : le64toh(o->data.entry_offset),
2022 0 : le64toh(o->data.entry_array_offset),
2023 0 : le64toh(o->data.n_entries),
2024 : monotonic,
2025 : test_object_monotonic,
2026 : direction,
2027 : ret, offset, NULL);
2028 : }
2029 :
2030 664 : void journal_file_reset_location(JournalFile *f) {
2031 664 : f->location_type = LOCATION_HEAD;
2032 664 : f->current_offset = 0;
2033 664 : f->current_seqnum = 0;
2034 664 : f->current_realtime = 0;
2035 664 : f->current_monotonic = 0;
2036 664 : zero(f->current_boot_id);
2037 664 : f->current_xor_hash = 0;
2038 664 : }
2039 :
2040 10457 : void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2041 10457 : f->location_type = LOCATION_SEEK;
2042 10457 : f->current_offset = offset;
2043 10457 : f->current_seqnum = le64toh(o->entry.seqnum);
2044 10457 : f->current_realtime = le64toh(o->entry.realtime);
2045 10457 : f->current_monotonic = le64toh(o->entry.monotonic);
2046 10457 : f->current_boot_id = o->entry.boot_id;
2047 10457 : f->current_xor_hash = le64toh(o->entry.xor_hash);
2048 10457 : }
2049 :
2050 40656 : int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2051 40656 : assert(af);
2052 40656 : assert(bf);
2053 40656 : assert(af->location_type == LOCATION_SEEK);
2054 40656 : assert(bf->location_type == LOCATION_SEEK);
2055 :
2056 : /* If contents and timestamps match, these entries are
2057 : * identical, even if the seqnum does not match */
2058 50635 : if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2059 10061 : af->current_monotonic == bf->current_monotonic &&
2060 164 : af->current_realtime == bf->current_realtime &&
2061 82 : af->current_xor_hash == bf->current_xor_hash)
2062 82 : return 0;
2063 :
2064 40574 : if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2065 :
2066 : /* If this is from the same seqnum source, compare
2067 : * seqnums */
2068 0 : if (af->current_seqnum < bf->current_seqnum)
2069 0 : return -1;
2070 0 : if (af->current_seqnum > bf->current_seqnum)
2071 0 : return 1;
2072 :
2073 : /* Wow! This is weird, different data but the same
2074 : * seqnums? Something is borked, but let's make the
2075 : * best of it and compare by time. */
2076 : }
2077 :
2078 40574 : if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2079 :
2080 : /* If the boot id matches, compare monotonic time */
2081 9897 : if (af->current_monotonic < bf->current_monotonic)
2082 1336 : return -1;
2083 8561 : if (af->current_monotonic > bf->current_monotonic)
2084 8561 : return 1;
2085 : }
2086 :
2087 : /* Otherwise, compare UTC time */
2088 30677 : if (af->current_realtime < bf->current_realtime)
2089 0 : return -1;
2090 30677 : if (af->current_realtime > bf->current_realtime)
2091 30677 : return 1;
2092 :
2093 : /* Finally, compare by contents */
2094 0 : if (af->current_xor_hash < bf->current_xor_hash)
2095 0 : return -1;
2096 0 : if (af->current_xor_hash > bf->current_xor_hash)
2097 0 : return 1;
2098 :
2099 0 : return 0;
2100 : }
2101 :
2102 10333 : int journal_file_next_entry(
2103 : JournalFile *f,
2104 : uint64_t p,
2105 : direction_t direction,
2106 : Object **ret, uint64_t *offset) {
2107 :
2108 : uint64_t i, n, ofs;
2109 : int r;
2110 :
2111 10333 : assert(f);
2112 :
2113 10333 : n = le64toh(f->header->n_entries);
2114 10333 : if (n <= 0)
2115 0 : return 0;
2116 :
2117 10333 : if (p == 0)
2118 26 : i = direction == DIRECTION_DOWN ? 0 : n - 1;
2119 : else {
2120 20614 : r = generic_array_bisect(f,
2121 10307 : le64toh(f->header->entry_array_offset),
2122 10307 : le64toh(f->header->n_entries),
2123 : p,
2124 : test_object_offset,
2125 : DIRECTION_DOWN,
2126 : NULL, NULL,
2127 : &i);
2128 10307 : if (r <= 0)
2129 0 : return r;
2130 :
2131 10307 : if (direction == DIRECTION_DOWN) {
2132 10285 : if (i >= n - 1)
2133 14 : return 0;
2134 :
2135 10271 : i++;
2136 : } else {
2137 22 : if (i <= 0)
2138 10 : return 0;
2139 :
2140 12 : i--;
2141 : }
2142 : }
2143 :
2144 : /* And jump to it */
2145 20618 : r = generic_array_get(f,
2146 10309 : le64toh(f->header->entry_array_offset),
2147 : i,
2148 : ret, &ofs);
2149 10309 : if (r <= 0)
2150 0 : return r;
2151 :
2152 10309 : if (p > 0 &&
2153 : (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2154 0 : log_debug("%s: entry array corrupted at entry %"PRIu64,
2155 : f->path, i);
2156 0 : return -EBADMSG;
2157 : }
2158 :
2159 10309 : if (offset)
2160 10309 : *offset = ofs;
2161 :
2162 10309 : return 1;
2163 : }
2164 :
2165 19 : int journal_file_next_entry_for_data(
2166 : JournalFile *f,
2167 : Object *o, uint64_t p,
2168 : uint64_t data_offset,
2169 : direction_t direction,
2170 : Object **ret, uint64_t *offset) {
2171 :
2172 : uint64_t n, i;
2173 : int r;
2174 : Object *d;
2175 :
2176 19 : assert(f);
2177 19 : assert(p > 0 || !o);
2178 :
2179 19 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2180 19 : if (r < 0)
2181 0 : return r;
2182 :
2183 19 : n = le64toh(d->data.n_entries);
2184 19 : if (n <= 0)
2185 0 : return n;
2186 :
2187 19 : if (!o)
2188 19 : i = direction == DIRECTION_DOWN ? 0 : n - 1;
2189 : else {
2190 0 : if (o->object.type != OBJECT_ENTRY)
2191 0 : return -EINVAL;
2192 :
2193 0 : r = generic_array_bisect_plus_one(f,
2194 0 : le64toh(d->data.entry_offset),
2195 0 : le64toh(d->data.entry_array_offset),
2196 0 : le64toh(d->data.n_entries),
2197 : p,
2198 : test_object_offset,
2199 : DIRECTION_DOWN,
2200 : NULL, NULL,
2201 : &i);
2202 :
2203 0 : if (r <= 0)
2204 0 : return r;
2205 :
2206 0 : if (direction == DIRECTION_DOWN) {
2207 0 : if (i >= n - 1)
2208 0 : return 0;
2209 :
2210 0 : i++;
2211 : } else {
2212 0 : if (i <= 0)
2213 0 : return 0;
2214 :
2215 0 : i--;
2216 : }
2217 :
2218 : }
2219 :
2220 57 : return generic_array_get_plus_one(f,
2221 19 : le64toh(d->data.entry_offset),
2222 19 : le64toh(d->data.entry_array_offset),
2223 : i,
2224 : ret, offset);
2225 : }
2226 :
2227 175 : int journal_file_move_to_entry_by_offset_for_data(
2228 : JournalFile *f,
2229 : uint64_t data_offset,
2230 : uint64_t p,
2231 : direction_t direction,
2232 : Object **ret, uint64_t *offset) {
2233 :
2234 : int r;
2235 : Object *d;
2236 :
2237 175 : assert(f);
2238 :
2239 175 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2240 175 : if (r < 0)
2241 0 : return r;
2242 :
2243 525 : return generic_array_bisect_plus_one(f,
2244 175 : le64toh(d->data.entry_offset),
2245 175 : le64toh(d->data.entry_array_offset),
2246 175 : le64toh(d->data.n_entries),
2247 : p,
2248 : test_object_offset,
2249 : direction,
2250 : ret, offset, NULL);
2251 : }
2252 :
2253 0 : int journal_file_move_to_entry_by_monotonic_for_data(
2254 : JournalFile *f,
2255 : uint64_t data_offset,
2256 : sd_id128_t boot_id,
2257 : uint64_t monotonic,
2258 : direction_t direction,
2259 : Object **ret, uint64_t *offset) {
2260 :
2261 : Object *o, *d;
2262 : int r;
2263 : uint64_t b, z;
2264 :
2265 0 : assert(f);
2266 :
2267 : /* First, seek by time */
2268 0 : r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2269 0 : if (r < 0)
2270 0 : return r;
2271 0 : if (r == 0)
2272 0 : return -ENOENT;
2273 :
2274 0 : r = generic_array_bisect_plus_one(f,
2275 0 : le64toh(o->data.entry_offset),
2276 0 : le64toh(o->data.entry_array_offset),
2277 0 : le64toh(o->data.n_entries),
2278 : monotonic,
2279 : test_object_monotonic,
2280 : direction,
2281 : NULL, &z, NULL);
2282 0 : if (r <= 0)
2283 0 : return r;
2284 :
2285 : /* And now, continue seeking until we find an entry that
2286 : * exists in both bisection arrays */
2287 :
2288 : for (;;) {
2289 : Object *qo;
2290 : uint64_t p, q;
2291 :
2292 0 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2293 0 : if (r < 0)
2294 0 : return r;
2295 :
2296 0 : r = generic_array_bisect_plus_one(f,
2297 0 : le64toh(d->data.entry_offset),
2298 0 : le64toh(d->data.entry_array_offset),
2299 0 : le64toh(d->data.n_entries),
2300 : z,
2301 : test_object_offset,
2302 : direction,
2303 : NULL, &p, NULL);
2304 0 : if (r <= 0)
2305 0 : return r;
2306 :
2307 0 : r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2308 0 : if (r < 0)
2309 0 : return r;
2310 :
2311 0 : r = generic_array_bisect_plus_one(f,
2312 0 : le64toh(o->data.entry_offset),
2313 0 : le64toh(o->data.entry_array_offset),
2314 0 : le64toh(o->data.n_entries),
2315 : p,
2316 : test_object_offset,
2317 : direction,
2318 : &qo, &q, NULL);
2319 :
2320 0 : if (r <= 0)
2321 0 : return r;
2322 :
2323 0 : if (p == q) {
2324 0 : if (ret)
2325 0 : *ret = qo;
2326 0 : if (offset)
2327 0 : *offset = q;
2328 :
2329 0 : return 1;
2330 : }
2331 :
2332 0 : z = q;
2333 0 : }
2334 : }
2335 :
2336 0 : int journal_file_move_to_entry_by_seqnum_for_data(
2337 : JournalFile *f,
2338 : uint64_t data_offset,
2339 : uint64_t seqnum,
2340 : direction_t direction,
2341 : Object **ret, uint64_t *offset) {
2342 :
2343 : Object *d;
2344 : int r;
2345 :
2346 0 : assert(f);
2347 :
2348 0 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2349 0 : if (r < 0)
2350 0 : return r;
2351 :
2352 0 : return generic_array_bisect_plus_one(f,
2353 0 : le64toh(d->data.entry_offset),
2354 0 : le64toh(d->data.entry_array_offset),
2355 0 : le64toh(d->data.n_entries),
2356 : seqnum,
2357 : test_object_seqnum,
2358 : direction,
2359 : ret, offset, NULL);
2360 : }
2361 :
2362 0 : int journal_file_move_to_entry_by_realtime_for_data(
2363 : JournalFile *f,
2364 : uint64_t data_offset,
2365 : uint64_t realtime,
2366 : direction_t direction,
2367 : Object **ret, uint64_t *offset) {
2368 :
2369 : Object *d;
2370 : int r;
2371 :
2372 0 : assert(f);
2373 :
2374 0 : r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2375 0 : if (r < 0)
2376 0 : return r;
2377 :
2378 0 : return generic_array_bisect_plus_one(f,
2379 0 : le64toh(d->data.entry_offset),
2380 0 : le64toh(d->data.entry_array_offset),
2381 0 : le64toh(d->data.n_entries),
2382 : realtime,
2383 : test_object_realtime,
2384 : direction,
2385 : ret, offset, NULL);
2386 : }
2387 :
2388 2 : void journal_file_dump(JournalFile *f) {
2389 : Object *o;
2390 : int r;
2391 : uint64_t p;
2392 :
2393 2 : assert(f);
2394 :
2395 2 : journal_file_print_header(f);
2396 :
2397 2 : p = le64toh(f->header->header_size);
2398 6411 : while (p != 0) {
2399 6407 : r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2400 6407 : if (r < 0)
2401 0 : goto fail;
2402 :
2403 6407 : switch (o->object.type) {
2404 :
2405 : case OBJECT_UNUSED:
2406 0 : printf("Type: OBJECT_UNUSED\n");
2407 0 : break;
2408 :
2409 : case OBJECT_DATA:
2410 79 : printf("Type: OBJECT_DATA\n");
2411 79 : break;
2412 :
2413 : case OBJECT_FIELD:
2414 3 : printf("Type: OBJECT_FIELD\n");
2415 3 : break;
2416 :
2417 : case OBJECT_ENTRY:
2418 18009 : printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2419 6003 : le64toh(o->entry.seqnum),
2420 6003 : le64toh(o->entry.monotonic),
2421 6003 : le64toh(o->entry.realtime));
2422 6003 : break;
2423 :
2424 : case OBJECT_FIELD_HASH_TABLE:
2425 2 : printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2426 2 : break;
2427 :
2428 : case OBJECT_DATA_HASH_TABLE:
2429 2 : printf("Type: OBJECT_DATA_HASH_TABLE\n");
2430 2 : break;
2431 :
2432 : case OBJECT_ENTRY_ARRAY:
2433 318 : printf("Type: OBJECT_ENTRY_ARRAY\n");
2434 318 : break;
2435 :
2436 : case OBJECT_TAG:
2437 0 : printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2438 0 : le64toh(o->tag.seqnum),
2439 0 : le64toh(o->tag.epoch));
2440 0 : break;
2441 :
2442 : default:
2443 0 : printf("Type: unknown (%i)\n", o->object.type);
2444 0 : break;
2445 : }
2446 :
2447 6407 : if (o->object.flags & OBJECT_COMPRESSION_MASK)
2448 0 : printf("Flags: %s\n",
2449 0 : object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2450 :
2451 6407 : if (p == le64toh(f->header->tail_object_offset))
2452 2 : p = 0;
2453 : else
2454 6405 : p = p + ALIGN64(le64toh(o->object.size));
2455 : }
2456 :
2457 2 : return;
2458 : fail:
2459 0 : log_error("File corrupt");
2460 : }
2461 :
2462 12 : static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2463 : const char *x;
2464 :
2465 12 : x = format_timestamp(buf, l, t);
2466 12 : if (x)
2467 4 : return x;
2468 8 : return " --- ";
2469 : }
2470 :
2471 6 : void journal_file_print_header(JournalFile *f) {
2472 : char a[33], b[33], c[33], d[33];
2473 : char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2474 : struct stat st;
2475 : char bytes[FORMAT_BYTES_MAX];
2476 :
2477 6 : assert(f);
2478 :
2479 137 : printf("File Path: %s\n"
2480 : "File ID: %s\n"
2481 : "Machine ID: %s\n"
2482 : "Boot ID: %s\n"
2483 : "Sequential Number ID: %s\n"
2484 : "State: %s\n"
2485 : "Compatible Flags:%s%s\n"
2486 : "Incompatible Flags:%s%s%s\n"
2487 : "Header size: %"PRIu64"\n"
2488 : "Arena size: %"PRIu64"\n"
2489 : "Data Hash Table Size: %"PRIu64"\n"
2490 : "Field Hash Table Size: %"PRIu64"\n"
2491 : "Rotate Suggested: %s\n"
2492 : "Head Sequential Number: %"PRIu64"\n"
2493 : "Tail Sequential Number: %"PRIu64"\n"
2494 : "Head Realtime Timestamp: %s\n"
2495 : "Tail Realtime Timestamp: %s\n"
2496 : "Tail Monotonic Timestamp: %s\n"
2497 : "Objects: %"PRIu64"\n"
2498 : "Entry Objects: %"PRIu64"\n",
2499 : f->path,
2500 6 : sd_id128_to_string(f->header->file_id, a),
2501 6 : sd_id128_to_string(f->header->machine_id, b),
2502 6 : sd_id128_to_string(f->header->boot_id, c),
2503 6 : sd_id128_to_string(f->header->seqnum_id, d),
2504 6 : f->header->state == STATE_OFFLINE ? "OFFLINE" :
2505 5 : f->header->state == STATE_ONLINE ? "ONLINE" :
2506 0 : f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2507 6 : JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2508 6 : (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2509 6 : JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2510 6 : JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2511 6 : (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2512 6 : le64toh(f->header->header_size),
2513 6 : le64toh(f->header->arena_size),
2514 6 : le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2515 6 : le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2516 6 : yes_no(journal_file_rotate_suggested(f, 0)),
2517 6 : le64toh(f->header->head_entry_seqnum),
2518 6 : le64toh(f->header->tail_entry_seqnum),
2519 6 : format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2520 6 : format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2521 6 : format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2522 6 : le64toh(f->header->n_objects),
2523 6 : le64toh(f->header->n_entries));
2524 :
2525 6 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2526 12 : printf("Data Objects: %"PRIu64"\n"
2527 : "Data Hash Table Fill: %.1f%%\n",
2528 6 : le64toh(f->header->n_data),
2529 6 : 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2530 :
2531 6 : if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2532 12 : printf("Field Objects: %"PRIu64"\n"
2533 : "Field Hash Table Fill: %.1f%%\n",
2534 6 : le64toh(f->header->n_fields),
2535 6 : 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2536 :
2537 6 : if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2538 6 : printf("Tag Objects: %"PRIu64"\n",
2539 6 : le64toh(f->header->n_tags));
2540 6 : if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2541 6 : printf("Entry Array Objects: %"PRIu64"\n",
2542 6 : le64toh(f->header->n_entry_arrays));
2543 :
2544 6 : if (fstat(f->fd, &st) >= 0)
2545 6 : printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2546 6 : }
2547 :
2548 18 : static int journal_file_warn_btrfs(JournalFile *f) {
2549 : unsigned attrs;
2550 : int r;
2551 :
2552 18 : assert(f);
2553 :
2554 : /* Before we write anything, check if the COW logic is turned
2555 : * off on btrfs. Given our write pattern that is quite
2556 : * unfriendly to COW file systems this should greatly improve
2557 : * performance on COW file systems, such as btrfs, at the
2558 : * expense of data integrity features (which shouldn't be too
2559 : * bad, given that we do our own checksumming). */
2560 :
2561 18 : r = btrfs_is_filesystem(f->fd);
2562 18 : if (r < 0)
2563 0 : return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2564 18 : if (!r)
2565 17 : return 0;
2566 :
2567 1 : r = read_attr_fd(f->fd, &attrs);
2568 1 : if (r < 0)
2569 0 : return log_warning_errno(r, "Failed to read file attributes: %m");
2570 :
2571 1 : if (attrs & FS_NOCOW_FL) {
2572 0 : log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2573 0 : return 0;
2574 : }
2575 :
2576 1 : log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2577 : "This is likely to slow down journal access substantially, please consider turning "
2578 : "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2579 :
2580 1 : return 1;
2581 : }
2582 :
/* Opens the journal file fname and returns a new JournalFile object in *ret.
 *
 * fname       must end in ".journal" or ".journal~", otherwise -EINVAL
 * flags       open(2) flags; the access mode must be O_RDONLY or O_RDWR,
 *             otherwise -EINVAL
 * mode        passed to open(2)
 * compress    stored in the LZ4 or XZ compression setting of the file,
 *             depending on which of HAVE_LZ4/HAVE_XZ is defined
 * seal        stored in the sealing setting of the file (HAVE_GCRYPT only)
 * metrics     if non-NULL and the file is writable: completed via
 *             journal_default_metrics() and copied into the file object
 * mmap_cache  if non-NULL a reference to it is taken, otherwise a new cache
 *             is allocated
 * template    handed to journal_file_init_header() for new files; also the
 *             source of the metrics if metrics is NULL
 *
 * A writable file of size zero is treated as newly created: its header is
 * initialized and the hash tables are set up. Any other file has its header
 * verified.
 *
 * Returns 0 on success. On failure the partially set up object is released
 * with journal_file_close() and a negative errno-style error is returned
 * (-EIO if the mmap cache recorded a SIGBUS for the file). */
int journal_file_open(
                const char *fname,
                int flags,
                mode_t mode,
                bool compress,
                bool seal,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        bool newly_created = false;
        JournalFile *f;
        void *h;
        int r;

        assert(fname);
        assert(ret);

        /* Only plain read-only and read-write access are supported */
        if ((flags & O_ACCMODE) != O_RDONLY &&
            (flags & O_ACCMODE) != O_RDWR)
                return -EINVAL;

        if (!endswith(fname, ".journal") &&
            !endswith(fname, ".journal~"))
                return -EINVAL;

        f = new0(JournalFile, 1);
        if (!f)
                return -ENOMEM;

        /* Mark the fd as unset right away, the failure path checks it */
        f->fd = -1;
        f->mode = mode;

        f->flags = flags;
        f->prot = prot_from_flags(flags);
        f->writable = (flags & O_ACCMODE) != O_RDONLY;
#if defined(HAVE_LZ4)
        f->compress_lz4 = compress;
#elif defined(HAVE_XZ)
        f->compress_xz = compress;
#endif
#ifdef HAVE_GCRYPT
        f->seal = seal;
#endif

        /* Share the caller's mmap cache if there is one, otherwise use a
         * private one */
        if (mmap_cache)
                f->mmap = mmap_cache_ref(mmap_cache);
        else {
                f->mmap = mmap_cache_new();
                if (!f->mmap) {
                        r = -ENOMEM;
                        goto fail;
                }
        }

        f->path = strdup(fname);
        if (!f->path) {
                r = -ENOMEM;
                goto fail;
        }

        f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
        if (!f->chain_cache) {
                r = -ENOMEM;
                goto fail;
        }

        f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
        if (f->fd < 0) {
                r = -errno;
                goto fail;
        }

        r = journal_file_fstat(f);
        if (r < 0)
                goto fail;

        /* An empty, writable file is a new journal file we have to set up */
        if (f->last_stat.st_size == 0 && f->writable) {

                /* Purely informational, the result is ignored on purpose */
                (void) journal_file_warn_btrfs(f);

                /* Let's attach the creation time to the journal file,
                 * so that the vacuuming code knows the age of this
                 * file even if the file might end up corrupted one
                 * day... Ideally we'd just use the creation time many
                 * file systems maintain for each file, but there is
                 * currently no usable API to query this, hence let's
                 * emulate this via extended attributes. If extended
                 * attributes are not supported we'll just skip this,
                 * and rely solely on mtime/atime/ctime of the file. */

                fd_setcrtime(f->fd, 0);

#ifdef HAVE_GCRYPT
                /* Try to load the FSPRG state, and if we can't, then
                 * just don't do sealing */
                if (f->seal) {
                        r = journal_file_fss_load(f);
                        if (r < 0)
                                f->seal = false;
                }
#endif

                r = journal_file_init_header(f, template);
                if (r < 0)
                        goto fail;

                /* Refresh the cached stat data after writing the header */
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;

                newly_created = true;
        }

        /* Files smaller than the minimal header size are refused */
        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
                r = -EIO;
                goto fail;
        }

        /* Map the header through the mmap cache */
        r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
        if (r < 0)
                goto fail;

        f->header = h;

        /* Headers we did not write ourselves just now need to be checked */
        if (!newly_created) {
                r = journal_file_verify_header(f);
                if (r < 0)
                        goto fail;
        }

#ifdef HAVE_GCRYPT
        /* For pre-existing writable files a failure to load the FSS state is
         * fatal, unlike for new files above */
        if (!newly_created && f->writable) {
                r = journal_file_fss_load(f);
                if (r < 0)
                        goto fail;
        }
#endif

        if (f->writable) {
                /* Explicit metrics take precedence over the template's */
                if (metrics) {
                        journal_default_metrics(metrics, f->fd);
                        f->metrics = *metrics;
                } else if (template)
                        f->metrics = template->metrics;

                r = journal_file_refresh_header(f);
                if (r < 0)
                        goto fail;
        }

#ifdef HAVE_GCRYPT
        r = journal_file_hmac_setup(f);
        if (r < 0)
                goto fail;
#endif

        if (newly_created) {
                r = journal_file_setup_field_hash_table(f);
                if (r < 0)
                        goto fail;

                r = journal_file_setup_data_hash_table(f);
                if (r < 0)
                        goto fail;

#ifdef HAVE_GCRYPT
                r = journal_file_append_first_tag(f);
                if (r < 0)
                        goto fail;
#endif
        }

        /* If a SIGBUS was recorded for this file while we set things up,
         * report that as an I/O error */
        if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
                r = -EIO;
                goto fail;
        }

        *ret = f;
        return 0;

fail:
        /* A recorded SIGBUS overrides whatever error brought us here. The fd
         * check also covers the early failures where the file is not open
         * yet. */
        if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
                r = -EIO;

        journal_file_close(f);

        return r;
}
2773 :
2774 2 : int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2775 4 : _cleanup_free_ char *p = NULL;
2776 : size_t l;
2777 2 : JournalFile *old_file, *new_file = NULL;
2778 : int r;
2779 :
2780 2 : assert(f);
2781 2 : assert(*f);
2782 :
2783 2 : old_file = *f;
2784 :
2785 2 : if (!old_file->writable)
2786 0 : return -EINVAL;
2787 :
2788 2 : if (!endswith(old_file->path, ".journal"))
2789 0 : return -EINVAL;
2790 :
2791 2 : l = strlen(old_file->path);
2792 38 : r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2793 2 : (int) l - 8, old_file->path,
2794 32 : SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2795 2 : le64toh((*f)->header->head_entry_seqnum),
2796 2 : le64toh((*f)->header->head_entry_realtime));
2797 2 : if (r < 0)
2798 0 : return -ENOMEM;
2799 :
2800 : /* Try to rename the file to the archived version. If the file
2801 : * already was deleted, we'll get ENOENT, let's ignore that
2802 : * case. */
2803 2 : r = rename(old_file->path, p);
2804 2 : if (r < 0 && errno != ENOENT)
2805 0 : return -errno;
2806 :
2807 2 : old_file->header->state = STATE_ARCHIVED;
2808 :
2809 : /* Currently, btrfs is not very good with out write patterns
2810 : * and fragments heavily. Let's defrag our journal files when
2811 : * we archive them */
2812 2 : old_file->defrag_on_close = true;
2813 :
2814 2 : r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2815 2 : journal_file_close(old_file);
2816 :
2817 2 : *f = new_file;
2818 2 : return r;
2819 : }
2820 :
2821 0 : int journal_file_open_reliably(
2822 : const char *fname,
2823 : int flags,
2824 : mode_t mode,
2825 : bool compress,
2826 : bool seal,
2827 : JournalMetrics *metrics,
2828 : MMapCache *mmap_cache,
2829 : JournalFile *template,
2830 : JournalFile **ret) {
2831 :
2832 : int r;
2833 : size_t l;
2834 0 : _cleanup_free_ char *p = NULL;
2835 :
2836 0 : r = journal_file_open(fname, flags, mode, compress, seal,
2837 : metrics, mmap_cache, template, ret);
2838 0 : if (!IN_SET(r,
2839 : -EBADMSG, /* corrupted */
2840 : -ENODATA, /* truncated */
2841 : -EHOSTDOWN, /* other machine */
2842 : -EPROTONOSUPPORT, /* incompatible feature */
2843 : -EBUSY, /* unclean shutdown */
2844 : -ESHUTDOWN, /* already archived */
2845 : -EIO, /* IO error, including SIGBUS on mmap */
2846 : -EIDRM /* File has been deleted */))
2847 0 : return r;
2848 :
2849 0 : if ((flags & O_ACCMODE) == O_RDONLY)
2850 0 : return r;
2851 :
2852 0 : if (!(flags & O_CREAT))
2853 0 : return r;
2854 :
2855 0 : if (!endswith(fname, ".journal"))
2856 0 : return r;
2857 :
2858 : /* The file is corrupted. Rotate it away and try it again (but only once) */
2859 :
2860 0 : l = strlen(fname);
2861 0 : if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2862 0 : (int) l - 8, fname,
2863 : now(CLOCK_REALTIME),
2864 : random_u64()) < 0)
2865 0 : return -ENOMEM;
2866 :
2867 0 : r = rename(fname, p);
2868 0 : if (r < 0)
2869 0 : return -errno;
2870 :
2871 : /* btrfs doesn't cope well with our write pattern and
2872 : * fragments heavily. Let's defrag all files we rotate */
2873 :
2874 0 : (void) chattr_path(p, false, FS_NOCOW_FL);
2875 0 : (void) btrfs_defrag(p);
2876 :
2877 0 : log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2878 :
2879 0 : return journal_file_open(fname, flags, mode, compress, seal,
2880 : metrics, mmap_cache, template, ret);
2881 : }
2882 :
/* Copies the entry object o, located at offset p in the file "from", into the
 * file "to".
 *
 * Every data object referenced by the entry is read from "from" (and
 * decompressed if it carries a compression flag), appended to "to", and the
 * entry is then appended to "to" with the original timestamps and the new
 * item offsets. seqnum, ret and offset are passed through to
 * journal_file_append_entry_internal().
 *
 * Returns -EPERM if "to" is not writable, -EBADMSG if an item's hash does not
 * match the hash of the data object it refers to, -E2BIG if a payload size
 * does not fit into size_t, -EPROTONOSUPPORT for compressed data when built
 * without XZ/LZ4 support, -EIO if the mmap cache recorded a SIGBUS for "to",
 * any negative error of the helpers, and otherwise the result of
 * journal_file_append_entry_internal(). */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
        uint64_t i, n;
        uint64_t q, xor_hash = 0;
        int r;
        EntryItem *items;
        dual_timestamp ts;

        assert(from);
        assert(to);
        assert(o);
        assert(p);

        if (!to->writable)
                return -EPERM;

        /* The copy keeps the timestamps of the original entry */
        ts.monotonic = le64toh(o->entry.monotonic);
        ts.realtime = le64toh(o->entry.realtime);

        n = journal_file_entry_n_items(o);
        /* alloca() can't take 0, hence let's allocate at least one */
        items = alloca(sizeof(EntryItem) * MAX(1u, n));

        for (i = 0; i < n; i++) {
                uint64_t l, h;
                le64_t le_hash;
                size_t t;
                void *data;
                Object *u;

                /* Remember offset and hash of the item before o is repointed
                 * at the data object below */
                q = le64toh(o->entry.items[i].object_offset);
                le_hash = o->entry.items[i].hash;

                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
                if (r < 0)
                        return r;

                if (le_hash != o->data.hash)
                        return -EBADMSG;

                /* Payload size is the object size minus the data object header */
                l = le64toh(o->object.size) - offsetof(Object, data.payload);
                t = (size_t) l;

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)
                        return -E2BIG;

                if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
                        size_t rsize = 0;

                        /* Decompress into the source file's scratch buffer
                         * and copy the uncompressed payload */
                        r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
                                            o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
                        if (r < 0)
                                return r;

                        data = from->compress_buffer;
                        l = rsize;
#else
                        return -EPROTONOSUPPORT;
#endif
                } else
                        data = o->data.payload;

                r = journal_file_append_data(to, data, l, &u, &h);
                if (r < 0)
                        return r;

                /* Record the new data object for the entry we are building;
                 * xor_hash accumulates the XOR of all data hashes */
                xor_hash ^= le64toh(u->data.hash);
                items[i].object_offset = htole64(h);
                items[i].hash = u->data.hash;

                /* o was repointed at the data object above, hence get the
                 * entry object back before the next iteration reads its
                 * items */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
                if (r < 0)
                        return r;
        }

        r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);

        /* A SIGBUS recorded for the destination file takes precedence over
         * the result of the append */
        if (mmap_cache_got_sigbus(to->mmap, to->fd))
                return -EIO;

        return r;
}
2966 :
2967 0 : void journal_default_metrics(JournalMetrics *m, int fd) {
2968 0 : uint64_t fs_size = 0;
2969 : struct statvfs ss;
2970 : char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2971 :
2972 0 : assert(m);
2973 0 : assert(fd >= 0);
2974 :
2975 0 : if (fstatvfs(fd, &ss) >= 0)
2976 0 : fs_size = ss.f_frsize * ss.f_blocks;
2977 :
2978 0 : if (m->max_use == (uint64_t) -1) {
2979 :
2980 0 : if (fs_size > 0) {
2981 0 : m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2982 :
2983 0 : if (m->max_use > DEFAULT_MAX_USE_UPPER)
2984 0 : m->max_use = DEFAULT_MAX_USE_UPPER;
2985 :
2986 0 : if (m->max_use < DEFAULT_MAX_USE_LOWER)
2987 0 : m->max_use = DEFAULT_MAX_USE_LOWER;
2988 : } else
2989 0 : m->max_use = DEFAULT_MAX_USE_LOWER;
2990 : } else {
2991 0 : m->max_use = PAGE_ALIGN(m->max_use);
2992 :
2993 0 : if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2994 0 : m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2995 : }
2996 :
2997 0 : if (m->max_size == (uint64_t) -1) {
2998 0 : m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2999 :
3000 0 : if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3001 0 : m->max_size = DEFAULT_MAX_SIZE_UPPER;
3002 : } else
3003 0 : m->max_size = PAGE_ALIGN(m->max_size);
3004 :
3005 0 : if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3006 0 : m->max_size = JOURNAL_FILE_SIZE_MIN;
3007 :
3008 0 : if (m->max_size*2 > m->max_use)
3009 0 : m->max_use = m->max_size*2;
3010 :
3011 0 : if (m->min_size == (uint64_t) -1)
3012 0 : m->min_size = JOURNAL_FILE_SIZE_MIN;
3013 : else {
3014 0 : m->min_size = PAGE_ALIGN(m->min_size);
3015 :
3016 0 : if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3017 0 : m->min_size = JOURNAL_FILE_SIZE_MIN;
3018 :
3019 0 : if (m->min_size > m->max_size)
3020 0 : m->max_size = m->min_size;
3021 : }
3022 :
3023 0 : if (m->keep_free == (uint64_t) -1) {
3024 :
3025 0 : if (fs_size > 0) {
3026 0 : m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
3027 :
3028 0 : if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3029 0 : m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3030 :
3031 : } else
3032 0 : m->keep_free = DEFAULT_KEEP_FREE;
3033 : }
3034 :
3035 0 : log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3036 : format_bytes(a, sizeof(a), m->max_use),
3037 : format_bytes(b, sizeof(b), m->max_size),
3038 : format_bytes(c, sizeof(c), m->min_size),
3039 : format_bytes(d, sizeof(d), m->keep_free));
3040 0 : }
3041 :
3042 0 : int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3043 0 : assert(f);
3044 0 : assert(from || to);
3045 :
3046 0 : if (from) {
3047 0 : if (f->header->head_entry_realtime == 0)
3048 0 : return -ENOENT;
3049 :
3050 0 : *from = le64toh(f->header->head_entry_realtime);
3051 : }
3052 :
3053 0 : if (to) {
3054 0 : if (f->header->tail_entry_realtime == 0)
3055 0 : return -ENOENT;
3056 :
3057 0 : *to = le64toh(f->header->tail_entry_realtime);
3058 : }
3059 :
3060 0 : return 1;
3061 : }
3062 :
3063 0 : int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3064 : Object *o;
3065 : uint64_t p;
3066 : int r;
3067 :
3068 0 : assert(f);
3069 0 : assert(from || to);
3070 :
3071 0 : r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3072 0 : if (r <= 0)
3073 0 : return r;
3074 :
3075 0 : if (le64toh(o->data.n_entries) <= 0)
3076 0 : return 0;
3077 :
3078 0 : if (from) {
3079 0 : r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3080 0 : if (r < 0)
3081 0 : return r;
3082 :
3083 0 : *from = le64toh(o->entry.monotonic);
3084 : }
3085 :
3086 0 : if (to) {
3087 0 : r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3088 0 : if (r < 0)
3089 0 : return r;
3090 :
3091 0 : r = generic_array_get_plus_one(f,
3092 0 : le64toh(o->data.entry_offset),
3093 0 : le64toh(o->data.entry_array_offset),
3094 0 : le64toh(o->data.n_entries)-1,
3095 : &o, NULL);
3096 0 : if (r <= 0)
3097 0 : return r;
3098 :
3099 0 : *to = le64toh(o->entry.monotonic);
3100 : }
3101 :
3102 0 : return 1;
3103 : }
3104 :
3105 6 : bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3106 6 : assert(f);
3107 :
3108 : /* If we gained new header fields we gained new features,
3109 : * hence suggest a rotation */
3110 6 : if (le64toh(f->header->header_size) < sizeof(Header)) {
3111 0 : log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3112 0 : return true;
3113 : }
3114 :
3115 : /* Let's check if the hash tables grew over a certain fill
3116 : * level (75%, borrowing this value from Java's hash table
3117 : * implementation), and if so suggest a rotation. To calculate
3118 : * the fill level we need the n_data field, which only exists
3119 : * in newer versions. */
3120 :
3121 6 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3122 6 : if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3123 0 : log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3124 : f->path,
3125 : 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3126 : le64toh(f->header->n_data),
3127 : le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3128 : (unsigned long long) f->last_stat.st_size,
3129 : f->last_stat.st_size / le64toh(f->header->n_data));
3130 0 : return true;
3131 : }
3132 :
3133 6 : if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3134 6 : if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3135 0 : log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3136 : f->path,
3137 : 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3138 : le64toh(f->header->n_fields),
3139 : le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3140 0 : return true;
3141 : }
3142 :
3143 : /* Are the data objects properly indexed by field objects? */
3144 12 : if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3145 12 : JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3146 8 : le64toh(f->header->n_data) > 0 &&
3147 2 : le64toh(f->header->n_fields) == 0)
3148 0 : return true;
3149 :
3150 6 : if (max_file_usec > 0) {
3151 : usec_t t, h;
3152 :
3153 0 : h = le64toh(f->header->head_entry_realtime);
3154 0 : t = now(CLOCK_REALTIME);
3155 :
3156 0 : if (h > 0 && t > h + max_file_usec)
3157 0 : return true;
3158 : }
3159 :
3160 6 : return false;
3161 : }
|