Line data Source code
1 : /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2 :
3 : /***
4 : This file is part of systemd.
5 :
6 : Copyright 2013 Lennart Poettering
7 :
8 : systemd is free software; you can redistribute it and/or modify it
9 : under the terms of the GNU Lesser General Public License as published by
10 : the Free Software Foundation; either version 2.1 of the License, or
11 : (at your option) any later version.
12 :
13 : systemd is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : Lesser General Public License for more details.
17 :
18 : You should have received a copy of the GNU Lesser General Public License
19 : along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 : ***/
21 :
22 : #include <sys/statfs.h>
23 : #include <linux/fs.h>
24 : #include <fcntl.h>
25 :
26 : #include "utf8.h"
27 : #include "btrfs-util.h"
28 : #include "path-util.h"
29 : #include "copy.h"
30 : #include "mkdir.h"
31 : #include "rm-rf.h"
32 : #include "machine-image.h"
33 :
/* NUL-separated list of the directories that are scanned for images, walked
 * in the order listed here with NULSTR_FOREACH(). */
static const char image_search_path[] =
        "/var/lib/machines\0"
        "/var/lib/container\0"
        "/usr/local/lib/machines\0"
        "/usr/lib/machines\0";
39 :
40 0 : Image *image_unref(Image *i) {
41 0 : if (!i)
42 0 : return NULL;
43 :
44 0 : free(i->name);
45 0 : free(i->path);
46 0 : free(i);
47 0 : return NULL;
48 : }
49 :
50 0 : static int image_new(
51 : ImageType t,
52 : const char *pretty,
53 : const char *path,
54 : const char *filename,
55 : bool read_only,
56 : usec_t crtime,
57 : usec_t mtime,
58 : Image **ret) {
59 :
60 0 : _cleanup_(image_unrefp) Image *i = NULL;
61 :
62 0 : assert(t >= 0);
63 0 : assert(t < _IMAGE_TYPE_MAX);
64 0 : assert(pretty);
65 0 : assert(filename);
66 0 : assert(ret);
67 :
68 0 : i = new0(Image, 1);
69 0 : if (!i)
70 0 : return -ENOMEM;
71 :
72 0 : i->type = t;
73 0 : i->read_only = read_only;
74 0 : i->crtime = crtime;
75 0 : i->mtime = mtime;
76 0 : i->usage = i->usage_exclusive = (uint64_t) -1;
77 0 : i->limit = i->limit_exclusive = (uint64_t) -1;
78 :
79 0 : i->name = strdup(pretty);
80 0 : if (!i->name)
81 0 : return -ENOMEM;
82 :
83 0 : if (path)
84 0 : i->path = strjoin(path, "/", filename, NULL);
85 : else
86 0 : i->path = strdup(filename);
87 :
88 0 : if (!i->path)
89 0 : return -ENOMEM;
90 :
91 0 : path_kill_slashes(i->path);
92 :
93 0 : *ret = i;
94 0 : i = NULL;
95 :
96 0 : return 0;
97 : }
98 :
99 0 : static int image_make(
100 : const char *pretty,
101 : int dfd,
102 : const char *path,
103 : const char *filename,
104 : Image **ret) {
105 :
106 : struct stat st;
107 : bool read_only;
108 : int r;
109 :
110 0 : assert(filename);
111 :
112 : /* We explicitly *do* follow symlinks here, since we want to
113 : * allow symlinking trees into /var/lib/machines/, and treat
114 : * them normally. */
115 :
116 0 : if (fstatat(dfd, filename, &st, 0) < 0)
117 0 : return -errno;
118 :
119 0 : read_only =
120 0 : (path && path_startswith(path, "/usr")) ||
121 0 : (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
122 :
123 0 : if (S_ISDIR(st.st_mode)) {
124 0 : _cleanup_close_ int fd = -1;
125 0 : unsigned file_attr = 0;
126 :
127 0 : if (!ret)
128 0 : return 1;
129 :
130 0 : if (!pretty)
131 0 : pretty = filename;
132 :
133 0 : fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
134 0 : if (fd < 0)
135 0 : return -errno;
136 :
137 : /* btrfs subvolumes have inode 256 */
138 0 : if (st.st_ino == 256) {
139 :
140 0 : r = btrfs_is_filesystem(fd);
141 0 : if (r < 0)
142 0 : return r;
143 0 : if (r) {
144 : BtrfsSubvolInfo info;
145 : BtrfsQuotaInfo quota;
146 :
147 : /* It's a btrfs subvolume */
148 :
149 0 : r = btrfs_subvol_get_info_fd(fd, &info);
150 0 : if (r < 0)
151 0 : return r;
152 :
153 0 : r = image_new(IMAGE_SUBVOLUME,
154 : pretty,
155 : path,
156 : filename,
157 0 : info.read_only || read_only,
158 : info.otime,
159 : 0,
160 : ret);
161 0 : if (r < 0)
162 0 : return r;
163 :
164 0 : r = btrfs_subvol_get_quota_fd(fd, "a);
165 0 : if (r >= 0) {
166 0 : (*ret)->usage = quota.referenced;
167 0 : (*ret)->usage_exclusive = quota.exclusive;
168 :
169 0 : (*ret)->limit = quota.referenced_max;
170 0 : (*ret)->limit_exclusive = quota.exclusive_max;
171 : }
172 :
173 0 : return 1;
174 : }
175 : }
176 :
177 : /* If the IMMUTABLE bit is set, we consider the
178 : * directory read-only. Since the ioctl is not
179 : * supported everywhere we ignore failures. */
180 0 : (void) read_attr_fd(fd, &file_attr);
181 :
182 : /* It's just a normal directory. */
183 0 : r = image_new(IMAGE_DIRECTORY,
184 : pretty,
185 : path,
186 : filename,
187 0 : read_only || (file_attr & FS_IMMUTABLE_FL),
188 : 0,
189 : 0,
190 : ret);
191 0 : if (r < 0)
192 0 : return r;
193 :
194 0 : return 1;
195 :
196 0 : } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
197 0 : usec_t crtime = 0;
198 :
199 : /* It's a RAW disk image */
200 :
201 0 : if (!ret)
202 0 : return 1;
203 :
204 0 : fd_getcrtime_at(dfd, filename, &crtime, 0);
205 :
206 0 : if (!pretty)
207 0 : pretty = strndupa(filename, strlen(filename) - 4);
208 :
209 0 : r = image_new(IMAGE_RAW,
210 : pretty,
211 : path,
212 : filename,
213 0 : !(st.st_mode & 0222) || read_only,
214 : crtime,
215 : timespec_load(&st.st_mtim),
216 : ret);
217 0 : if (r < 0)
218 0 : return r;
219 :
220 0 : (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
221 0 : (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
222 :
223 0 : return 1;
224 : }
225 :
226 0 : return 0;
227 : }
228 :
229 0 : int image_find(const char *name, Image **ret) {
230 : const char *path;
231 : int r;
232 :
233 0 : assert(name);
234 :
235 : /* There are no images with invalid names */
236 0 : if (!image_name_is_valid(name))
237 0 : return 0;
238 :
239 0 : NULSTR_FOREACH(path, image_search_path) {
240 0 : _cleanup_closedir_ DIR *d = NULL;
241 :
242 0 : d = opendir(path);
243 0 : if (!d) {
244 0 : if (errno == ENOENT)
245 0 : continue;
246 :
247 0 : return -errno;
248 : }
249 :
250 0 : r = image_make(NULL, dirfd(d), path, name, ret);
251 0 : if (r == 0 || r == -ENOENT) {
252 0 : _cleanup_free_ char *raw = NULL;
253 :
254 0 : raw = strappend(name, ".raw");
255 0 : if (!raw)
256 0 : return -ENOMEM;
257 :
258 0 : r = image_make(NULL, dirfd(d), path, raw, ret);
259 0 : if (r == 0 || r == -ENOENT)
260 0 : continue;
261 : }
262 0 : if (r < 0)
263 0 : return r;
264 :
265 0 : return 1;
266 : }
267 :
268 0 : if (streq(name, ".host"))
269 0 : return image_make(".host", AT_FDCWD, NULL, "/", ret);
270 :
271 0 : return 0;
272 : };
273 :
274 0 : int image_discover(Hashmap *h) {
275 : const char *path;
276 : int r;
277 :
278 0 : assert(h);
279 :
280 0 : NULSTR_FOREACH(path, image_search_path) {
281 0 : _cleanup_closedir_ DIR *d = NULL;
282 : struct dirent *de;
283 :
284 0 : d = opendir(path);
285 0 : if (!d) {
286 0 : if (errno == ENOENT)
287 0 : continue;
288 :
289 0 : return -errno;
290 : }
291 :
292 0 : FOREACH_DIRENT_ALL(de, d, return -errno) {
293 0 : _cleanup_(image_unrefp) Image *image = NULL;
294 :
295 0 : if (!image_name_is_valid(de->d_name))
296 0 : continue;
297 :
298 0 : if (hashmap_contains(h, de->d_name))
299 0 : continue;
300 :
301 0 : r = image_make(NULL, dirfd(d), path, de->d_name, &image);
302 0 : if (r == 0 || r == -ENOENT)
303 0 : continue;
304 0 : if (r < 0)
305 0 : return r;
306 :
307 0 : r = hashmap_put(h, image->name, image);
308 0 : if (r < 0)
309 0 : return r;
310 :
311 0 : image = NULL;
312 0 : }
313 : }
314 :
315 0 : if (!hashmap_contains(h, ".host")) {
316 0 : _cleanup_(image_unrefp) Image *image = NULL;
317 :
318 0 : r = image_make(".host", AT_FDCWD, NULL, "/", &image);
319 0 : if (r < 0)
320 0 : return r;
321 :
322 0 : r = hashmap_put(h, image->name, image);
323 0 : if (r < 0)
324 0 : return r;
325 :
326 0 : image = NULL;
327 :
328 : }
329 :
330 0 : return 0;
331 : }
332 :
333 0 : void image_hashmap_free(Hashmap *map) {
334 : Image *i;
335 :
336 0 : while ((i = hashmap_steal_first(map)))
337 0 : image_unref(i);
338 :
339 0 : hashmap_free(map);
340 0 : }
341 :
342 0 : int image_remove(Image *i) {
343 0 : _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
344 : int r;
345 :
346 0 : assert(i);
347 :
348 0 : if (path_equal(i->path, "/") ||
349 0 : path_startswith(i->path, "/usr"))
350 0 : return -EROFS;
351 :
352 : /* Make sure we don't interfere with a running nspawn */
353 0 : r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
354 0 : if (r < 0)
355 0 : return r;
356 :
357 0 : switch (i->type) {
358 :
359 : case IMAGE_SUBVOLUME:
360 0 : return btrfs_subvol_remove(i->path, true);
361 :
362 : case IMAGE_DIRECTORY:
363 : /* Allow deletion of read-only directories */
364 0 : (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
365 0 : return rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
366 :
367 : case IMAGE_RAW:
368 0 : if (unlink(i->path) < 0)
369 0 : return -errno;
370 :
371 0 : return 0;
372 :
373 : default:
374 0 : return -EOPNOTSUPP;
375 : }
376 : }
377 :
378 0 : int image_rename(Image *i, const char *new_name) {
379 0 : _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
380 0 : _cleanup_free_ char *new_path = NULL, *nn = NULL;
381 0 : unsigned file_attr = 0;
382 : int r;
383 :
384 0 : assert(i);
385 :
386 0 : if (!image_name_is_valid(new_name))
387 0 : return -EINVAL;
388 :
389 0 : if (path_equal(i->path, "/") ||
390 0 : path_startswith(i->path, "/usr"))
391 0 : return -EROFS;
392 :
393 : /* Make sure we don't interfere with a running nspawn */
394 0 : r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
395 0 : if (r < 0)
396 0 : return r;
397 :
398 : /* Make sure nobody takes the new name, between the time we
399 : * checked it is currently unused in all search paths, and the
400 : * time we take possesion of it */
401 0 : r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
402 0 : if (r < 0)
403 0 : return r;
404 :
405 0 : r = image_find(new_name, NULL);
406 0 : if (r < 0)
407 0 : return r;
408 0 : if (r > 0)
409 0 : return -EEXIST;
410 :
411 0 : switch (i->type) {
412 :
413 : case IMAGE_DIRECTORY:
414 : /* Turn of the immutable bit while we rename the image, so that we can rename it */
415 0 : (void) read_attr_path(i->path, &file_attr);
416 :
417 0 : if (file_attr & FS_IMMUTABLE_FL)
418 0 : (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
419 :
420 : /* fall through */
421 :
422 : case IMAGE_SUBVOLUME:
423 0 : new_path = file_in_same_dir(i->path, new_name);
424 0 : break;
425 :
426 : case IMAGE_RAW: {
427 : const char *fn;
428 :
429 0 : fn = strjoina(new_name, ".raw");
430 0 : new_path = file_in_same_dir(i->path, fn);
431 0 : break;
432 : }
433 :
434 : default:
435 0 : return -EOPNOTSUPP;
436 : }
437 :
438 0 : if (!new_path)
439 0 : return -ENOMEM;
440 :
441 0 : nn = strdup(new_name);
442 0 : if (!nn)
443 0 : return -ENOMEM;
444 :
445 0 : r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
446 0 : if (r < 0)
447 0 : return r;
448 :
449 : /* Restore the immutable bit, if it was set before */
450 0 : if (file_attr & FS_IMMUTABLE_FL)
451 0 : (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
452 :
453 0 : free(i->path);
454 0 : i->path = new_path;
455 0 : new_path = NULL;
456 :
457 0 : free(i->name);
458 0 : i->name = nn;
459 0 : nn = NULL;
460 :
461 0 : return 0;
462 : }
463 :
464 0 : int image_clone(Image *i, const char *new_name, bool read_only) {
465 0 : _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
466 : const char *new_path;
467 : int r;
468 :
469 0 : assert(i);
470 :
471 0 : if (!image_name_is_valid(new_name))
472 0 : return -EINVAL;
473 :
474 : /* Make sure nobody takes the new name, between the time we
475 : * checked it is currently unused in all search paths, and the
476 : * time we take possesion of it */
477 0 : r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
478 0 : if (r < 0)
479 0 : return r;
480 :
481 0 : r = image_find(new_name, NULL);
482 0 : if (r < 0)
483 0 : return r;
484 0 : if (r > 0)
485 0 : return -EEXIST;
486 :
487 0 : switch (i->type) {
488 :
489 : case IMAGE_SUBVOLUME:
490 : case IMAGE_DIRECTORY:
491 0 : new_path = strjoina("/var/lib/machines/", new_name);
492 :
493 0 : r = btrfs_subvol_snapshot(i->path, new_path, (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
494 0 : break;
495 :
496 : case IMAGE_RAW:
497 0 : new_path = strjoina("/var/lib/machines/", new_name, ".raw");
498 :
499 0 : r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
500 0 : break;
501 :
502 : default:
503 0 : return -EOPNOTSUPP;
504 : }
505 :
506 0 : if (r < 0)
507 0 : return r;
508 :
509 0 : return 0;
510 : }
511 :
512 0 : int image_read_only(Image *i, bool b) {
513 0 : _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
514 : int r;
515 0 : assert(i);
516 :
517 0 : if (path_equal(i->path, "/") ||
518 0 : path_startswith(i->path, "/usr"))
519 0 : return -EROFS;
520 :
521 : /* Make sure we don't interfere with a running nspawn */
522 0 : r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
523 0 : if (r < 0)
524 0 : return r;
525 :
526 0 : switch (i->type) {
527 :
528 : case IMAGE_SUBVOLUME:
529 0 : r = btrfs_subvol_set_read_only(i->path, b);
530 0 : if (r < 0)
531 0 : return r;
532 :
533 0 : break;
534 :
535 : case IMAGE_DIRECTORY:
536 : /* For simple directory trees we cannot use the access
537 : mode of the top-level directory, since it has an
538 : effect on the container itself. However, we can
539 : use the "immutable" flag, to at least make the
540 : top-level directory read-only. It's not as good as
541 : a read-only subvolume, but at least something, and
542 : we can read the value back.*/
543 :
544 0 : r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
545 0 : if (r < 0)
546 0 : return r;
547 :
548 0 : break;
549 :
550 : case IMAGE_RAW: {
551 : struct stat st;
552 :
553 0 : if (stat(i->path, &st) < 0)
554 0 : return -errno;
555 :
556 0 : if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
557 0 : return -errno;
558 :
559 : /* If the images is now read-only, it's a good time to
560 : * defrag it, given that no write patterns will
561 : * fragment it again. */
562 0 : if (b)
563 0 : (void) btrfs_defrag(i->path);
564 0 : break;
565 : }
566 :
567 : default:
568 0 : return -EOPNOTSUPP;
569 : }
570 :
571 0 : return 0;
572 : }
573 :
574 0 : int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
575 0 : _cleanup_free_ char *p = NULL;
576 0 : LockFile t = LOCK_FILE_INIT;
577 : struct stat st;
578 : int r;
579 :
580 0 : assert(path);
581 0 : assert(global);
582 0 : assert(local);
583 :
584 : /* Locks an image path. This actually creates two locks: one
585 : * "local" one, next to the image path itself, which might be
586 : * shared via NFS. And another "global" one, in /run, that
587 : * uses the device/inode number. This has the benefit that we
588 : * can even lock a tree that is a mount point, correctly. */
589 :
590 0 : if (path_equal(path, "/"))
591 0 : return -EBUSY;
592 :
593 0 : if (!path_is_absolute(path))
594 0 : return -EINVAL;
595 :
596 0 : if (stat(path, &st) >= 0) {
597 0 : if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
598 0 : return -ENOMEM;
599 : }
600 :
601 0 : r = make_lock_file_for(path, operation, &t);
602 0 : if (r < 0)
603 0 : return r;
604 :
605 0 : if (p) {
606 0 : mkdir_p("/run/systemd/nspawn/locks", 0700);
607 :
608 0 : r = make_lock_file(p, operation, global);
609 0 : if (r < 0) {
610 0 : release_lock_file(&t);
611 0 : return r;
612 : }
613 : }
614 :
615 0 : *local = t;
616 0 : return 0;
617 : }
618 :
619 0 : int image_set_limit(Image *i, uint64_t referenced_max) {
620 0 : assert(i);
621 :
622 0 : if (path_equal(i->path, "/") ||
623 0 : path_startswith(i->path, "/usr"))
624 0 : return -EROFS;
625 :
626 0 : if (i->type != IMAGE_SUBVOLUME)
627 0 : return -EOPNOTSUPP;
628 :
629 0 : return btrfs_quota_limit(i->path, referenced_max);
630 : }
631 :
632 0 : int image_name_lock(const char *name, int operation, LockFile *ret) {
633 : const char *p;
634 :
635 0 : assert(name);
636 0 : assert(ret);
637 :
638 : /* Locks an image name, regardless of the precise path used. */
639 :
640 0 : if (!image_name_is_valid(name))
641 0 : return -EINVAL;
642 :
643 0 : if (streq(name, ".host"))
644 0 : return -EBUSY;
645 :
646 0 : mkdir_p("/run/systemd/nspawn/locks", 0700);
647 0 : p = strjoina("/run/systemd/nspawn/locks/name-", name);
648 :
649 0 : return make_lock_file(p, operation, ret);
650 : }
651 :
/* Checks whether "s" may be used as an image name: it has to pass
 * filename_is_valid(), string_has_cc() must not flag it, it has to pass
 * utf8_is_valid(), and it must not start with ".#", the prefix of the
 * temporary files used for atomically creating new files. */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
               !string_has_cc(s, NULL) &&
               utf8_is_valid(s) &&
               !startswith(s, ".#");
}
668 :
669 : static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
670 : [IMAGE_DIRECTORY] = "directory",
671 : [IMAGE_SUBVOLUME] = "subvolume",
672 : [IMAGE_RAW] = "raw",
673 : };
674 :
675 0 : DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);
|