Line data Source code
1 : /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2 :
3 : /***
4 : This file is part of systemd.
5 :
6 : Copyright 2013 Lennart Poettering
7 :
8 : systemd is free software; you can redistribute it and/or modify it
9 : under the terms of the GNU Lesser General Public License as published by
10 : the Free Software Foundation; either version 2.1 of the License, or
11 : (at your option) any later version.
12 :
13 : systemd is distributed in the hope that it will be useful, but
14 : WITHOUT ANY WARRANTY; without even the implied warranty of
15 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 : Lesser General Public License for more details.
17 :
18 : You should have received a copy of the GNU Lesser General Public License
19 : along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 : ***/
21 :
22 : #include <fcntl.h>
23 : #include <fnmatch.h>
24 :
25 : #include "process-util.h"
26 : #include "path-util.h"
27 : #include "special.h"
28 : #include "cgroup-util.h"
29 : #include "cgroup.h"
30 :
/* Length of the CFS period the CPU quota is enforced over (100ms). */
#define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
32 :
33 127 : void cgroup_context_init(CGroupContext *c) {
34 127 : assert(c);
35 :
36 : /* Initialize everything to the kernel defaults, assuming the
37 : * structure is preinitialized to 0 */
38 :
39 127 : c->cpu_shares = (unsigned long) -1;
40 127 : c->startup_cpu_shares = (unsigned long) -1;
41 127 : c->memory_limit = (uint64_t) -1;
42 127 : c->blockio_weight = (unsigned long) -1;
43 127 : c->startup_blockio_weight = (unsigned long) -1;
44 :
45 127 : c->cpu_quota_per_sec_usec = USEC_INFINITY;
46 127 : }
47 :
/* Unlinks a DeviceAllow entry from the context's list and frees it
 * (including its owned path string). */
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
        assert(c);
        assert(a);

        LIST_REMOVE(device_allow, c->device_allow, a);
        free(a->path);
        free(a);
}
56 :
/* Unlinks a BlockIODeviceWeight entry from the context's list and
 * frees it (including its owned path string). */
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        assert(c);
        assert(w);

        LIST_REMOVE(device_weights, c->blockio_device_weights, w);
        free(w->path);
        free(w);
}
65 :
/* Unlinks a BlockIODeviceBandwidth entry from the context's list and
 * frees it (including its owned path string). */
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        assert(c);
        assert(b);

        LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
        free(b->path);
        free(b);
}
74 :
75 127 : void cgroup_context_done(CGroupContext *c) {
76 127 : assert(c);
77 :
78 254 : while (c->blockio_device_weights)
79 0 : cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
80 :
81 254 : while (c->blockio_device_bandwidths)
82 0 : cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
83 :
84 254 : while (c->device_allow)
85 0 : cgroup_context_free_device_allow(c, c->device_allow);
86 127 : }
87 :
88 4 : void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
89 : CGroupBlockIODeviceBandwidth *b;
90 : CGroupBlockIODeviceWeight *w;
91 : CGroupDeviceAllow *a;
92 : char u[FORMAT_TIMESPAN_MAX];
93 :
94 4 : assert(c);
95 4 : assert(f);
96 :
97 4 : prefix = strempty(prefix);
98 :
99 16 : fprintf(f,
100 : "%sCPUAccounting=%s\n"
101 : "%sBlockIOAccounting=%s\n"
102 : "%sMemoryAccounting=%s\n"
103 : "%sCPUShares=%lu\n"
104 : "%sStartupCPUShares=%lu\n"
105 : "%sCPUQuotaPerSecSec=%s\n"
106 : "%sBlockIOWeight=%lu\n"
107 : "%sStartupBlockIOWeight=%lu\n"
108 : "%sMemoryLimit=%" PRIu64 "\n"
109 : "%sDevicePolicy=%s\n"
110 : "%sDelegate=%s\n",
111 4 : prefix, yes_no(c->cpu_accounting),
112 4 : prefix, yes_no(c->blockio_accounting),
113 4 : prefix, yes_no(c->memory_accounting),
114 : prefix, c->cpu_shares,
115 : prefix, c->startup_cpu_shares,
116 : prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
117 : prefix, c->blockio_weight,
118 : prefix, c->startup_blockio_weight,
119 : prefix, c->memory_limit,
120 : prefix, cgroup_device_policy_to_string(c->device_policy),
121 4 : prefix, yes_no(c->delegate));
122 :
123 4 : LIST_FOREACH(device_allow, a, c->device_allow)
124 0 : fprintf(f,
125 : "%sDeviceAllow=%s %s%s%s\n",
126 : prefix,
127 : a->path,
128 0 : a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
129 :
130 4 : LIST_FOREACH(device_weights, w, c->blockio_device_weights)
131 0 : fprintf(f,
132 : "%sBlockIODeviceWeight=%s %lu",
133 : prefix,
134 : w->path,
135 : w->weight);
136 :
137 4 : LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
138 : char buf[FORMAT_BYTES_MAX];
139 :
140 0 : fprintf(f,
141 : "%s%s=%s %s\n",
142 : prefix,
143 0 : b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
144 : b->path,
145 0 : format_bytes(buf, sizeof(buf), b->bandwidth));
146 : }
147 4 : }
148 :
/* Resolves path p to the block device (dev_t) that blkio settings for
 * it should be applied to. p may be a block device node, or a regular
 * path, in which case the block device backing the containing file
 * system is used (promoted from partition to whole disk if possible).
 * Returns 0 on success, negative errno on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;
        int r;

        assert(p);
        assert(dev);

        r = stat(p, &st);
        if (r < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode))
                *dev = st.st_rdev;
        else if (major(st.st_dev) != 0) {
                /* If this is not a device node then find the block
                 * device this file is stored on */
                *dev = st.st_dev;

                /* If this is a partition, try to get the originating
                 * block device */
                block_get_whole_disk(*dev, dev);
        } else {
                /* major 0 in st_dev means a virtual/non-local file
                 * system; there is no meaningful device to throttle. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        return 0;
}
177 :
178 0 : static int whitelist_device(const char *path, const char *node, const char *acc) {
179 : char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
180 : struct stat st;
181 : int r;
182 :
183 0 : assert(path);
184 0 : assert(acc);
185 :
186 0 : if (stat(node, &st) < 0) {
187 0 : log_warning("Couldn't stat device %s", node);
188 0 : return -errno;
189 : }
190 :
191 0 : if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
192 0 : log_warning("%s is not a device.", node);
193 0 : return -ENODEV;
194 : }
195 :
196 0 : sprintf(buf,
197 : "%c %u:%u %s",
198 0 : S_ISCHR(st.st_mode) ? 'c' : 'b',
199 0 : major(st.st_rdev), minor(st.st_rdev),
200 : acc);
201 :
202 0 : r = cg_set_attribute("devices", path, "devices.allow", buf);
203 0 : if (r < 0)
204 0 : log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
205 : "Failed to set devices.allow on %s: %m", path);
206 :
207 0 : return r;
208 : }
209 :
/* Whitelists all devices whose driver name matches the fnmatch()
 * pattern 'name' of the given type ('c' or 'b'), by scanning
 * /proc/devices for matching majors and allowing "major:*" for each.
 * Returns 0 on success (individual devices.allow failures are only
 * logged), negative errno if /proc/devices cannot be read. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices has two sections; 'good' tracks whether
                 * we are currently inside the section matching 'type'. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* An empty line terminates the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                /* Each entry is "<major> <driver-name>". */
                p = strstrip(line);

                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                if (fnmatch(name, w, 0) != 0)
                        continue;

                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set devices.allow on %s: %m", path);
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
285 :
/* Writes the settings from c into the cgroup attributes of the cgroup
 * at 'path', for every controller present in 'mask'. During manager
 * startup the Startup* variants of weights/shares win. Errors are
 * logged but deliberately not propagated: read-only cgroup trees
 * (containers) and vanished cgroups are expected conditions. */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore */
        is_root = isempty(path) || path_equal(path, "/");
        if (is_root)
                /* Make sure we don't try to display messages with an empty path. */
                path = "/";

        /* We generally ignore errors caused by read-only mounted
         * cgroup trees (assuming we are running in a container then),
         * and missing cgroups, i.e. EROFS and ENOENT. */

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                /* (unsigned long) -1 marks "unset"; fall back to the
                 * kernel default of 1024 shares. */
                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.shares on %s: %m", path);

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_period_us on %s: %m", path);

                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        /* Scale the per-second quota to the period set above. */
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set cpu.cfs_quota_us on %s: %m", path);
        }

        if (mask & CGROUP_BLKIO) {
                /* Buffer sized for the largest of the three value
                 * formats written below. */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        /* 1000 is the kernel default blkio weight. */
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set blkio.weight on %s: %m", path);

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                                       "Failed to set blkio.weight_device on %s: %m", path);
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                               "Failed to set %s on %s: %m", a, path);
                }
        }

        if ((mask & CGROUP_MEMORY) && !is_root) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        /* "-1" removes any previously set limit. */
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to set memory.limit_in_bytes on %s: %m", path);
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Changing the devices list of a populated cgroup
                 * might result in EINVAL, hence ignore EINVAL
                 * here. */

                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to reset devices.list on %s: %m", path);

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices every service should
                         * be able to use, as nulstr of node/access pairs. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        /* Build the access string from the r/w/m flags;
                         * skip entries with no access bits at all. */
                        char acc[4];
                        unsigned k = 0;

                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
461 :
462 905 : CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
463 905 : CGroupControllerMask mask = 0;
464 :
465 : /* Figure out which controllers we need */
466 :
467 1810 : if (c->cpu_accounting ||
468 1805 : c->cpu_shares != (unsigned long) -1 ||
469 1800 : c->startup_cpu_shares != (unsigned long) -1 ||
470 900 : c->cpu_quota_per_sec_usec != USEC_INFINITY)
471 5 : mask |= CGROUP_CPUACCT | CGROUP_CPU;
472 :
473 1810 : if (c->blockio_accounting ||
474 1803 : c->blockio_weight != (unsigned long) -1 ||
475 1796 : c->startup_blockio_weight != (unsigned long) -1 ||
476 1796 : c->blockio_device_weights ||
477 898 : c->blockio_device_bandwidths)
478 7 : mask |= CGROUP_BLKIO;
479 :
480 1810 : if (c->memory_accounting ||
481 905 : c->memory_limit != (uint64_t) -1)
482 3 : mask |= CGROUP_MEMORY;
483 :
484 1810 : if (c->device_allow ||
485 905 : c->device_policy != CGROUP_AUTO)
486 0 : mask |= CGROUP_DEVICE;
487 :
488 905 : return mask;
489 : }
490 :
/* Returns the controller mask the unit itself requires, honouring
 * delegation: delegated units get all controllers, unless their
 * payload is known to drop privileges anyway. */
CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
        CGroupContext *c;

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* If delegation is turned on, then turn on all cgroups,
         * unless the process we fork into it is known to drop
         * privileges anyway, and shouldn't get access to the
         * controllers anyway. */

        if (c->delegate) {
                ExecContext *e;

                e = unit_get_exec_context(u);
                if (!e || exec_context_maintains_privileges(e))
                        return _CGROUP_CONTROLLER_MASK_ALL;
        }

        return cgroup_context_get_mask(c);
}
513 :
/* Returns the union of the controller masks needed by every unit
 * contained in this unit (only slices contain members). The result is
 * cached in u->cgroup_members_mask and recomputed only after
 * invalidation. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Units contained in a slice are ordered after it via
                 * UNIT_BEFORE; filter to those whose slice reference
                 * actually points back at us. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
543 :
544 78 : CGroupControllerMask unit_get_siblings_mask(Unit *u) {
545 78 : assert(u);
546 :
547 78 : if (UNIT_ISSET(u->slice))
548 64 : return unit_get_members_mask(UNIT_DEREF(u->slice));
549 :
550 14 : return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
551 : }
552 :
553 72 : CGroupControllerMask unit_get_target_mask(Unit *u) {
554 : CGroupControllerMask mask;
555 :
556 72 : mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
557 72 : mask &= u->manager->cgroup_supported;
558 :
559 72 : return mask;
560 : }
561 :
562 : /* Recurse from a unit up through its containing slices, propagating
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true iff bits were only added, never removed,
         * relative to the previously known subtree mask. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
610 :
611 0 : static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
612 0 : Unit *u = userdata;
613 :
614 0 : assert(mask != 0);
615 0 : assert(u);
616 :
617 0 : while (u) {
618 0 : if (u->cgroup_path &&
619 0 : u->cgroup_realized &&
620 0 : (u->cgroup_realized_mask & mask) == mask)
621 0 : return u->cgroup_path;
622 :
623 0 : u = UNIT_DEREF(u->slice);
624 : }
625 :
626 0 : return NULL;
627 : }
628 :
/* Creates the unit's cgroup in every supported hierarchy listed in
 * mask, registering the path in the manager's cgroup_unit map and
 * marking the unit realized. Returns 0 on success, negative errno on
 * failure. */
static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
        CGroupContext *c;
        int r;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        if (!u->cgroup_path) {
                _cleanup_free_ char *path = NULL;

                path = unit_default_cgroup_path(u);
                if (!path)
                        return log_oom();

                r = hashmap_put(u->manager->cgroup_unit, path, u);
                if (r < 0) {
                        log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
                        return r;
                }
                if (r > 0) {
                        /* The hashmap took the key, so ownership of the
                         * path string moves to u->cgroup_path; clear the
                         * local so _cleanup_free_ won't free it. */
                        u->cgroup_path = path;
                        path = NULL;
                }
        }

        /* First, create our own group */
        r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);

        /* Keep track that this is now realized */
        u->cgroup_realized = true;
        u->cgroup_realized_mask = mask;

        if (u->type != UNIT_SLICE && !c->delegate) {

                /* Then, possibly move things over, but not if
                 * subgroups may contain processes, which is the case
                 * for slice and delegation units. */
                r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
                if (r < 0)
                        log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
        }

        return 0;
}
678 :
679 0 : int unit_attach_pids_to_cgroup(Unit *u) {
680 : int r;
681 0 : assert(u);
682 :
683 0 : r = unit_realize_cgroup(u);
684 0 : if (r < 0)
685 0 : return r;
686 :
687 0 : r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
688 0 : if (r < 0)
689 0 : return r;
690 :
691 0 : return 0;
692 : }
693 :
694 66 : static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
695 66 : assert(u);
696 :
697 66 : return u->cgroup_realized && u->cgroup_realized_mask == mask;
698 : }
699 :
700 : /* Check if necessary controllers and attributes for a unit are in place.
701 : *
702 : * If so, do nothing.
703 : * If not, create paths, move processes over, and set attributes.
704 : *
705 : * Returns 0 on success and < 0 on failure. */
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* Drop the unit from the realization queue, if queued — we are
         * about to realize it right now. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                /* Recursion terminates at the root slice, which has no
                 * parent slice of its own. */
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
739 :
740 48 : static void unit_add_to_cgroup_queue(Unit *u) {
741 :
742 48 : if (u->in_cgroup_queue)
743 0 : return;
744 :
745 48 : LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
746 48 : u->in_cgroup_queue = true;
747 : }
748 :
749 0 : unsigned manager_dispatch_cgroup_queue(Manager *m) {
750 : ManagerState state;
751 0 : unsigned n = 0;
752 : Unit *i;
753 : int r;
754 :
755 0 : state = manager_state(m);
756 :
757 0 : while ((i = m->cgroup_queue)) {
758 0 : assert(i->in_cgroup_queue);
759 :
760 0 : r = unit_realize_cgroup_now(i, state);
761 0 : if (r < 0)
762 0 : log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
763 :
764 0 : n++;
765 : }
766 :
767 0 : return n;
768 : }
769 :
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Members of a slice are ordered after it via
                 * UNIT_BEFORE dependencies. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Move one level up and queue that slice's siblings too. */
                u = slice;
        }
}
807 :
/* Ensures the unit's cgroup exists with the right controllers and
 * attributes, synchronously realizing parents and deferring siblings
 * to the cgroup queue. Returns 0 on success, negative errno on
 * failure. */
int unit_realize_cgroup(Unit *u) {
        CGroupContext *c;

        assert(u);

        c = unit_get_cgroup_context(u);
        if (!c)
                return 0;

        /* So, here's the deal: when realizing the cgroups for this
         * unit, we need to first create all parents, but there's more
         * actually: for the weight-based controllers we also need to
         * make sure that all our siblings (i.e. units that are in the
         * same slice as we are) have cgroups, too. Otherwise, things
         * would become very uneven as each of their processes would
         * get as much resources as all our group together. This call
         * will synchronously create the parent cgroups, but will
         * defer work on the siblings to the next event loop
         * iteration. */

        /* Add all sibling slices to the cgroup queue. */
        unit_queue_siblings(u);

        /* And realize this one now (and apply the values) */
        return unit_realize_cgroup_now(u, manager_state(u->manager));
}
834 :
/* Trims and, if empty, removes the unit's cgroup, unregistering it
 * from the manager and resetting the unit's realization state. A
 * failure to trim (e.g. still-populated group) is logged at debug
 * level only and leaves the state untouched. */
void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* Never remove the root slice's own group, only its children. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
856 :
/* Scans the unit's cgroup for a plausible main process: a child of
 * ours that is alone in the group. Returns 0 if the unit has no
 * cgroup, no candidate is found, or more than one candidate exists. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0)  {
                pid_t ppid;

                /* Skip duplicates of the current candidate. */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
893 :
/* Initializes the manager's view of the cgroup hierarchy: determines
 * the root cgroup we run in, optionally installs the release agent,
 * attaches us to the root group, pins the cgroupfs mount, and probes
 * which controllers the kernel supports. Returns 0 on success,
 * negative errno on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == MANAGER_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == MANAGER_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}
967 :
/* Releases the manager's cgroup state at shutdown: optionally trims
 * empty children of our root group, drops the cgroupfs pin fd, and
 * frees the stored root path. */
void manager_shutdown_cgroup(Manager *m, bool delete) {
        assert(m);

        /* We can't really delete the group, since we are in it. But
         * let's trim it. */
        if (delete && m->cgroup_root)
                cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);

        m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);

        free(m->cgroup_root);
        m->cgroup_root = NULL;
}
981 :
/* Maps a cgroup path to the unit owning it. If no unit is registered
 * for the exact path, walks up the hierarchy and returns the innermost
 * ancestor unit, or NULL if none matches. */
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
        char *p;
        Unit *u;

        assert(m);
        assert(cgroup);

        u = hashmap_get(m->cgroup_unit, cgroup);
        if (u)
                return u;

        /* Work on a stack copy so we can truncate it at each '/'. */
        p = strdupa(cgroup);
        for (;;) {
                char *e;

                e = strrchr(p, '/');
                /* Stop at the root component (or a path without '/'). */
                if (e == p || !e)
                        return NULL;

                *e = 0;

                u = hashmap_get(m->cgroup_unit, p);
                if (u)
                        return u;
        }
}
1008 :
1009 0 : Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
1010 0 : _cleanup_free_ char *cgroup = NULL;
1011 : int r;
1012 :
1013 0 : assert(m);
1014 :
1015 0 : if (pid <= 1)
1016 0 : return NULL;
1017 :
1018 0 : r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1019 0 : if (r < 0)
1020 0 : return NULL;
1021 :
1022 0 : return manager_get_unit_by_cgroup(m, cgroup);
1023 : }
1024 :
/* Handles an "empty cgroup" notification (e.g. from the release
 * agent): re-verifies emptiness, informs the owning unit's vtable
 * handler, and queues the unit for GC. */
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
        Unit *u;
        int r;

        assert(m);
        assert(cgroup);

        u = manager_get_unit_by_cgroup(m, cgroup);
        if (!u)
                return 0;

        /* The notification may be stale; re-check recursively. */
        r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
        if (r <= 0)
                return r;

        if (UNIT_VTABLE(u)->notify_cgroup_empty)
                UNIT_VTABLE(u)->notify_cgroup_empty(u);

        unit_add_to_gc_queue(u);
        return 0;
}
1046 :
1047 0 : int unit_get_memory_current(Unit *u, uint64_t *ret) {
1048 0 : _cleanup_free_ char *v = NULL;
1049 : int r;
1050 :
1051 0 : assert(u);
1052 0 : assert(ret);
1053 :
1054 0 : if (!u->cgroup_path)
1055 0 : return -ENODATA;
1056 :
1057 0 : if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
1058 0 : return -ENODATA;
1059 :
1060 0 : r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
1061 0 : if (r == -ENOENT)
1062 0 : return -ENODATA;
1063 0 : if (r < 0)
1064 0 : return r;
1065 :
1066 0 : return safe_atou64(v, ret);
1067 : }
1068 :
/* Reads the raw cpuacct usage counter (nanoseconds) for the unit's
 * cgroup. Returns -ENODATA if the unit has no cgroup or the cpuacct
 * controller isn't realized for it. */
static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
        _cleanup_free_ char *v = NULL;
        uint64_t ns;
        int r;

        assert(u);
        assert(ret);

        if (!u->cgroup_path)
                return -ENODATA;

        if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
                return -ENODATA;

        r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);
        if (r == -ENOENT)
                return -ENODATA;
        if (r < 0)
                return r;

        r = safe_atou64(v, &ns);
        if (r < 0)
                return r;

        *ret = ns;
        return 0;
}
1096 :
1097 0 : int unit_get_cpu_usage(Unit *u, nsec_t *ret) {
1098 : nsec_t ns;
1099 : int r;
1100 :
1101 0 : r = unit_get_cpu_usage_raw(u, &ns);
1102 0 : if (r < 0)
1103 0 : return r;
1104 :
1105 0 : if (ns > u->cpuacct_usage_base)
1106 0 : ns -= u->cpuacct_usage_base;
1107 : else
1108 0 : ns = 0;
1109 :
1110 0 : *ret = ns;
1111 0 : return 0;
1112 : }
1113 :
1114 12 : int unit_reset_cpu_usage(Unit *u) {
1115 : nsec_t ns;
1116 : int r;
1117 :
1118 12 : assert(u);
1119 :
1120 12 : r = unit_get_cpu_usage_raw(u, &ns);
1121 12 : if (r < 0) {
1122 12 : u->cpuacct_usage_base = 0;
1123 12 : return r;
1124 : }
1125 :
1126 0 : u->cpuacct_usage_base = ns;
1127 0 : return 0;
1128 : }
1129 :
/* Maps CGroupDevicePolicy enum values to the string names used in
 * configuration files; the macro below generates the to/from string
 * conversion helpers. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
|