// SPDX-License-Identifier: GPL-2.0-only
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 */
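/*
 * Illustrative usage from userspace ("mygroup" and the numbers are made up;
 * assumes cgroup2 mounted at /sys/fs/cgroup):
 *
 *	mkdir /sys/fs/cgroup/mygroup
 *	echo 100 > /sys/fs/cgroup/mygroup/pids.max
 *	echo $$ > /sys/fs/cgroup/mygroup/cgroup.procs
 *	cat /sys/fs/cgroup/mygroup/pids.current
 *
 * Once pids.current reaches 100, further fork()/clone() calls by tasks in
 * the cgroup fail with -EAGAIN until tasks exit or the limit is raised.
 */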
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/sched/task.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

enum pidcg_event {
	/* Fork failed in subtree because this pids_cgroup limit was hit. */
	PIDCG_MAX,
	/* Fork failed in this pids_cgroup because ancestor limit was hit. */
	PIDCG_FORKFAIL,
	NR_PIDCG_EVENTS,
};
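/*
 * Sketch of the arithmetic (assuming a typical 64-bit build, where
 * PID_MAX_LIMIT is 4 * 1024 * 1024 = 4194304): PIDS_MAX is then 4194305,
 * one more than any value the pid counter can legitimately reach, which
 * is what lets it double as the "no limit" sentinel below.
 */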
struct pids_cgroup {
	struct cgroup_subsys_state	css;

	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	atomic64_t			counter;
	atomic64_t			limit;
	int64_t				watermark;

	/* Handles for pids.events[.local] */
	struct cgroup_file		events_file;
	struct cgroup_file		events_local_file;

	atomic64_t			events[NR_PIDCG_EVENTS];
	atomic64_t			events_local[NR_PIDCG_EVENTS];
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
	return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
	return css_pids(pids->css.parent);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
	struct pids_cgroup *pids;

	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
	if (!pids)
		return ERR_PTR(-ENOMEM);

	atomic64_set(&pids->limit, PIDS_MAX);
	return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_pids(css));
}

static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
{
	/*
	 * This is racy, but we don't need perfectly accurate tallying of
	 * the watermark, and this lets us avoid extra atomic overhead.
	 */
	if (nr_pids > READ_ONCE(p->watermark))
		WRITE_ONCE(p->watermark, nr_pids);
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
	/*
	 * A negative count (or overflow for that matter) is invalid,
	 * and indicates a bug in the `pids` controller proper.
	 */
	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p))
		pids_cancel(p, num);
}
/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the
 * new pid count may exceed the limit. This is used on the attach paths
 * (charging the destination cgroup and reverting failed attaches), where
 * there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *p;

	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);

		pids_update_watermark(p, new);
	}
}
/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 * @fail: storage of pid cgroup causing the fail
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
	struct pids_cgroup *p, *q;

	for (p = pids; parent_pids(p); p = parent_pids(p)) {
		int64_t new = atomic64_add_return(num, &p->counter);
		int64_t limit = atomic64_read(&p->limit);

		/*
		 * Since new is capped to the maximum number of pid_t, if
		 * p->limit is %PIDS_MAX then we know that this test will never
		 * fail.
		 */
		if (new > limit) {
			*fail = p;
			goto revert;
		}
		/*
		 * Not technically accurate if we go over limit somewhere up
		 * the hierarchy, but that's tolerable for the watermark.
		 */
		pids_update_watermark(p, new);
	}

	return 0;

revert:
	for (q = pids; q != p; q = parent_pids(q))
		pids_cancel(q, num);
	pids_cancel(p, num);

	return -EAGAIN;
}
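/*
 * Worked example (illustrative; the hierarchy and numbers are made up):
 * take a hierarchy r/a/b where a's limit is 2 and b already holds two
 * tasks. pids_try_charge(b, 1) bumps b's counter to 3, then a's to 3,
 * sees 3 > a's limit and sets *fail = a. The revert loop then walks back
 * up from b, uncharging every cgroup below the failing one, and finally
 * uncharges a itself, so all counters return to 2 and -EAGAIN is returned.
 */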
static int pids_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		/*
		 * No need to pin @old_css between here and cancel_attach()
		 * because cgroup core protects it from being freed before
		 * the migration completes or fails.
		 */
		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(pids, 1);
		pids_uncharge(old_pids, 1);
	}

	return 0;
}
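/*
 * Note that the attach path above charges with pids_charge(), which
 * ignores the configured limits, rather than pids_try_charge().
 * Organisational operations such as migrating a task between cgroups are
 * never blocked by cgroup policies, so a migration may legitimately push
 * pids.current above pids.max; only fork() is refused.
 */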
static void pids_cancel_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *dst_css;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct pids_cgroup *pids = css_pids(dst_css);
		struct cgroup_subsys_state *old_css;
		struct pids_cgroup *old_pids;

		old_css = task_css(task, pids_cgrp_id);
		old_pids = css_pids(old_css);

		pids_charge(old_pids, 1);
		pids_uncharge(pids, 1);
	}
}
static void pids_event(struct pids_cgroup *pids_forking,
		       struct pids_cgroup *pids_over_limit)
{
	struct pids_cgroup *p = pids_forking;

	/* Only log the first time limit is hit. */
	if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
		pr_info("cgroup: fork rejected by pids controller in ");
		pr_cont_cgroup_path(p->css.cgroup);
		pr_cont("\n");
	}
	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
		cgroup_file_notify(&p->events_local_file);
		return;
	}

	atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
	cgroup_file_notify(&pids_over_limit->events_local_file);

	for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
		atomic64_inc(&p->events[PIDCG_MAX]);
		cgroup_file_notify(&p->events_file);
	}
}
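/*
 * Illustrative walk-through (the hierarchy a/b is made up): suppose a
 * fork() in a/b fails because a's limit was hit. On the default (cgroup2)
 * hierarchy without CGRP_ROOT_PIDS_LOCAL_EVENTS, a's pids.events.local
 * and the pids.events of a and all its ancestors each count one more
 * "max" event. With cgroup1 semantics, or with that flag set, the event
 * is instead accounted to b, the cgroup in which the fork actually failed.
 */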
/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by copy_process().
 */
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
	struct pids_cgroup *pids, *pids_over_limit;
	int err;

	pids = css_pids(cset->subsys[pids_cgrp_id]);
	err = pids_try_charge(pids, 1, &pids_over_limit);
	if (err)
		pids_event(pids, pids_over_limit);

	return err;
}

static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
	struct pids_cgroup *pids;

	pids = css_pids(cset->subsys[pids_cgrp_id]);
	pids_uncharge(pids, 1);
}

static void pids_release(struct task_struct *task)
{
	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

	pids_uncharge(pids, 1);
}
static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit;
	int err;

	buf = strstrip(buf);
	if (!strcmp(buf, PIDS_MAX_STR)) {
		limit = PIDS_MAX;
		goto set_limit;
	}

	err = kstrtoll(buf, 0, &limit);
	if (err)
		return err;

	if (limit < 0 || limit >= PIDS_MAX)
		return -EINVAL;

set_limit:
	/*
	 * Limit updates don't need to be mutex'd, since it isn't
	 * critical that any racing fork()s follow the new limit.
	 */
	atomic64_set(&pids->limit, limit);
	return nbytes;
}
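/*
 * Illustrative writes (the values are made-up examples):
 *
 *	echo max > pids.max	sets limit = PIDS_MAX, i.e. no limit
 *	echo 1024 > pids.max	sets limit = 1024
 *	echo 0x40 > pids.max	also works: kstrtoll(.., 0, ..) accepts
 *				base prefixes
 *	echo -1 > pids.max	rejected with -EINVAL
 */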
static int pids_max_show(struct seq_file *sf, void *v)
{
	struct cgroup_subsys_state *css = seq_css(sf);
	struct pids_cgroup *pids = css_pids(css);
	int64_t limit = atomic64_read(&pids->limit);

	if (limit >= PIDS_MAX)
		seq_printf(sf, "%s\n", PIDS_MAX_STR);
	else
		seq_printf(sf, "%lld\n", limit);

	return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return atomic64_read(&pids->counter);
}

static s64 pids_peak_read(struct cgroup_subsys_state *css,
			  struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return READ_ONCE(pids->watermark);
}
static int __pids_events_show(struct seq_file *sf, bool local)
{
	struct pids_cgroup *pids = css_pids(seq_css(sf));
	enum pidcg_event pe = PIDCG_MAX;
	atomic64_t *events;

	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
		pe = PIDCG_FORKFAIL;
		local = true;
	}
	events = local ? pids->events_local : pids->events;

	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
	return 0;
}

static int pids_events_show(struct seq_file *sf, void *v)
{
	__pids_events_show(sf, false);
	return 0;
}

static int pids_events_local_show(struct seq_file *sf, void *v)
{
	__pids_events_show(sf, true);
	return 0;
}
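/*
 * Example output (illustrative; the count is made up): both pids.events
 * and pids.events.local contain a single "max <count>" line, e.g.
 *
 *	$ cat pids.events
 *	max 7
 *
 * Which counter backs the number depends on the hierarchy mode selected
 * above.
 */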
static struct cftype pids_files[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = pids_peak_read,
	},
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events.local",
		.seq_show = pids_events_local_show,
		.file_offset = offsetof(struct pids_cgroup, events_local_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
static struct cftype pids_files_legacy[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = pids_peak_read,
	},
	{
		.name = "events",
		.seq_show = pids_events_show,
		.file_offset = offsetof(struct pids_cgroup, events_file),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
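/*
 * The legacy (cgroup1) table differs from pids_files only in omitting
 * events.local; pids.events.local is exposed on the unified hierarchy
 * only.
 */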
struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc	= pids_css_alloc,
	.css_free	= pids_css_free,
	.can_attach	= pids_can_attach,
	.cancel_attach	= pids_cancel_attach,
	.can_fork	= pids_can_fork,
	.cancel_fork	= pids_cancel_fork,
	.release	= pids_release,
	.legacy_cftypes	= pids_files_legacy,
	.dfl_cftypes	= pids_files,
	.threaded	= true,
};