cpuset-v1.c

// SPDX-License-Identifier: GPL-2.0-or-later

#include "cpuset-internal.h"

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;
	struct cpuset *cs;
};

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *    fmeter_init() - initialize a frequency meter.
 *    fmeter_markevent() - called each time the event happens.
 *    fmeter_getrate() - returns the recent rate of such events.
 *    fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)	/* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */
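
/*
 * Illustrative sketch (not part of this file): a tiny user-space model of
 * the decay step used by fmeter_update() below, showing why FM_COEF = 933
 * with one-second ticks gives roughly the ten-second half-life described
 * above.  All names here are local to the example.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int val = 1000;		// steady rate of 1 event/sec, scaled by 1000
 *		int tick;
 *
 *		// same single-pole decay as the while loop in fmeter_update()
 *		for (tick = 0; tick < 10; tick++)
 *			val = (933 * val) / 1000;
 *
 *		printf("%d\n", val);	// prints 495: about half after 10 idle seconds
 *		return 0;
 *	}
 */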

/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */
void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}
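
/*
 * Illustrative sketch (not part of this file): how user space might sample
 * the rate published above.  The path and "cpuset." file prefix are
 * assumptions that depend on how the legacy hierarchy is mounted, and
 * collection must first be enabled via memory_pressure_enabled in the root
 * cpuset.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// assumed v1 mount point and file prefix; adjust to the local setup
 *		FILE *f = fopen("/sys/fs/cgroup/cpuset/cpuset.memory_pressure", "r");
 *		int rate;
 *
 *		if (f && fscanf(f, "%d", &rate) == 1)
 *			// the value is reclaims/sec scaled by 1000 (see fmeter comment)
 *			printf("direct reclaim rate: %d.%03d/sec\n",
 *			       rate / 1000, rate % 1000);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 */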

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
				      struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset1_update_task_spread_flags(cs, task);
	css_task_iter_end(&it);
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
	       nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
	struct cpuset_remove_tasks_struct *s;

	s = container_of(work, struct cpuset_remove_tasks_struct, work);
	remove_tasks_in_empty_cpuset(s->cs);
	css_put(&s->cs->css);
	kfree(s);
}

void cpuset1_hotplug_update_tasks(struct cpuset *cs,
				  struct cpumask *new_cpus, nodemask_t *new_mems,
				  bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset.  Execute it asynchronously using a workqueue.
	 */
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);
			return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                             behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

/*
 * for the common functions, 'private' gives the type of file
 */

struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
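
/*
 * Illustrative sketch (not part of this file): the cftype entries above are
 * exposed as per-cpuset control files on the legacy hierarchy, so toggling a
 * flag such as memory_spread_page is an ordinary file write that ends up in
 * cpuset_write_u64().  The mount point, the "cpuset." prefix and the
 * "mygroup" directory are assumptions for the example.
 *
 *	#include <stdio.h>
 *
 *	// write "1" or "0" to a flag file handled by cpuset_write_u64()
 *	static int set_cpuset_flag(const char *path, int enable)
 *	{
 *		FILE *f = fopen(path, "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%d\n", enable);
 *		return fclose(f);
 *	}
 *
 *	int main(void)
 *	{
 *		return set_cpuset_flag(
 *			"/sys/fs/cgroup/cpuset/mygroup/cpuset.memory_spread_page", 1);
 *	}
 */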